将要抓取的内容
完整代码
如果您不需要讲解,可以直接查看在线 IDE 中的完整代码示例(the full code example in the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Upper bound on the number of reviews to collect (fixed value, for demonstration only).
const reviewsLimit = 100;

// Request settings for the Google Shopping reviews page.
const searchParams = {
  // ID of the product whose reviews are requested
  id: "8757849604759505625",
  // language to use for the Google search
  hl: "en",
  // country to use for the Google search
  gl: "us",
};

// Address of the reviews page, assembled piece by piece from the settings above.
const URL = [
  "https://www.google.com/shopping/product/",
  searchParams.id,
  "/reviews?hl=",
  searchParams.hl,
  "&gl=",
  searchParams.gl,
].join("");
/**
 * Clicks the "next page" button until enough reviews are present on the page
 * (or the button stops appearing), then extracts product and review data.
 *
 * @param {object} page - Puppeteer page already showing the product reviews.
 * @param {number} [limit=reviewsLimit] - Maximum number of reviews to return.
 * @param {number} [pauseMs=3000] - Pause after each click, in milliseconds.
 * @returns {Promise<{productResults: object, reviewsResults: object}>}
 *   Product summary, rating histogram and at most `limit` reviews.
 */
async function getReviews(page, limit = reviewsLimit, pauseMs = 3000) {
  const nextPageSelector = "#sh-fp__pagination-button-wrapper";
  const reviewSelector = "#sh-rol__reviews-cont > div";

  while (true) {
    // Stop as soon as the requested number of reviews is on the page.
    const loadedReviews = await page.$$(reviewSelector);
    if (loadedReviews.length >= limit) break;

    // waitForSelector rejects with a TimeoutError when the button never shows up;
    // that means there is no further page, so it ends the loop instead of crashing.
    let nextPageButton = null;
    try {
      nextPageButton = await page.waitForSelector(nextPageSelector);
    } catch (err) {
      if (err?.name !== "TimeoutError") throw err;
    }
    if (!nextPageButton) break;

    await page.click(nextPageSelector);
    // Plain timer instead of page.waitForTimeout, which newer Puppeteer releases no longer provide.
    await new Promise((resolve) => setTimeout(resolve, pauseMs));
  }

  // The callback runs inside the page, so it must be self-contained:
  // everything it needs is passed in as arguments or defined inside it.
  return await page.evaluate(
    (reviewsSelector, maxReviews) => {
      const textOf = (root, selector) => root.querySelector(selector)?.textContent.trim();
      const labelOf = (root, selector) => root.querySelector(selector)?.getAttribute("aria-label");
      // Removes every thousands separator before parsing; yields NaN when the value is missing.
      const toInt = (value) => parseInt(String(value ?? "").replace(/,/g, ""), 10);

      return {
        productResults: {
          title: textOf(document, ".BvQan"),
          reviews: toInt(labelOf(document, ".lBRvsb .HiT7Id > span")),
          rating: parseFloat(labelOf(document, ".lBRvsb .UzThIf")),
        },
        reviewsResults: {
          rating: Array.from(document.querySelectorAll(".aALHge")).map((el) => ({
            stars: toInt(el.querySelector(".rOdmxf")?.textContent),
            amount: toInt(el.querySelector(".vL3wxf")?.textContent),
          })),
          // Each field tries a primary selector first and a second selector as fallback.
          reviews: Array.from(document.querySelectorAll(reviewsSelector))
            .slice(0, maxReviews)
            .map((el) => ({
              title: textOf(el, ".P3O8Ne") || textOf(el, "._-iO"),
              date: textOf(el, ".OP1Nkd .ff3bE.nMkOOb") || textOf(el, "._-iU"),
              rating: parseInt(labelOf(el, ".UzThIf") || labelOf(el, "._-lq"), 10),
              source: textOf(el, ".sPPcBf") || textOf(el, "._-iP"),
              content: textOf(el, ".g1lvWe > div:last-child") || textOf(el, "._-iN > div:last-child"),
            })),
        },
      };
    },
    reviewSelector,
    limit
  );
}
/**
 * Launches a browser, opens the reviews page at `URL`, collects the data with
 * `getReviews` and returns it together with the product ID.
 *
 * @returns {Promise<object>} `{ productId, ...resultOfGetReviews }`.
 */
async function getProductInfo() {
  const browser = await puppeteer.launch({
    headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  // try/finally so the browser process is closed even when navigation or scraping throws.
  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(60000); // navigation timeout: 60000 ms
    await page.goto(URL);
    await page.waitForSelector(".xt8sXe button");
    return { productId: searchParams.id, ...(await getReviews(page)) };
  } finally {
    await browser.close();
  }
}
// Run the scraper and print the whole result tree; report a failure and set a
// non-zero exit code instead of leaving the rejection unhandled.
getProductInfo()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
准备
首先,我们需要创建一个 Node.js* 项目,并添加 npm 包 puppeteer、puppeteer-extra 和 puppeteer-extra-plugin-stealth,以便通过 DevTools Protocol 在 headless(无头)或非无头模式下控制 Chromium(也可以是 Chrome 或 Firefox,但本文只使用 Chromium)。
为此,在我们项目的目录中,打开命令行并输入:
$ npm init -y
然后:
$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
*如果您尚未安装 Node.js,可以从 nodejs.org 下载(download it from nodejs.org),并按照安装文档(documentation)进行安装。
注意:您也可以不加任何扩展直接使用 puppeteer,但我强烈建议将 puppeteer-extra 与 puppeteer-extra-plugin-stealth 搭配使用,以防止网站检测到您正在使用无头 Chromium 或 web driver。您可以在 Chrome headless tests website 上进行检查。下面的屏幕截图显示了差异。
Process
我们需要从HTML元素中提取数据。通过SelectorGadget Chrome extension,获得合适的CSS选择器的过程非常容易,该过程能够通过单击浏览器中的所需元素来获取CSS选择器。但是,它并不总是完美地工作,尤其是当JavaScript大量使用该网站时。
如果您想了解更多有关它们的信息,我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。
下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。
代码说明
从 puppeteer-extra 库中声明用于控制 Chromium 浏览器的 puppeteer,并从 puppeteer-extra-plugin-stealth 库中声明 StealthPlugin,以防止网站检测到您正在使用 web driver:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
接下来,我们让 puppeteer 使用 StealthPlugin,编写必要的请求参数和搜索 URL,并设置要获取的评论数量(reviewsLimit 常量):
puppeteer.use(StealthPlugin());
// Upper bound on the number of reviews to collect (fixed value, for demonstration only).
const reviewsLimit = 100;

// Request settings for the Google Shopping reviews page.
const searchParams = {
  // ID of the product whose reviews are requested
  id: "8757849604759505625",
  // language to use for the Google search
  hl: "en",
  // country to use for the Google search
  gl: "us",
};

// Address of the reviews page, assembled piece by piece from the settings above.
const URL = [
  "https://www.google.com/shopping/product/",
  searchParams.id,
  "/reviews?hl=",
  searchParams.hl,
  "&gl=",
  searchParams.gl,
].join("");
接下来,我们编写一个函数以从页面获取产品信息:
async function getReviews(page) {
...
}
接下来,我们使用 while 循环(while (true)):检查页面上是否存在“下一页”按钮,以及已加载的评论数量(通过 page.$ 和 page.$$ 方法获取)。只要按钮存在且评论数量不超过 reviewsLimit,就点击(page.click 方法)“下一页”按钮并等待 3 秒(page.waitForTimeout 方法);否则停止循环(使用 break)。
while (true) {
await page.waitForSelector("#sh-fp__pagination-button-wrapper");
const isNextPage = await page.$("#sh-fp__pagination-button-wrapper");
const reviews = await page.$$("#sh-rol__reviews-cont > div");
if (!isNextPage || reviews.length > reviewsLimit) break;
await page.click("#sh-fp__pagination-button-wrapper");
await page.waitForTimeout(3000);
}
然后,我们在页面上下文中(使用 page.evaluate 方法)获取信息,并将其保存在返回的对象中:
return await page.evaluate(() => ({
...
}));
接下来,我们需要使用下一个方法获取页面的不同部分:
productResults: {
title: document.querySelector(".BvQan")?.textContent.trim(),
reviews: parseInt(document.querySelector(".lBRvsb .HiT7Id > span")
?.getAttribute("aria-label").replace(",", "")),
rating: parseFloat(document.querySelector(".lBRvsb .UzThIf")
?.getAttribute("aria-label")),
},
reviewsResults: {
rating: Array.from(document.querySelectorAll(".aALHge")).map((el) => ({
stars: parseInt(el.querySelector(".rOdmxf")?.textContent),
amount: parseInt(el.querySelector(".vL3wxf")?.textContent),
})),
reviews: Array.from(document.querySelectorAll("#sh-rol__reviews-cont > div")).map((el) => ({
title: el.querySelector(".P3O8Ne")?.textContent.trim() || el.querySelector("._-iO")
?.textContent.trim(),
date:
el.querySelector(".OP1Nkd .ff3bE.nMkOOb")?.textContent.trim() ||
el.querySelector("._-iU")?.textContent.trim(),
rating:
parseInt(el.querySelector(".UzThIf")?.getAttribute("aria-label") ||
el.querySelector("._-lq")?.getAttribute("aria-label")),
source:
el.querySelector(".sPPcBf")?.textContent.trim() ||
el.querySelector("._-iP")?.textContent.trim(),
content:
el.querySelector(".g1lvWe > div:last-child")?.textContent.trim() ||
el.querySelector("._-iN > div:last-child")?.textContent.trim(),
})),
},
接下来,编写一个函数来控制浏览器并获取信息:
async function getProductInfo() {
...
}
首先,在这个函数中,我们需要使用 puppeteer.launch({options}) 方法并传入相应的 options(例如 headless: true 和 args: ["--no-sandbox", "--disable-setuid-sandbox"])来定义 browser。
这些选项表示我们使用 headless 模式,并传入一个参数数组,以便浏览器进程能够在在线 IDE 中启动。然后我们打开一个新的 page:
const browser = await puppeteer.launch({
headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
接下来,为了适应较慢的网络连接,我们使用 page.setDefaultNavigationTimeout 方法将默认的导航超时时间(30 秒)改为 60000 毫秒(1 分钟),使用 page.goto 方法访问 URL,并使用 page.waitForSelector 方法等待选择器加载完成:
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector(".xt8sXe button");
最后,我们把从页面获取的产品数据保存到 reviews 常量中(使用展开语法 spread syntax),关闭浏览器,并返回获取到的数据:
const reviews = {
productId: searchParams.id,
...(await getReviews(page))
};
await browser.close();
return reviews;
现在我们可以启动我们的解析器:
$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file
输出
{
"productId":"8757849604759505625",
"productResults":{
"title":"Apple iPhone 14 Pro Max - 128 GB - Space Black - Unlocked",
"reviews":748,
"rating":4.5
},
"reviewsResults":{
"rating":[
{
"stars":5,
"amount":554
},
{
"stars":4,
"amount":58
},
{
"stars":3,
"amount":31
},
{
"stars":2,
"amount":32
},
{
"stars":1,
"amount":73
}
],
"reviews":[
{
"title":"13 Pro Max better in almost every way",
"date":"October 11, 2022",
"rating":2,
"source":"Cody LaRocque · Review provided by Google",
"content":"Great if you’re coming from an 11 or below. I upgraded from my 13 Pro Max as I do every year. Needless to say I am extremely underwhelmed and pretty dissatisfied with this years IPhone. My 13 pro max was better in almost every way, battery life being the major hit to me. No heavy gaming, streaming etc just daily text, call email etc. avg 4 hours per day screen-time. My 13 lasted almost a day and a half at this rate; my 14, I find myself needing a charge before I’m even off from my shift at work.The dynamic island is an over marketed, over hyped piece of useless software and does not function as cool as Apple made it appear. Always on display was fun for about 2 minutes setting it up, then immediately being turned always off because it’s way too bright and sucks power like you wouldn’t believe.All of apples key selling points are all the keys reasons I dislike this phone. Always on is a nightmare, battery life is a joke, dynamic island is useless, crash detection goes off on roller coasters, the cameras have very very little upside differences, the brightness of the screen only lasts for a couple seconds until it auto dims to conserve energy. I also feel like the overall build quality is lacking, I purchased the phone and the Apple leather case; this being the first time I’ve ever even used a case on my iPhone. My 13 lasted a year being dropped multiple times and didn’t even have a scratch. I dropped my 14 face down on a flat floor with the Apple leather case and it chipped the front corner. If you are coming from the 13. Don’t bother upgrading. If your coming from a 12 or below, consider upgrading to the now discounted 13 Less"
},
... and other reviews
]
}
}
使用 SerpApi 的 Google Product API 获取评论
本节是为了显示DIY解决方案与我们的解决方案之间的比较。
最大的区别是您不需要从头开始创建解析器并维护它。
此外,请求有时可能会被 Google 拦截;我们在后端处理了这一问题,因此您无需自己想办法解决,也无需考虑使用哪家验证码服务或代理提供商。
首先,我们需要安装 google-search-results-nodejs:
npm i google-search-results-nodejs
如果您不需要讲解,这里是完整代码示例(full code example):
require("dotenv").config();
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
// Upper bound on the number of reviews to collect (fixed value, for demonstration only).
const reviewsLimit = 100;

// Request parameters sent with every search call.
const params = {
  // ID of the product you want to get the results for
  product_id: "8757849604759505625",
  // search engine
  engine: "google_product",
  // device used to get the results: "desktop" (default), "tablet" or "mobile"
  device: "desktop",
  // language to use for the Google search
  hl: "en",
  // country to use for the Google search
  gl: "us",
  // ask for reviews results
  reviews: true,
};
// Promise adapter around the callback-style search.json call:
// resolves with whatever the library hands to its callback for the current `params`.
const getJson = () =>
  new Promise((done) => {
    search.json(params, (payload) => done(payload));
  });
/**
 * Collects the product info and up to `reviewsLimit` reviews, requesting
 * further pages by advancing `params.start` in steps of 10.
 *
 * Note: `params.start` is left at the last requested offset when this returns.
 *
 * @returns {Promise<{productResults: object, reviewsResult: object[]}>}
 */
const getResults = async () => {
  // The first response carries both the product info and the first page of
  // reviews, so it is reused instead of requesting the same page twice.
  let json = await getJson();
  const results = {
    productResults: json.product_results,
    reviewsResult: [],
  };

  while (true) {
    const pageReviews = json.reviews_results?.reviews;
    // A missing or empty page means there is nothing more to fetch.
    if (!pageReviews?.length) break;

    results.reviewsResult.push(...pageReviews);
    if (results.reviewsResult.length >= reviewsLimit) break;

    params.start = (params.start ?? 0) + 10;
    json = await getJson();
  }

  // The last page may overshoot the limit; keep only the requested amount.
  results.reviewsResult = results.reviewsResult.slice(0, reviewsLimit);
  return results;
};
// Run the collection and print the whole result tree; report a failure and set
// a non-zero exit code instead of leaving the rejection unhandled.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
代码说明
首先,我们需要从 google-search-results-nodejs 库中声明 SerpApi,并使用您在 SerpApi 的 API key 定义新的 search 实例:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);
接下来,我们编写发起请求所需的参数,并设置要获取的评论数量(reviewsLimit 常量):
// Upper bound on the number of reviews to collect (fixed value, for demonstration only).
const reviewsLimit = 100;

// Request parameters sent with every search call.
const params = {
  // ID of the product you want to get the results for
  product_id: "8757849604759505625",
  // search engine
  engine: "google_product",
  // device used to get the results: "desktop" (default), "tablet" or "mobile"
  device: "desktop",
  // language to use for the Google search
  hl: "en",
  // country to use for the Google search
  gl: "us",
  // ask for reviews results
  reviews: true,
};
接下来,我们从Serpapi库中包装搜索方法,以便进一步处理搜索结果:
// Promise adapter around the callback-style search.json call:
// resolves with whatever the library hands to its callback for the current `params`.
const getJson = () =>
  new Promise((done) => {
    search.json(params, (payload) => done(payload));
  });
最后,我们声明函数 getResults,它从页面获取数据并将其返回:
const getResults = async () => {
...
};
在这个函数中,我们获取包含结果的 json,把其中的 product_results 数据添加到 results 对象的 productResults 键中,并返回 results:
const json = await getJson();
const results = {};
results.productResults = json.product_results;
...
return results;
接下来,我们需要在 results 对象中添加一个空的 reviewsResult 数组,并使用 while 循环(while (true))获取 json,添加每一页的 reviews 结果,并设置下一页的起始索引(params.start 的值)。如果页面上没有更多 reviews 结果,或者已获取的评论数量超过 reviewsLimit,我们就停止循环(使用 break):
results.reviewsResult = [];
while (true) {
const json = await getJson();
if (json.reviews_results?.reviews) {
results.reviewsResult.push(...json.reviews_results.reviews);
params.start ? (params.start += 10) : (params.start = 10);
} else break;
if (results.reviewsResult.length > reviewsLimit) break;
}
之后,我们运行 getResults 函数,并使用 console.dir 方法在控制台中打印所有获取到的信息;该方法允许通过传入带有相应参数的对象来更改默认输出选项:
// Run the collection and print the whole result tree; report a failure and set
// a non-zero exit code instead of leaving the rejection unhandled.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
输出
{
"productResults":{
"product_id":8757849604759506000,
"title":"Apple iPhone 14 Pro Max - 128 GB - Space Black - Unlocked",
"reviews":748,
"rating":4.3
},
"reviewsResult":[
{
"position":1,
"title":"13 Pro Max better in almost every way",
"date":"October 11, 2022",
"rating":2,
"source":"Cody LaRocque · Review provided by Google",
"content":"Great if you’re coming from an 11 or below. I upgraded from my 13 Pro Max as I do every year. Needless to say I am extremely underwhelmed and pretty dissatisfied with this years IPhone. My 13 pro max was better in almost every way, battery life being the major hit to me. \n""+""\n""+""No heavy gaming, streaming etc just daily text, call email etc. avg 4 hours per day screen-time. My 13 lasted almost a day and a half at this rate; my 14, I find myself needing a charge before I’m even off from my shift at work.\n""+""\n""+""The dynamic island is an over marketed, over hyped piece of useless software and does not function as cool as Apple made it appear. \n""+""\n""+""Always on display was fun for about 2 minutes setting it up, then immediately being turned always off because it’s way too bright and sucks power like you wouldn’t believe.\n""+""\n""+""All of apples key selling points are all the keys reasons I dislike this phone. Always on is a nightmare, battery life is a joke, dynamic island is useless, crash detection goes off on roller coasters, the cameras have very very little upside differences, the brightness of the screen only lasts for a couple seconds until it auto dims to conserve energy. \n""+""\n""+""I also feel like the overall build quality is lacking, I purchased the phone and the Apple leather case; this being the first time I’ve ever even used a case on my iPhone. My 13 lasted a year being dropped multiple times and didn’t even have a scratch. I dropped my 14 face down on a flat floor with the Apple leather case and it chipped the front corner. \n""+""\n""+""If you are coming from the 13. Don’t bother upgrading. If your coming from a 12 or below, consider upgrading to the now discounted 13 "
},
... and other reviews
]
}
链接
如果您希望本文补充其他功能,或者想看到用 SerpApi 完成的某些项目,请给我留言(write me a message)。
提交 Feature Request 或 Bug 报告