将要抓取的内容
diy代码
如果您不需要解释,可以直接查看在线 IDE 中的完整代码示例(the full code example in the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Search inputs. The spelling `serchQuery` is kept because the rest of the article refers to it by that name.
const serchQuery = "kfc"; // Parameter defines the query you want to search
const location = "Seattle, WA"; // Parameter defines from where you want the search to originate
const resultsLimit = 50; // hardcoded limit for demonstration purpose

// encodeURIComponent (not encodeURI) is the right encoder for query-string VALUES:
// encodeURI leaves reserved characters such as "&", "=", "+", "#" and "," untouched,
// so a query like "fish & chips" would be split into a second, bogus URL parameter.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

// Final Yelp search URL built from the encoded values above.
const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;
/**
 * Collects the sponsored (ads) and organic results from the Yelp search page currently loaded in `page`.
 *
 * The callback passed to `page.evaluate` only uses DOM APIs and helpers declared inside the callback itself.
 * Missing optional parts of a card (price, rating, thumbnail, snippet, ...) yield `undefined`
 * for that field instead of throwing, so one incomplete card does not abort the whole page.
 *
 * @param {object} page - Puppeteer page that is already showing a Yelp search results page.
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>} Results in page order.
 */
async function getResultsFromPage(page) {
  return await page.evaluate(() => {
    const YELP_ORIGIN = "https://www.yelp.com";
    const adsResults = [];
    const organicResults = [];
    // Section headings and result cards are siblings in the same list, so the most recent
    // heading decides which bucket the cards that follow it belong to.
    let isAds = false;

    // Ad links are redirects; the business URL travels in the `redirect_url` query parameter.
    // Falls back to the ad link itself when that parameter is absent.
    const getAdReviewsLink = (link) => {
      const marker = "redirect_url=";
      const start = link.indexOf(marker);
      if (start === -1) return `${link}#reviews`;
      const end = link.indexOf("&", start);
      const encodedTarget = link.slice(start + marker.length, end === -1 ? undefined : end);
      return `${decodeURIComponent(encodedTarget)}#reviews`;
    };

    // Builds { "<option label>": <has checkmark icon> } from the option list at the bottom of a card.
    const getServiceOptions = (card) => {
      const options = {};
      for (const optionEl of card.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")) {
        const label = optionEl.querySelector("span > p")?.textContent;
        if (!label) continue; // an unlabeled entry cannot be reported meaningfully
        options[label] = Boolean(optionEl.querySelector("div > span[role='img']")?.classList.contains("icon--16-checkmark-v2"));
      }
      return options;
    };

    for (const el of document.querySelectorAll("ul > li > div")) {
      const title = el.querySelector(":scope > h2")?.textContent;
      if (title) {
        // A heading element: switch bucket and move on, it carries no result card.
        isAds = title.includes("Sponsored");
        continue;
      }

      const result = el.querySelector("[data-testid='serp-ia-card']");
      const titleAnchor = result?.querySelector("h3 a");
      if (!titleAnchor) continue; // not a result card (or a card without a business link)

      const link = `${YELP_ORIGIN}${titleAnchor.getAttribute("href")}`;
      const fullAddress = result.querySelector(`p > span:${isAds ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");

      // `srcset` lists the same image in several resolutions; the last entry is taken as the best one.
      const thumbnails = result.querySelector(":scope > div > div:nth-child(1) a > img")?.getAttribute("srcset")?.split(", ");
      const bestResolutionThumbnail = thumbnails?.[thumbnails.length - 1].split(" ")[0];

      // Compare trimmed text so surrounding whitespace cannot let the "more" expander link through.
      const categories = Array.from(result.querySelectorAll("p > span:nth-child(1) > a"))
        .filter((anchor) => anchor.textContent.trim() !== "more")
        .map((anchor) => ({
          title: anchor.textContent,
          link: `${YELP_ORIGIN}${anchor.getAttribute("href")}`,
        }));

      const rating = result.querySelector("span > div[role='img']")?.getAttribute("aria-label")?.split(" ")[0];

      const entry = {
        title: titleAnchor.textContent,
        link,
        reviewsLink: isAds ? getAdReviewsLink(link) : `${link}#reviews`,
        categories,
        // `price` is reported for organic results only, as before.
        ...(isAds ? {} : { price: result.querySelector("p > span:nth-child(2)")?.textContent }),
        rating,
        // The review count is only read when a rating was found.
        reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
        address: fullAddress?.[0],
        neighborhoods: fullAddress?.[1],
        // The snippet paragraph sits in the 2nd or 3rd inner block; the last 6 characters are cut off
        // (presumably the trailing "more" link text - TODO confirm against the live markup).
        snippet: (
          result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
          result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
        )?.textContent
          .trim()
          .slice(0, -6),
        serviceOptions: getServiceOptions(result),
        thumbnail: bestResolutionThumbnail,
      };

      (isAds ? adsResults : organicResults).push(entry);
    }

    return { adsResults, organicResults };
  });
}
/**
 * Launches a browser, opens the search URL and walks through the result pages, accumulating
 * ad and organic results until there is no "Next" link or at least `resultsLimit` results
 * (ads + organic) have been collected.
 *
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>} Everything collected across pages.
 */
async function getOrganicResults() {
  const browser = await puppeteer.launch({
    headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  // try/finally guarantees the browser process is shut down even when navigation or scraping throws.
  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);

    const adsResults = [];
    const organicResults = [];
    while (true) {
      await page.waitForSelector("[data-testid='serp-ia-card']");
      const resultsFromPage = await getResultsFromPage(page);
      adsResults.push(...resultsFromPage.adsResults);
      organicResults.push(...resultsFromPage.organicResults);

      const isNextPage = await page.$("a[aria-label='Next']");
      if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;

      await page.click("a[aria-label='Next']");
      // Fixed 3 s pause so the next page can render. A plain timer is used because
      // page.waitForTimeout is not available in current Puppeteer releases.
      await new Promise((resolve) => setTimeout(resolve, 3000));
    }

    return { adsResults, organicResults };
  } finally {
    await browser.close();
  }
}
// Run the scraper and print the complete result tree; report a failure and signal it through
// the exit code instead of leaving the promise rejection unhandled.
getOrganicResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
准备
首先,我们需要创建一个 Node.js* 项目,并添加 npm 包 puppeteer、puppeteer-extra 和 puppeteer-extra-plugin-stealth,以便通过 DevTools Protocol 在有头(headful)或无头(headless)模式下控制 Chromium(也可以是 Chrome 或 Firefox,但这里我们只使用 Chromium)。
为此,在我们项目的目录中,打开命令行并输入:
$ npm init -y
,然后:
$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
*如果您尚未安装 Node.js,可以从 nodejs.org 下载(download it from nodejs.org),并按照安装文档(documentation)进行安装。
📌注意:您也可以不借助任何扩展直接使用 puppeteer,但我强烈建议将其与 puppeteer-extra 和 puppeteer-extra-plugin-stealth 一起使用,以防止网站检测到您正在使用无头 Chromium 或 web driver。您可以在 Chrome headless tests website 上进行检查。下面的屏幕截图显示了两者的差异。
Process
我们需要从 HTML 元素中提取数据。借助 SelectorGadget Chrome extension,获取合适的 CSS 选择器非常容易:只需在浏览器中单击所需的元素,它就会给出对应的 CSS 选择器。但是,它并不总是能完美地工作,尤其是当网站大量使用 JavaScript 时。
如果您想了解更多有关它们的信息,我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。
下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。
代码说明
从 puppeteer-extra 库中声明用于控制 Chromium 浏览器的 puppeteer,并从 puppeteer-extra-plugin-stealth 库中声明 StealthPlugin,以防止网站检测到您正在使用 web driver:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
接下来,我们“告诉” puppeteer 使用 StealthPlugin,写下要搜索的内容(serchQuery 常量)和搜索位置(location 常量),设置想要获取的结果数量(resultsLimit 常量),并定义搜索 URL 以及经过 URI 编码的搜索参数:
puppeteer.use(StealthPlugin());
// Search inputs. The spelling `serchQuery` is kept because the rest of the article refers to it by that name.
const serchQuery = "kfc"; // Parameter defines the query you want to search
const location = "Seattle, WA"; // Parameter defines from where you want the search to originate
const resultsLimit = 50; // hardcoded limit for demonstration purpose

// encodeURIComponent (not encodeURI) is the right encoder for query-string VALUES:
// encodeURI leaves reserved characters such as "&", "=", "+", "#" and "," untouched,
// so a query like "fish & chips" would be split into a second, bogus URL parameter.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

// Final Yelp search URL built from the encoded values above.
const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;
接下来,我们编写一个函数以从页面中获取有机结果:
async function getResultsFromPage(page) {
...
}
然后,我们从页面上下文中获取信息(使用 evaluate 方法),并将其保存在返回的对象中:
return await page.evaluate(() => ({
...
}));
接下来,我们需要声明 isAds 变量来区分自然结果和广告结果,然后声明空数组 adsResults 和 organicResults,并获取页面上所有匹配 "ul > li > div" 选择器的元素(querySelectorAll 方法),将其转换为数组(Array.from 方法)后逐个遍历(forEach 方法):
let isAds = false;
const adsResults = [];
const organicResults = [];
Array.from(document.querySelectorAll("ul > li > div")).forEach((el) => {
...
});
由于分区标题和结果位于页面上同一层级的 HTML 元素中,我们需要获取 title 和 result(使用 querySelector 方法),并编写条件将 isAds 变量设置为 true 或 false(使用 includes 方法);只有当前元素不是 title 且包含 result 时,我们才继续处理:
const title = el.querySelector(":scope > h2")?.textContent;
const result = el.querySelector("[data-testid='serp-ia-card']");
if (title && title.includes("Sponsored")) isAds = true;
if (title && !title.includes("Sponsored")) isAds = false;
if (!title && result) {
...
}
要使返回的结果对象我们需要定义link
,fullAddress
,categories
,rating
并在所有分辨率中获取thumbnails
。然后,我们获得了最后一个分辨率链接 - 这是最好的:
const link = `https://www.yelp.com${result.querySelector("h3 a").getAttribute("href")}`;
const fullAddress = result.querySelector(`p > span:${isAds ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");
const thumbnails = result.querySelector(":scope > div > div:nth-child(1) a > img").getAttribute("srcset").split(", ");
const bestResolutionThumbnail = thumbnails[thumbnails.length - 1].split(" ")[0];
const categories = Array.from(result.querySelectorAll("p > span:nth-child(1) > a")).map((el) => {
if (el.textContent === "more") return null;
return {
title: el.textContent,
link: `https://www.yelp.com${el.getAttribute("href")}`,
};
});
const rating = result.querySelector("span > div[role='img']")?.getAttribute("aria-label").split(" ")?.[0];
接下来,我们需要检查当前元素是否属于广告(isAds):如果是,就把从页面各部分获取的数据添加(push 方法)到 adsResults 数组,否则把结果添加到 organicResults。我们可以通过以下方式获取结果:
if (isAds) {
adsResults.push({
title: result.querySelector("h3 a").textContent,
link,
reviewsLink: `${decodeURIComponent(link.slice(link.indexOf("redirect_url") + 13, link.indexOf("&request_id")))}#reviews`,
categories: categories.filter((el) => el),
rating,
reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
address: fullAddress?.[0],
neighborhoods: fullAddress?.[1],
snippet: (
result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
)?.textContent
.trim()
.slice(0, -6),
serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
return {
...result,
[`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
};
}, {}),
thumbnail: bestResolutionThumbnail,
});
} else {
organicResults.push({
title: result.querySelector("h3 a").textContent,
link,
reviewsLink: `${link}#reviews`,
categories: categories.filter((el) => el),
price: result.querySelector("p > span:nth-child(2)").textContent,
rating,
reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
address: fullAddress?.[0],
neighborhoods: fullAddress?.[1],
snippet: (
result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
)?.textContent
.trim()
.slice(0, -6),
serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
return {
...result,
[`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
};
}, {}),
thumbnail: bestResolutionThumbnail,
});
}
接下来,编写一个函数来控制浏览器并获取信息:
async function getOrganicResults() {
...
}
首先,在此函数中,我们需要使用 puppeteer.launch({options}) 方法并传入当前的 options 来定义 browser,例如 headless: true 和 args: ["--no-sandbox", "--disable-setuid-sandbox"]。
这些选项意味着我们使用 headless 模式,并传入一个参数(arguments)数组,用于允许在在线 IDE 中启动浏览器进程。然后我们打开一个新的 page:
const browser = await puppeteer.launch({
headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
接下来,为了适应较慢的网络连接,我们使用 .setDefaultNavigationTimeout() 方法将默认的导航等待时间(30 秒)改为 60000 毫秒(1 分钟),使用 .goto() 方法访问 URL,并定义用于保存结果的数组:
// Allow up to 60 s for navigation, then open the search page.
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
// Accumulators for the pagination loop; it pushes into these two arrays, not a single `results` array.
const adsResults = [];
const organicResults = [];
接下来,我们使用 while 循环,在其中使用 .waitForSelector() 方法等待选择器加载完成,并将当前页面的结果添加到 adsResults 和 organicResults 数组(使用 push 方法和展开语法)。如果页面上存在下一页按钮(.$() 方法),并且结果数量少于 resultsLimit,我们就点击下一页按钮元素(.click() 方法)并等待 3 秒;否则停止循环(使用 break):
// Keep collecting page by page until there is no "Next" link or enough results have been gathered.
while (true) {
  await page.waitForSelector("[data-testid='serp-ia-card']");
  const resultsFromPage = await getResultsFromPage(page);
  adsResults.push(...resultsFromPage.adsResults);
  organicResults.push(...resultsFromPage.organicResults);

  const isNextPage = await page.$("a[aria-label='Next']");
  if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;

  await page.click("a[aria-label='Next']");
  // Fixed 3 s pause so the next page can render. A plain timer is used because
  // page.waitForTimeout is not available in current Puppeteer releases.
  await new Promise((resolve) => setTimeout(resolve, 3000));
}
最后,我们关闭浏览器,然后返回收到的数据:
await browser.close();
return { adsResults, organicResults };
现在我们可以启动我们的解析器:
$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file
输出
{
"adsResults": [
{
"title":"Pizza Hut",
"link":"https://www.yelp.com/adredir?ad_business_id=LuJTYRXHOuNBvmH2q_Pnhw&campaign_id=6a6Jneapwf8y0J5-2OI0UQ&click_origin=search_results&placement=above_search&placement_slot=0&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fpizza-hut-seattle-5&request_id=ed15d90f0b858297&signature=9a5b8ff57fd33b93fa11a7ccd12cfbf4268e7325970c0ac43238160bf2fe4f50&slot=0",
"reviewsLink":"https://www.yelp.com/biz/pizza-hut-seattle-5#reviews",
"categories":[
{
"title":"Chicken Wings",
"link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
},
{
"title":"Pizza",
"link":"https://www.yelp.com/search?cflt=pizza&find_loc=Seattle%2C+WA"
},
{
"title":"Fast Food",
"link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
}
],
"address":"",
"snippet":"“We ordered a pizza and wings for carry out tonight with special instructions for how to cook the wings. When my husband picked up the order, the wings weren't right. I want to give…",
"serviceOptions":{
"Delivery":true,
"Takeout":true
},
"thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/c-BNJn-PnEQedrtt4NPvYw/1000s.jpg"
},
...and other results
],
"organicResults": [
{
"title":"KFC",
"link":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
"reviewsLink":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc#reviews",
"categories":[
{
"title":"Fast Food",
"link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
},
{
"title":"Chicken Wings",
"link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
},
{
"title":"Chicken Shop",
"link":"https://www.yelp.com/search?cflt=chickenshop&find_loc=Seattle%2C+WA"
},
{
"title":"more",
"link":"https://www.yelp.com/biz/kfc-seattle-18?hrid=wRKhf8md_ru2OgAz1mrpRg&osq=kfc"
}
],
"price":"$",
"rating":"2",
"reviews":"54",
"address":"KFC - Taco Bell",
"neighborhoods":"Lower Queen Anne",
"snippet":"“I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
"serviceOptions":{
"Delivery":true,
"Takeout":true,
"Curbside Pickup":true
},
"thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/1000s.jpg"
},
...and other results
]
}
使用 SerpApi 获取 Yelp 自然结果(Organic)和广告结果(Ads)
本节是为了显示DIY解决方案与我们的解决方案之间的比较。
最大的区别是您不需要从头开始创建解析器并维护它。
此外,请求也有可能在某个时候被 Google 阻止;我们会在后端处理这一问题,因此您无需自己研究如何应对,也无需考虑该使用哪种验证码解决方案或代理提供商。
首先,我们需要安装 google-search-results-nodejs:
npm i google-search-results-nodejs
这是full code example,如果您不需要解释:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
// Pagination stops once this many results (ads + organic) have been gathered; hardcoded for demonstration.
const resultsLimit = 50;

// Parameters passed to the search request.
const params = {
  // search engine
  engine: "yelp",
  // device to use to get the results: "desktop" (default), "tablet", or "mobile"
  device: "desktop",
  // where the search should originate from
  find_loc: "Seattle, WA",
  // the query to search for
  find_desc: "kfc",
};
// Promise adapter around the callback-style `search.json`: the returned promise resolves
// with whatever value the library hands to its callback for the current `params`.
const getJson = () =>
  new Promise((onResponse) => {
    search.json(params, onResponse);
  });
/**
 * Requests result pages one after another and accumulates their ads and organic results.
 *
 * After every page that has `organic_results`, `params.start` is advanced by 10 (starting from 10)
 * so the next `getJson()` call asks for the following page. The loop ends when a response has no
 * `organic_results` or when the collected total (ads + organic) reaches `resultsLimit`.
 *
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>} Everything collected.
 */
const getResults = async () => {
  const adsResults = [];
  const organicResults = [];

  do {
    const json = await getJson();
    if (!json.organic_results) break;

    if (json.ads_results) {
      adsResults.push(...json.ads_results);
    }
    organicResults.push(...json.organic_results);

    // Offset for the next request: first page has no `start`, later pages move in steps of 10.
    params.start = params.start ? params.start + 10 : 10;
  } while (adsResults.length + organicResults.length < resultsLimit);

  return { adsResults, organicResults };
};
// Fetch everything and print the complete result tree; report a failure and signal it through
// the exit code instead of leaving the promise rejection unhandled.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
代码说明
首先,我们需要从 google-search-results-nodejs 库中声明 SerpApi,并使用您在 SerpApi 获得的 API 密钥定义新的 search 实例:
const SerpApi = require("google-search-results-nodejs");
// The key is read from the API_KEY environment variable; a bare `API_KEY` identifier is not defined anywhere.
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
接下来,我们编写用于提出请求的必要参数,并设置要接收多少结果(resultsLimit
常数):
const resultsLimit = 50; // hardcoded limit for demonstration purpose
const params = {
engine: "yelp", // search engine
device: "desktop", //Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
find_loc: "Seattle, WA", //Parameter defines from where you want the search to originate.
find_desc: "kfc", // Parameter defines the query you want to search
};
接下来,我们从Serpapi库中包装搜索方法,以便进一步处理搜索结果:
const getJson = () => {
return new Promise((resolve) => {
search.json(params, resolve);
});
};
最后,我们声明 getResults 函数,它从页面获取数据并将其返回:
const getResults = async () => {
...
};
在此函数中,我们需要声明空的 adsResults 和 organicResults 数组,并使用 while 循环获取 json,添加每个页面的 ads_results 和 organic_results,并设置下一页的起始索引(params.start 的值)。如果页面上没有更多结果,或者已获取的结果数量不少于 resultsLimit,我们就停止循环(使用 break),并返回包含结果的对象:
const adsResults = [];
const organicResults = [];
while (true) {
const json = await getJson();
if (json.organic_results) {
if (json.ads_results) {
adsResults.push(...json.ads_results);
}
organicResults.push(...json.organic_results);
params.start ? (params.start += 10) : (params.start = 10);
} else break;
if (adsResults.length + organicResults.length >= resultsLimit) break;
}
return { adsResults, organicResults };
之后,我们运行 getResults 函数,并使用 console.dir 方法在控制台中打印所有获取到的信息;该方法允许通过传入带有必要参数的对象来更改默认的输出选项:
getResults().then((result) => console.dir(result, { depth: null }));
输出
{
"adsResults": [
{
"block_position":"top",
"place_ids":[
"ThGZdWIyNOXUeTqMWRmVlw",
"dudez-woodfired-pizza-seattle"
],
"title":"DUDE’Z woodfired pizza",
"link":"https://www.yelp.com/adredir?ad_business_id=ThGZdWIyNOXUeTqMWRmVlw&campaign_id=KjSNa2u5Q-4tz8JKZAiYvg&click_origin=search_results&placement=above_search&placement_slot=0&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fdudez-woodfired-pizza-seattle&request_id=e3869f3c027b5193&signature=bbf93e3aaaae7762d2435d05e5fefee95c31e33c01f9b2b8aad85c660d7d5cfc&slot=0",
"reviews_link":"https://serpapi.com/search.json?engine=yelp_reviews&place_id=ThGZdWIyNOXUeTqMWRmVlw",
"categories":[
{
"title":"Pizza",
"link":"https://www.yelp.com/search?cflt=pizza&find_loc=Seattle%2C+WA"
}
],
"rating":5,
"reviews":1,
"neighborhoods":"Cottage Grove",
"phone":"(360) 803-1616",
"snippet":"These pizzas are so delicious! The guys really take care of their customers. We ordered the Carne Asada Pizza and the awesome G Pop pizza (think jalapeño poppers in the shape of a…",
"service_options":{
"outdoor_seating":false,
"delivery":true,
"takeout":true
},
"thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/cJynIXUZp0OWhSdW3AUoaw/348s.jpg"
},
...and other results
],
"organicResults":
[
{
"position": 1,
"place_ids": ["UON0MxZGG0cgsU5LYPjJbg", "kfc-seattle-18"],
"title": "KFC",
"link": "https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
"reviews_link": "https://serpapi.com/search.json?engine=yelp_reviews&place_id=UON0MxZGG0cgsU5LYPjJbg",
"categories": [
{
"title": "Fast Food",
"link": "https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
},
{
"title": "Chicken Wings",
"link": "https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
},
{
"title": "Chicken Shop",
"link": "https://www.yelp.com/search?cflt=chickenshop&find_loc=Seattle%2C+WA"
}
],
"price": "$",
"rating": 2,
"reviews": 54,
"address": "210 W Mercer St",
"neighborhoods": "Lower Queen Anne",
"phone": "(206) 283-7575",
"snippet": "I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
"service_options": {
"delivery": true,
"takeout": true,
"curbside_pickup": true
},
"thumbnail": "https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/348s.jpg"
},
...and other results
]
}
链接
如果您希望在这篇博客文章中补充其他功能,或者想看到一些使用 SerpApi 制作的项目,请给我留言(write me a message)。
添加一个 Feature Request💫 或 Bug🐞