将要抓取的内容
完整代码
如果您不需要解释,可以直接查看在线IDE中的完整代码示例(the full code example in the online IDE)。
// Puppeteer entry point from puppeteer-extra; plugins are registered on it via use() below.
const puppeteer = require("puppeteer-extra");
// Stealth plugin; the article uses it so that websites do not detect the automated browser.
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
// Register the plugin before any browser is launched.
puppeteer.use(StealthPlugin());
const serchQuery = "pizza"; // Parameter defines the query you want to search
const location = "Seattle, WA"; // Parameter defines from where you want the search to originate

// Each value is interpolated into the query string below, so it is escaped with
// encodeURIComponent. encodeURI leaves reserved characters such as "&", "=", "#"
// and "," untouched, which would corrupt the URL for inputs like "fish & chips".
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

// Yelp search URL built from the escaped query and location.
const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;
/**
 * Collects the search filters offered in the Yelp filter side panel.
 *
 * Price and distance options are read directly from the panel. Every other
 * category is read from the modal that opens when its link in the panel is clicked.
 *
 * @param {object} page - Puppeteer page; expected to already show a Yelp search results page.
 * @param {object} [options]
 * @param {number} [options.modalDelayMs=2000] - Pause (ms) after opening and after closing each modal.
 * @returns {Promise<Object<string, Array<{text: string, value: string}>>>} Filters keyed by category name.
 */
async function getFiltersFromPage(page, { modalDelayMs = 2000 } = {}) {
  // page.waitForTimeout() was removed in Puppeteer v22, so pause with a plain timer instead.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const priceAndDistance = await page.evaluate(() => {
    const sections = Array.from(
      document.querySelectorAll("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div")
    );
    return sections.reduce((result, el) => {
      // A section without a second child block is treated as the price section.
      if (!el.querySelector(":scope > div > div:nth-child(2)")) {
        return {
          ...result,
          price: Array.from(el.querySelectorAll(":scope > div > div:nth-child(1) button")).map((priceButton) => {
            const text = priceButton.querySelector("span").textContent;
            return {
              text,
              // The value suffix is the length of the button text (e.g. "$$" -> 2).
              value: `RestaurantsPriceRange2.${text.length}`,
            };
          }),
        };
      }
      const filterTitle = el.querySelector(":scope > div > div:nth-child(1) p").textContent;
      if (filterTitle !== "Distance") {
        // Other categories are collected later from their modals.
        return result;
      }
      return {
        ...result,
        distance: Array.from(el.querySelectorAll(":scope > div > div:nth-child(2) label")).map((label) => ({
          text: label.querySelector("span").textContent,
          value: label.querySelector("input").value,
        })),
      };
    }, {});
  });

  const filters = { ...priceAndDistance };
  const seeAllButtons = await page.$$("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div a");

  // `const` keeps the loop variable local. The bare `button` created an implicit
  // global and throws a ReferenceError in strict mode / ES modules.
  for (const button of seeAllButtons) {
    await button.click();
    await pause(modalDelayMs);
    // The category key is the second word of the modal heading, lower-cased.
    const filterTitle = await page.evaluate(() =>
      document
        .querySelector("#modal-portal-container div[aria-modal] div[role='presentation'] h4")
        .textContent.split(" ")[1]
        .toLowerCase()
    );
    filters[filterTitle] = await page.evaluate(() => {
      return Array.from(
        document.querySelectorAll("#modal-portal-container div[aria-modal] div[role='presentation'] li")
      ).map((item) => ({
        text: item.querySelector("span").textContent,
        value: item.querySelector("input").value,
      }));
    });
    await page.click("#modal-portal-container div[aria-modal] div[role='presentation'] button[aria-label='Close']");
    await pause(modalDelayMs);
  }
  return filters;
}
/**
 * Launches a browser, opens the Yelp search URL and returns the filters found on the page.
 *
 * @returns {Promise<object>} The object produced by getFiltersFromPage().
 */
async function getFilters() {
  const browser = await puppeteer.launch({
    // Headless by default, as the inline note and the explanation section describe.
    headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    // setViewport() returns a promise; await it so the viewport is applied before navigating.
    await page.setViewport({ width: 1600, height: 800 });
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);
    return await getFiltersFromPage(page);
  } finally {
    // Always release the browser process, even when navigation or scraping throws.
    await browser.close();
  }
}
// Run the scraper and print the complete nested result.
getFilters()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    // Report failures instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
  });
准备
首先,我们需要创建一个Node.js* 项目,并添加npm包puppeteer、puppeteer-extra和puppeteer-extra-plugin-stealth,以便通过DevTools Protocol在headless(无头)或非无头模式下控制Chromium(也可以是Chrome或Firefox,但这里我们只使用Chromium)。
为此,在我们项目的目录中,打开命令行并输入:
$ npm init -y
,然后:
$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
*如果您没有安装node.js,则可以download it from nodejs.org并遵循安装documentation。
注意:您也可以不带任何扩展直接使用puppeteer,
但是我强烈建议将其与puppeteer-extra
和puppeteer-extra-plugin-stealth一起使用,
以防止网站检测到您正在使用无头Chromium或web driver。您可以在Chrome headless tests website上检查它。下面的屏幕截图显示了差异。
Process
我们需要从HTML元素中提取数据。借助SelectorGadget Chrome extension,获取合适的CSS选择器非常容易:只需在浏览器中单击所需的元素即可得到对应的CSS选择器。但是,它并不总是完美地工作,尤其是当网站大量使用JavaScript时。
如果您想了解更多有关它们的信息,我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。
下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。
代码说明
从puppeteer-extra库中声明用于控制Chromium浏览器的puppeteer,
并从puppeteer-extra-plugin-stealth库中声明StealthPlugin,
以防止网站检测到您正在使用web driver:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
接下来,我们“告诉”puppeteer
使用StealthPlugin
,写下我们要搜索的内容(serchQuery
常量)、搜索位置和搜索URL,并对搜索参数进行URI编码:
puppeteer.use(StealthPlugin());
const serchQuery = "pizza"; // Parameter defines the query you want to search
const location = "Seattle, WA"; // Parameter defines from where you want the search to originate

// Each value is interpolated into the query string below, so it is escaped with
// encodeURIComponent. encodeURI leaves reserved characters such as "&", "=", "#"
// and "," untouched, which would corrupt the URL for inputs like "fish & chips".
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

// Yelp search URL built from the escaped query and location.
const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;
接下来,我们编写一个函数以从页面中获取过滤器:
async function getFiltersFromPage(page) {
...
}
然后,我们从页面上下文(使用page.evaluate方法)获得价格和距离过滤器信息,并将其保存在priceAndDistance
对象中:
const priceAndDistance = await page.evaluate(() => {
...
});
接下来,我们需要从所有匹配"aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div"
选择器的元素(querySelectorAll方法)中创建并返回一个新数组(Array.from方法),并使用reduce方法把该数组转换为一个对象:
return Array.from(document.querySelectorAll("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div")).reduce((result, el) => {
...
}, {});
在reduce
方法中,我们检查是否不存在":scope > div > div:nth-child(2)"
选择器对应的元素(使用querySelector方法);如果不存在,我们返回price
过滤器(使用querySelectorAll方法和textContent属性)。
否则(else
语句),我们获取过滤类别的标题,并且只返回“Distance”(距离)过滤器,因为其他类别的过滤器只显示了一部分、其余被隐藏,我们稍后再获取它们:
// A section without a second child block is treated as the price section.
if (!el.querySelector(":scope > div > div:nth-child(2)")) {
return {
...result,
// One entry per price button; the value suffix is the length of the button text (e.g. "$$" -> 2).
price: Array.from(el.querySelectorAll(":scope > div > div:nth-child(1) button")).map((el) => {
const text = el.querySelector("span").textContent;
return {
text,
value: `RestaurantsPriceRange2.${text.length}`,
};
}),
};
} else {
// Otherwise read the section title and keep only the "Distance" section.
const filterTitle = el.querySelector(":scope > div > div:nth-child(1) p").textContent;
if (filterTitle === "Distance") {
return {
...result,
// Each label contributes its visible text and the value of its input element.
distance: Array.from(el.querySelectorAll(":scope > div > div:nth-child(2) label")).map((el) => ({
text: el.querySelector("span").textContent,
value: el.querySelector("input").value,
})),
};
// Any other section leaves the accumulator unchanged.
} else return result;
}
接下来,我们把priceAndDistance
的内容写入filters常量(使用展开语法,spread syntax)
,并使用page.$$方法获取其他过滤器类别的“查看所有”按钮:
// Start from a shallow copy of the price/distance data.
const filters = { ...priceAndDistance };
// Element handles for every link inside the filter panel sections.
const seeAllButtons = await page.$$("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div a");
接下来,我们需要遍历seeAllButtons
(使用for...of循环),单击每个按钮(click方法),等待2秒,获取过滤器类别标题,并把页面中的过滤器以该标题为键添加到filters
对象。然后,我们单击“关闭”按钮(page.click方法),再等待2秒钟,然后对其他类别重复该循环。
要从页面获取数据,我们使用以下方法:
// `const` keeps the loop variable local. The bare `button` created an implicit
// global and throws a ReferenceError in strict mode / ES modules.
for (const button of seeAllButtons) {
  await button.click();
  // page.waitForTimeout() was removed in Puppeteer v22, so pause with a plain timer instead.
  await new Promise((resolve) => setTimeout(resolve, 2000));
  // The category key is the second word of the modal heading, lower-cased.
  const filterTitle = await page.evaluate(() =>
    document.querySelector("#modal-portal-container div[aria-modal] div[role='presentation'] h4").textContent.split(" ")[1].toLowerCase()
  );
  filters[filterTitle] = await page.evaluate(() => {
    return Array.from(document.querySelectorAll("#modal-portal-container div[aria-modal] div[role='presentation'] li")).map((el) => ({
      text: el.querySelector("span").textContent,
      value: el.querySelector("input").value,
    }));
  });
  await page.click("#modal-portal-container div[aria-modal] div[role='presentation'] button[aria-label='Close']");
  await new Promise((resolve) => setTimeout(resolve, 2000));
}
接下来,编写一个函数来控制浏览器并获取信息:
async function getOrganicResults() {
...
}
首先,在此函数中,我们需要使用puppeteer.launch({options})
方法并传入当前的options
来定义browser
,例如headless: true
和args: ["--no-sandbox", "--disable-setuid-sandbox"]
。
这些选项表示我们使用headless(无头)模式,并传入一个参数(arguments)数组,以便允许在在线IDE中启动浏览器进程。然后我们打开一个新的page
,并设置页面视口分辨率(setViewport方法)以显示过滤器面板:
const browser = await puppeteer.launch({
  headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
// setViewport() returns a promise; await it so the viewport is applied before navigating.
await page.setViewport({ width: 1600, height: 800 });
接下来,为了适应较慢的网络连接,我们使用setDefaultNavigationTimeout方法把默认的导航等待时间(30秒)改为60000毫秒(1分钟),然后使用goto方法访问URL
:
// Set the navigation timeout to 60000 ms before loading the search URL.
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
最后,我们从页面上获取过滤器,关闭浏览器,然后返回收到的数据:
// Scrape the filters, then close the browser before handing the data back.
const filters = await getFiltersFromPage(page);
await browser.close();
return filters;
现在我们可以启动我们的解析器:
$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file
输出
{
"price":[
{
"text":"$",
"value":"RestaurantsPriceRange2.1"
},
{
"text":"$$",
"value":"RestaurantsPriceRange2.2"
},
... and other items
],
"distance":[
{
"text":"Bird's-eye View",
"value":"g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282"
},
{
"text":"Driving (5 mi.)",
"value":"g:-122.38666534423828,47.590651847264034,-122.28435516357422,47.6600691664467"
},
... and other items
],
"categories":[
{
"text":"Restaurants",
"value":"restaurants"
},
{
"text":"Pizza",
"value":"pizza"
},
... and other items
],
"features":[
{
"text":"Reservations",
"value":"OnlineReservations"
},
{
"text":"Waitlist",
"value":"OnlineWaitlistReservation"
},
... and other items
],
"neighborhoods":[
{
"text":"Admiral",
"value":"WA:Seattle::Admiral"
},
{
"text":"Alki",
"value":"WA:Seattle::Alki"
},
... and other items
]
}
如何应用过滤器
您可以通过以下URL对Yelp搜索应用过滤器,只需在我们的Web scraping Yelp Organic Results with Nodejs和Web scraping Yelp Ads Results with Nodejs博客文章的DIY解决方案部分中更改searchParams
常量即可:
const serchQuery = "pizza"; // Parameter defines the query you want to search
const location = "Seattle, WA"; // Parameter defines from where you want the search to originate
// The three filter constants below were missing their "=" and did not parse.
const priceAndFeaturesFilter = "RestaurantsPriceRange2.1,OnlineReservations"; // for price and features filters
const categoryFilter = "restaurants"; // for category filters
const locationFilter = "g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282"; // for neighborhoods or distance filters (distance and neighborhoods filters can't be used together)

// Values are placed inside a query string, so they are escaped with encodeURIComponent.
// encodeURI would leave reserved characters such as "&", "=", ":" and "," untouched.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
  priceAndFeaturesFilter: encodeURIComponent(priceAndFeaturesFilter),
  categoryFilter: encodeURIComponent(categoryFilter),
  locationFilter: encodeURIComponent(locationFilter),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}&attrs=${searchParams.priceAndFeaturesFilter}&cflt=${searchParams.categoryFilter}&l=${searchParams.locationFilter}`;
使用SerpApi的Yelp Filters API
本节是为了显示DIY解决方案与我们的解决方案之间的比较。
最大的区别是您不需要从头开始创建解析器并维护它。
请求也有可能在某个时候被Google阻止,我们会在后端处理这种情况,因此您无需自己弄清楚该如何应对,也无需考虑要使用哪个验证码服务或代理提供商。
首先,我们需要安装google-search-results-nodejs:
npm i google-search-results-nodejs
这是full code example,如果您不需要说明:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
// Request parameters sent with the SerpApi search.
const params = {
  // search engine
  engine: "yelp",
  // Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  device: "desktop",
  // Parameter defines from where you want the search to originate.
  find_loc: "Seattle, WA",
  // Parameter defines the query you want to search
  find_desc: "pizza",
};
// Adapts the callback-style search.json() call to a promise that resolves with the response.
const getJson = () =>
  new Promise((deliver) => {
    search.json(params, deliver);
  });
// Fetches the search response and returns only its "filters" property.
const getResults = async () => {
  const { filters } = await getJson();
  return filters;
};
// Run the request and print the complete nested result.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    // Report failures instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
  });
代码说明
首先,我们需要从google-search-results-nodejs库中声明SerpApi
,并使用SerpApi的API密钥定义新的search
实例:
const SerpApi = require("google-search-results-nodejs");
// Read the key from the environment, as the full listing does; a bare API_KEY identifier is undefined.
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
接下来,我们为提出请求的必要参数编写:
// Request parameters sent with the SerpApi search.
const params = {
  // search engine
  engine: "yelp",
  // Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  device: "desktop",
  // Parameter defines from where you want the search to originate.
  find_loc: "Seattle, WA",
  // Parameter defines the query you want to search
  find_desc: "pizza",
};
接下来,我们从Serpapi库中包装搜索方法,以便进一步处理搜索结果:
// Adapts the callback-style search.json() call to a promise that resolves with the response.
const getJson = () =>
  new Promise((deliver) => {
    search.json(params, deliver);
  });
最后,我们声明从页面获取数据并返回的函数getResults
:
const getResults = async () => {
...
};
在此函数中,我们获得带有results(结果)的json
,然后从接收到的json
中返回filters
:
// Await the promise-wrapped search and keep only the "filters" property of the response.
const json = await getJson();
return json.filters;
之后,我们运行getResults
函数,并使用console.dir方法在控制台中打印所有接收到的信息,该方法允许您通过传入带有必要参数的对象来更改默认输出选项:
// Run the request and print the complete nested result.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    // Report failures instead of leaving an unhandled promise rejection.
    console.error(error);
    process.exitCode = 1;
  });
输出
{
"neighborhoods":{
"value":"p:WA:Seattle::",
"list":[
{
"text":"Waterfront",
"value":"Waterfront"
},
{
"text":"Fremont",
"value":"Fremont"
},
... and other items
]
},
"distance":[
{
"text":"Bird's-eye View",
"value":"g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282"
},
{
"text":"Driving (5 mi.)",
"value":"g:-122.38666534423828,47.590651847264034,-122.28435516357422,47.6600691664467"
},
... and other items
],
"price":[
{
"text":"$",
"value":"RestaurantsPriceRange2.1"
},
{
"text":"$$",
"value":"RestaurantsPriceRange2.2"
},
... and other items
],
"category":[
{
"text":"Cheesesteaks",
"value":"cheesesteaks"
},
{
"text":"Middle Eastern",
"value":"mideastern"
},
... and other items
],
"features":[
{
"text":"Waiter Service",
"value":"RestaurantsTableService"
},
{
"text":"Open to All",
"value":"BusinessOpenToAll"
},
... and other items
]
}
如何应用过滤器
您可以通过更改Web scraping Yelp Organic Results with Nodejs和Web scraping Yelp Ads Results with Nodejs博客文章的SerpApi解决方案部分中的params
常量,对Yelp搜索应用过滤器:
// Request parameters for the SerpApi search, including the optional filter parameters.
const params = {
  // search engine
  engine: "yelp",
  // Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  device: "desktop",
  // Parameter defines from where you want the search to originate.
  find_loc: "Seattle, WA",
  // Parameter defines the query you want to search
  find_desc: "pizza",
  // for category filters
  cflt: "restaurants",
  // for price and features filters
  attrs: "RestaurantsPriceRange2.1,OnlineReservations",
  // for neighborhoods or distance filters (distance and neighborhoods filters can't be used together)
  l: "g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282",
};
链接
如果您想在此博客文章中补充其他内容,或者希望看到用SerpApi完成的某些项目,请给我留言(write me a message)。
添加Feature Request或Bug