使用Node.js抓取Yelp过滤器
#node #webscraping #serpapi

将被刮擦

what

完整代码

如果您不需要解释,请看一下the full code example in the online IDE

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const serchQuery = "pizza"; // the query you want to search
const location = "Seattle, WA"; // where you want the search to originate from

// Query-string values must be escaped with encodeURIComponent: encodeURI leaves
// reserved characters such as "&", "=", "," and ":" untouched, so a value that
// contains them would corrupt the query string.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

/**
 * Collects the search filters offered on an already loaded Yelp search page.
 *
 * Price and distance options are read directly from the filter side panel.
 * Every other category is read from the modal that opens when its link in the
 * side panel is clicked; the category key is the second word of the modal
 * heading, lower-cased.
 *
 * @param {object} page - Puppeteer page that has already navigated to the search URL.
 * @returns {Promise<object>} Object whose keys are filter categories and whose
 *   values are arrays of `{ text, value }` options.
 */
async function getFiltersFromPage(page) {
  // Runs in the browser context: walk the side-panel sections and keep price + distance.
  const priceAndDistance = await page.evaluate(() => {
    return Array.from(document.querySelectorAll("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div")).reduce((result, el) => {
      // The section without a second child block is treated as the price section.
      if (!el.querySelector(":scope > div > div:nth-child(2)")) {
        return {
          ...result,
          price: Array.from(el.querySelectorAll(":scope > div > div:nth-child(1) button")).map((priceButton) => {
            const text = priceButton.querySelector("span").textContent;
            return {
              text,
              // The filter value is derived from the length of the label ("$" -> 1, "$$" -> 2, ...).
              value: `RestaurantsPriceRange2.${text.length}`,
            };
          }),
        };
      }

      const filterTitle = el.querySelector(":scope > div > div:nth-child(1) p").textContent;
      if (filterTitle !== "Distance") {
        // Other categories are only partially listed here; they are read from their modals below.
        return result;
      }

      return {
        ...result,
        distance: Array.from(el.querySelectorAll(":scope > div > div:nth-child(2) label")).map((label) => ({
          text: label.querySelector("span").textContent,
          value: label.querySelector("input").value,
        })),
      };
    }, {});
  });

  const filters = { ...priceAndDistance };
  const seeAllButtons = await page.$$("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div a");

  // `const` keeps the loop variable block-scoped. The undeclared form leaks an
  // implicit global and throws a ReferenceError in strict mode / ES modules.
  for (const button of seeAllButtons) {
    await button.click();
    await page.waitForTimeout(2000); // give the modal time to open

    const filterTitle = await page.evaluate(() =>
      document.querySelector("#modal-portal-container div[aria-modal] div[role='presentation'] h4").textContent.split(" ")[1].toLowerCase()
    );
    filters[filterTitle] = await page.evaluate(() => {
      return Array.from(document.querySelectorAll("#modal-portal-container div[aria-modal] div[role='presentation'] li")).map((item) => ({
        text: item.querySelector("span").textContent,
        value: item.querySelector("input").value,
      }));
    });

    await page.click("#modal-portal-container div[aria-modal] div[role='presentation'] button[aria-label='Close']");
    await page.waitForTimeout(2000); // give the modal time to close before the next click
  }

  return filters;
}

/**
 * Launches a browser, opens the Yelp search URL and returns the filters found on it.
 *
 * @returns {Promise<object>} The object produced by getFiltersFromPage.
 */
async function getFilters() {
  const browser = await puppeteer.launch({
    headless: true, // set this option to false if you want to see what the browser is doing
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();
    // The viewport is widened so that the filters panel is displayed.
    await page.setViewport({ width: 1600, height: 800 });
    page.setDefaultNavigationTimeout(60000); // allow up to 1 minute for navigation
    await page.goto(URL);

    return await getFiltersFromPage(page);
  } finally {
    // Close the browser even when navigation or scraping throws, so no process is left behind.
    await browser.close();
  }
}

// Print the scraped filters; report a failure and set a non-zero exit code instead of leaving the rejection unhandled.
getFilters()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

准备

首先,我们需要创建一个Node.js*项目,并添加npm包puppeteer、puppeteer-extra和puppeteer-extra-plugin-stealth,以便通过DevTools Protocol在无头(headless)或有头模式下控制Chromium(也可以是Chrome或Firefox,但这里我们只使用Chromium)。

为此,在我们项目的目录中,打开命令行并输入:

$ npm init -y

,然后:

$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth

*如果您没有安装node.js,则可以download it from nodejs.org并遵循安装documentation

注意:您也可以在不使用任何扩展的情况下使用puppeteer,但我强烈建议将其与puppeteer-extra和puppeteer-extra-plugin-stealth一起使用,以防止网站检测到您正在使用无头Chromium或web driver。您可以在Chrome headless tests website上检查这一点。下面的屏幕截图显示了差异。

stealth

Process

我们需要从HTML元素中提取数据。通过SelectorGadget Chrome extension,获得合适的CSS选择器的过程非常容易,该过程能够通过单击浏览器中的所需元素来获取CSS选择器。但是,它并不总是完美地工作,尤其是当JavaScript大量使用该网站时。

如果您想了解更多有关它们的信息,我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。

下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。

how

代码说明

从puppeteer-extra库中声明用于控制Chromium浏览器的puppeteer,并从puppeteer-extra-plugin-stealth库中声明StealthPlugin,以防止网站检测到您正在使用web driver:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

接下来,我们让puppeteer使用StealthPlugin,写下要搜索的内容(serchQuery常量)和搜索位置(location常量),对搜索参数进行URL编码(searchParams常量),并拼出搜索URL:

puppeteer.use(StealthPlugin());

const serchQuery = "pizza"; // the query you want to search
const location = "Seattle, WA"; // where you want the search to originate from

// encodeURIComponent escapes reserved characters ("&", "=", ",", ":") that
// encodeURI would leave untouched inside a query-string value.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

接下来,我们编写一个函数以从页面中获取过滤器:

async function getFiltersFromPage(page) {
  ...
}

然后,我们在页面上下文中(使用page.evaluate()方法)获取价格和距离过滤器信息,并将其保存在priceAndDistance对象中:

const priceAndDistance = await page.evaluate(() => {
    ...
});

接下来,我们需要用所有匹配"aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div"选择器的元素(document.querySelectorAll()方法)创建一个新数组(Array.from()方法),并使用reduce()方法把该数组归并成一个对象:

return Array.from(document.querySelectorAll("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div")).reduce((result, el) => {
    ...
}, {});

在reduce方法中,我们检查":scope > div > div:nth-child(2)"选择器是否不存在(使用querySelector()方法):如果不存在,则返回price过滤器(使用querySelectorAll()方法和textContent属性)。

否则(else语句),我们获得过滤类别标题,仅返回“距离”过滤器,因为其他过滤器被隐藏并仅显示其中的少数,我们稍后再得到:

if (!el.querySelector(":scope > div > div:nth-child(2)")) {
  return {
    ...result,
    price: Array.from(el.querySelectorAll(":scope > div > div:nth-child(1) button")).map((el) => {
      const text = el.querySelector("span").textContent;
      return {
        text,
        value: `RestaurantsPriceRange2.${text.length}`,
      };
    }),
  };
} else {
  const filterTitle = el.querySelector(":scope > div > div:nth-child(1) p").textContent;
  if (filterTitle === "Distance") {
    return {
      ...result,
      distance: Array.from(el.querySelectorAll(":scope > div > div:nth-child(2) label")).map((el) => ({
        text: el.querySelector("span").textContent,
        value: el.querySelector("input").value,
      })),
    };
  } else return result;
}

接下来,我们把priceAndDistance写入filters常量(使用展开语法),并使用page.$$()方法获取其他过滤器类别的“查看所有”按钮:

const filters = { ...priceAndDistance };
const seeAllButtons = await page.$$("aside[aria-labelledby='search-vertical-filter-panel-label'] > div > div a");

接下来,我们需要遍历seeAllButtons(for...of循环),单击每个按钮(click()方法),等待2秒(使用waitForTimeout()方法),获取过滤器类别标题,并把页面中的过滤器以该标题为键添加到filters对象。然后,我们单击“关闭”按钮(page.click()方法),等待2秒钟,再对其他类别重复该循环。

要从页面获取数据,我们使用下一个方法:

// `const` keeps the loop variable block-scoped; without a declaration it becomes
// an implicit global and throws a ReferenceError in strict mode / ES modules.
for (const button of seeAllButtons) {
  await button.click();
  await page.waitForTimeout(2000); // give the modal time to open
  // The category key is the second word of the modal heading, lower-cased.
  const filterTitle = await page.evaluate(() =>
    document.querySelector("#modal-portal-container div[aria-modal] div[role='presentation'] h4").textContent.split(" ")[1].toLowerCase()
  );
  filters[filterTitle] = await page.evaluate(() => {
    return Array.from(document.querySelectorAll("#modal-portal-container div[aria-modal] div[role='presentation'] li")).map((item) => ({
      text: item.querySelector("span").textContent,
      value: item.querySelector("input").value,
    }));
  });
  await page.click("#modal-portal-container div[aria-modal] div[role='presentation'] button[aria-label='Close']");
  await page.waitForTimeout(2000); // give the modal time to close before the next click
}

接下来,编写一个函数来控制浏览器并获取信息:

async function getFilters() {
  ...
}

首先,在此函数中,我们需要使用puppeteer.launch({options})方法并传入当前options来定义browser,例如headless: true和args: ["--no-sandbox", "--disable-setuid-sandbox"]。

这些选项意味着我们使用headless模式,并传入一个参数(arguments)数组,用来允许在在线IDE中启动浏览器进程。然后我们打开一个新的page,并设置页面视口分辨率(setViewport()方法)以显示过滤器面板:

const browser = await puppeteer.launch({
  headless: true, // set this option to false if you want to see what the browser is doing
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
});

const page = await browser.newPage();
// setViewport is asynchronous; await it so the new size is applied before navigating.
await page.setViewport({ width: 1600, height: 800 });

接下来,为了适应较慢的网络连接,我们使用setDefaultNavigationTimeout()方法把页面导航的默认超时时间(30秒)改为60000毫秒(1分钟),然后使用goto()方法访问URL:

await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);

最后,我们从页面上获取过滤器,关闭浏览器,然后返回收到的数据:

const filters = await getFiltersFromPage(page);

await browser.close();

return filters;

现在我们可以启动我们的解析器:

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

输出

{
   "price":[
      {
         "text":"$",
         "value":"RestaurantsPriceRange2.1"
      },
      {
         "text":"$$",
         "value":"RestaurantsPriceRange2.2"
      },
        ... and other items
   ],
   "distance":[
      {
         "text":"Bird's-eye View",
         "value":"g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282"
      },
      {
         "text":"Driving (5 mi.)",
         "value":"g:-122.38666534423828,47.590651847264034,-122.28435516357422,47.6600691664467"
      },
        ... and other items
   ],
   "categories":[
      {
         "text":"Restaurants",
         "value":"restaurants"
      },
      {
         "text":"Pizza",
         "value":"pizza"
      },
      ... and other items
   ],
   "features":[
      {
         "text":"Reservations",
         "value":"OnlineReservations"
      },
      {
         "text":"Waitlist",
         "value":"OnlineWaitlistReservation"
      },
     ... and other items
   ],
   "neighborhoods":[
      {
         "text":"Admiral",
         "value":"WA:Seattle::Admiral"
      },
      {
         "text":"Alki",
         "value":"WA:Seattle::Alki"
      },
    ... and other items
   ]
}

如何应用过滤器

您可以通过以下URL把过滤器应用于Yelp搜索,只需更改我们的Web scraping Yelp Organic Results with Nodejs和Web scraping Yelp Ads Results with Nodejs博客文章中DIY解决方案部分的searchParams常量:

const serchQuery = "pizza"; // the query you want to search
const location = "Seattle, WA"; // where you want the search to originate from
const priceAndFeaturesFilter = "RestaurantsPriceRange2.1,OnlineReservations"; // for price and features filters
const categoryFilter = "restaurants"; // for category filters
const locationFilter = "g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282"; // for neighborhoods or distance filters (distance and neighborhoods filters can't be used together)

// encodeURIComponent escapes reserved characters ("&", "=", ",", ":") that
// encodeURI would leave untouched inside a query-string value.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
  priceAndFeaturesFilter: encodeURIComponent(priceAndFeaturesFilter),
  categoryFilter: encodeURIComponent(categoryFilter),
  locationFilter: encodeURIComponent(locationFilter),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}&attrs=${searchParams.priceAndFeaturesFilter}&cflt=${searchParams.categoryFilter}&l=${searchParams.locationFilter}`;

使用SerpApi的Yelp Filters API

本节是为了显示DIY解决方案与我们的解决方案之间的比较。

最大的区别是您不需要从头开始创建解析器并维护它。

请求也有可能在某个时候被目标网站(本文中是Yelp)阻止,我们会在后端处理这种情况,因此您无需自己弄清楚如何应对,也不必纠结该使用哪家验证码服务或代理提供商。

首先,我们需要安装google-search-results-nodejs:

npm i google-search-results-nodejs

这是full code example,如果您不需要说明:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com

// Request parameters passed to search.json().
const params = {
  engine: "yelp", // search engine
  device: "desktop", // device to use to get the results: "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", // where you want the search to originate from
  find_desc: "pizza", // the query you want to search
};

// Promise adapter for the callback-style search.json(): resolves with whatever
// the library hands to its callback.
const getJson = () =>
  new Promise((onDone) => {
    search.json(params, onDone);
  });

// Fetches the search response and returns only its `filters` section.
const getResults = async () => {
  const { filters } = await getJson();
  return filters;
};

// Print the received filters; report a failure and set a non-zero exit code instead of leaving the rejection unhandled.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

代码说明

首先,我们需要从google-search-results-nodejs库中声明SerpApi,并使用SerpApi的API密钥定义新的search实例:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); // your API key from serpapi.com, read from the API_KEY environment variable

接下来,我们为提出请求的必要参数编写:

const params = {
  engine: "yelp", // search engine
  device: "desktop", //Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", //Parameter defines from where you want the search to originate.
  find_desc: "pizza", // Parameter defines the query you want to search
};

接下来,我们从Serpapi库中包装搜索方法,以便进一步处理搜索结果:

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

最后,我们声明从页面获取数据并返回它的函数getResults:

const getResults = async () => {
  ...
};

在此函数中,我们获取带有结果(results)的json,然后从接收到的json中返回filters:

const json = await getJson();
return json.filters;

之后,我们运行getResults函数,并使用console.dir方法在控制台中打印所有接收到的信息,该方法允许您通过带有必要参数的对象来更改默认输出选项:

getResults().then((result) => console.dir(result, { depth: null }));

输出

{
   "neighborhoods":{
      "value":"p:WA:Seattle::",
      "list":[
         {
            "text":"Waterfront",
            "value":"Waterfront"
         },
         {
            "text":"Fremont",
            "value":"Fremont"
         },
        ... and other items
      ]
   },
   "distance":[
      {
         "text":"Bird's-eye View",
         "value":"g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282"
      },
      {
         "text":"Driving (5 mi.)",
         "value":"g:-122.38666534423828,47.590651847264034,-122.28435516357422,47.6600691664467"
      },
        ... and other items
   ],
   "price":[
      {
         "text":"$",
         "value":"RestaurantsPriceRange2.1"
      },
      {
         "text":"$$",
         "value":"RestaurantsPriceRange2.2"
      },
        ... and other items
   ],
   "category":[
      {
         "text":"Cheesesteaks",
         "value":"cheesesteaks"
      },
      {
         "text":"Middle Eastern",
         "value":"mideastern"
      },
      ... and other items
   ],
   "features":[
      {
         "text":"Waiter Service",
         "value":"RestaurantsTableService"
      },
      {
         "text":"Open to All",
         "value":"BusinessOpenToAll"
      },
      ... and other items
   ]
}

如何应用过滤器

您可以通过更改Web scraping Yelp Organic Results with Nodejs和Web scraping Yelp Ads Results with Nodejs博客文章的SerpApi解决方案部分中的params常量,把过滤器应用于Yelp搜索:

const params = {
  engine: "yelp", // search engine
  device: "desktop", //Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", //Parameter defines from where you want the search to originate.
  find_desc: "pizza", // Parameter defines the query you want to search
  cflt: "restaurants", // for category filters
  attrs: "RestaurantsPriceRange2.1,OnlineReservations", // for price and features filters
  l: "g:-122.43782043457031,47.55614031294337,-122.23320007324219,47.69497434186282", // for neighborhoods or distance filters (distance and neighborhoods filters can't be used together)
};

链接

如果您希望在此博客文章中添加其他功能,或者想看到一些使用SerpApi制作的项目,请write me a message。


加入我们的Twitter | YouTube

添加Feature Request或Bug