使用 Node.js 抓取 Yelp 自然搜索结果和广告结果 - DEV365 开发者社区

将要抓取的内容

diy代码

如果您不需要解释，可以直接查看在线 IDE 中的完整代码示例。

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

// Search configuration. The identifier `serchQuery` keeps its original spelling
// because the rest of the article refers to it by that name.
const serchQuery = "kfc"; // the query you want to search
const location = "Seattle, WA"; // where the search should originate from
const resultsLimit = 50; // hardcoded limit for demonstration purpose

// Query-string values must be escaped with encodeURIComponent: encodeURI leaves
// reserved characters such as "&", "=", "+", "#" and "," untouched, so a query
// like "A&W" would silently inject an extra parameter into the URL.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

/**
 * Extracts sponsored (ads) and organic results from the Yelp search page that is
 * currently loaded in `page`.
 *
 * The list on the page interleaves section headings (`h2`) with result cards; a
 * heading containing "Sponsored" marks the cards that follow it as ads until the
 * next heading appears.
 *
 * Missing optional pieces of a card (price, rating, thumbnail, address, snippet,
 * service icons) yield `undefined`/empty values instead of throwing, so a single
 * incomplete card no longer aborts scraping of the whole page.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate` is used).
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>}
 */
async function getResultsFromPage(page) {
  // The callback is executed via page.evaluate, so every helper it relies on is
  // defined inside the callback itself rather than at module level.
  return await page.evaluate(() => {
    const YELP_ORIGIN = "https://www.yelp.com";
    const DETAILS_COLUMN = ":scope > div > div:nth-child(2)";

    // Category links of a card, without the trailing "more" pseudo-category.
    const getCategories = (card) =>
      Array.from(card.querySelectorAll("p > span:nth-child(1) > a"))
        .filter((anchor) => anchor.textContent.trim() !== "more")
        .map((anchor) => ({
          title: anchor.textContent,
          link: `${YELP_ORIGIN}${anchor.getAttribute("href")}`,
        }));

    // URL of the last srcset candidate, which the original code treats as the best resolution.
    const getThumbnail = (card) => {
      const candidates = card.querySelector(":scope > div > div:nth-child(1) a > img")?.getAttribute("srcset")?.split(", ");
      return candidates?.[candidates.length - 1].split(" ")[0];
    };

    // Ad links are redirects; the business page is carried in the "redirect_url" query parameter.
    const getAdReviewsLink = (adLink) => {
      const target = new URLSearchParams(adLink.slice(adLink.indexOf("?") + 1)).get("redirect_url");
      return `${target || adLink}#reviews`;
    };

    // Review snippet; the last 6 characters are dropped (presumably a trailing "more" label — confirm against the page).
    const getSnippet = (card) =>
      (card.querySelector(`${DETAILS_COLUMN} > div:nth-child(2) p`) || card.querySelector(`${DETAILS_COLUMN} > div:nth-child(3) p`))?.textContent
        .trim()
        .slice(0, -6);

    // Maps each service label (e.g. "Delivery") to whether its icon is a checkmark.
    const getServiceOptions = (card) => {
      const options = {};
      card.querySelectorAll(`${DETAILS_COLUMN} > div:last-child li > div > div`).forEach((item) => {
        const label = item.querySelector("span > p")?.textContent;
        if (!label) return; // an unlabeled entry would otherwise be stored under the key "undefined"
        options[label] = item.querySelector("div > span[role='img']")?.classList.contains("icon--16-checkmark-v2") ?? false;
      });
      return options;
    };

    const adsResults = [];
    const organicResults = [];
    let isAds = false;

    for (const el of Array.from(document.querySelectorAll("ul > li > div"))) {
      const heading = el.querySelector(":scope > h2")?.textContent;
      if (heading) {
        // A section heading switches the mode for all cards that follow it.
        isAds = heading.includes("Sponsored");
        continue;
      }

      const card = el.querySelector("[data-testid='serp-ia-card']");
      const titleAnchor = card?.querySelector("h3 a");
      if (!titleAnchor) continue; // not a result card (or a card without a title link)

      const link = `${YELP_ORIGIN}${titleAnchor.getAttribute("href")}`;
      const fullAddress = card.querySelector(`p > span:${isAds ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");
      const rating = card.querySelector("span > div[role='img']")?.getAttribute("aria-label")?.split(" ")[0];

      // Split in two parts so that the key order of the original output is preserved
      // ("price" sits between "categories" and "rating" for organic results).
      const leadingFields = {
        title: titleAnchor.textContent,
        link,
        reviewsLink: isAds ? getAdReviewsLink(link) : `${link}#reviews`,
        categories: getCategories(card),
      };
      const trailingFields = {
        rating,
        reviews: rating && card.querySelector("div > span:nth-child(2)")?.textContent,
        address: fullAddress?.[0],
        neighborhoods: fullAddress?.[1],
        snippet: getSnippet(card),
        serviceOptions: getServiceOptions(card),
        thumbnail: getThumbnail(card),
      };

      if (isAds) {
        adsResults.push({ ...leadingFields, ...trailingFields });
      } else {
        organicResults.push({
          ...leadingFields,
          // Price is only collected for organic results; it may be absent on the card.
          price: card.querySelector("p > span:nth-child(2)")?.textContent,
          ...trailingFields,
        });
      }
    }

    return { adsResults, organicResults };
  });
}

/**
 * Launches a browser through puppeteer, opens the Yelp search URL and walks
 * through the result pages, collecting ads and organic results until there is
 * no "Next" link or at least `resultsLimit` results have been gathered.
 *
 * The browser is always closed, even when navigation or scraping fails.
 *
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>}
 */
async function getOrganicResults() {
  const browser = await puppeteer.launch({
    headless: true, // if you want to see what the browser is doing, change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();

    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);

    const adsResults = [];
    const organicResults = [];

    while (true) {
      await page.waitForSelector("[data-testid='serp-ia-card']");
      const resultsFromPage = await getResultsFromPage(page);
      adsResults.push(...resultsFromPage.adsResults);
      organicResults.push(...resultsFromPage.organicResults);
      const isNextPage = await page.$("a[aria-label='Next']");
      if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;
      await page.click("a[aria-label='Next']");
      // page.waitForTimeout() is deprecated and absent from newer Puppeteer
      // releases, so the pause between pages uses a plain timer instead.
      await new Promise((resolve) => setTimeout(resolve, 3000));
    }

    return { adsResults, organicResults };
  } finally {
    // Without this, any thrown error would leave the browser process running.
    await browser.close();
  }
}

// Run the scraper and print the full result tree; report failures instead of
// leaving an unhandled promise rejection.
getOrganicResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

准备

首先，我们需要创建一个 Node.js* 项目，并添加 npm 包 puppeteer、puppeteer-extra 和 puppeteer-extra-plugin-stealth，以便通过 DevTools Protocol 在 headless（无头）或非 headless 模式下控制 Chromium（也可以是 Chrome 或 Firefox，但本文只使用 Chromium）。

为此，在我们项目的目录中，打开命令行并输入：

$ npm init -y

，然后：

$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth

*如果您尚未安装 Node.js，可以从 nodejs.org 下载，并按照安装文档进行安装。

📌注意：您也可以不借助任何扩展直接使用 puppeteer，但我强烈建议将 puppeteer-extra 与 puppeteer-extra-plugin-stealth 搭配使用，以防止网站检测到您正在使用无头 Chromium 或 web driver。您可以在 Chrome headless tests 网站上进行检查。下面的屏幕截图显示了差异。

Process

我们需要从HTML元素中提取数据。通过SelectorGadget Chrome extension，获得合适的CSS选择器的过程非常容易，该过程能够通过单击浏览器中的所需元素来获取CSS选择器。但是，它并不总是完美地工作，尤其是当JavaScript大量使用该网站时。

如果您想了解更多有关它们的信息，我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。

下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。

代码说明

从 puppeteer-extra 库中声明用于控制 Chromium 浏览器的 puppeteer，并从 puppeteer-extra-plugin-stealth 库中声明 StealthPlugin，以防止网站检测到您正在使用 web driver：

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

接下来，我们让 puppeteer 使用 StealthPlugin，写下要搜索的内容（serchQuery 常量）和搜索位置，设置希望获得的结果数量（resultsLimit 常量），对搜索参数进行 URI 编码，并拼接出搜索 URL：

puppeteer.use(StealthPlugin());

const serchQuery = "kfc"; // the query you want to search
const location = "Seattle, WA"; // where the search should originate from
const resultsLimit = 50; // hardcoded limit for demonstration purpose

// encodeURIComponent (unlike encodeURI) also escapes "&", "=", "+", "#" and ",",
// which is required for values placed inside a query string.
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

接下来，我们编写一个函数以从页面中获取有机结果：

async function getResultsFromPage(page) {
  ...
}

然后，我们在页面上下文中（使用 evaluate() 方法）获取信息，并将其保存在返回的对象中：

return await page.evaluate(() => ({
    ...
}));

接下来，我们声明 isAds 变量来区分自然结果和广告结果，再声明空数组 adsResults 和 organicResults，然后用 querySelectorAll() 方法取得所有匹配 "ul > li > div" 选择器的元素，用 Array.from() 方法将其转换为数组，并用 forEach() 方法逐个遍历：

let isAds = false;
const adsResults = [];
const organicResults = [];
Array.from(document.querySelectorAll("ul > li > div")).forEach((el) => {
    ...
});

由于分组标题和结果位于页面上同一类 HTML 元素中，我们需要取得标题和结果（使用 querySelector() 方法），并根据标题是否包含 "Sponsored"（使用 includes() 方法）把 isAds 变量设为 true 或 false；如果当前元素不是标题而是结果，则继续往下处理：

const title = el.querySelector(":scope > h2")?.textContent;
const result = el.querySelector("[data-testid='serp-ia-card']");
if (title && title.includes("Sponsored")) isAds = true;
if (title && !title.includes("Sponsored")) isAds = false;
if (!title && result) {
    ...
}

要使返回的结果对象我们需要定义link，fullAddress，categories，rating并在所有分辨率中获取thumbnails。然后，我们获得了最后一个分辨率链接 - 这是最好的：

const link = `https://www.yelp.com${result.querySelector("h3 a").getAttribute("href")}`;
const fullAddress = result.querySelector(`p > span:${isAds ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");
const thumbnails = result.querySelector(":scope > div > div:nth-child(1) a > img").getAttribute("srcset").split(", ");
const bestResolutionThumbnail = thumbnails[thumbnails.length - 1].split(" ")[0];
const categories = Array.from(result.querySelectorAll("p > span:nth-child(1) > a")).map((el) => {
  if (el.textContent === "more") return null;
  return {
    title: el.textContent,
    link: `https://www.yelp.com${el.getAttribute("href")}`,
  };
});
const rating = result.querySelector("span > div[role='img']")?.getAttribute("aria-label").split(" ")?.[0];

接下来，我们检查当前元素是否属于广告（isAds）：如果是，就把从页面各部分取得的数据添加（push() 方法）到 adsResults 数组，否则添加到 organicResults 数组。各字段可以通过以下方式获得：

if (isAds) {
  adsResults.push({
    title: result.querySelector("h3 a").textContent,
    link,
    reviewsLink: `${decodeURIComponent(link.slice(link.indexOf("redirect_url") + 13, link.indexOf("&request_id")))}#reviews`,
    categories: categories.filter((el) => el),
    rating,
    reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
    address: fullAddress?.[0],
    neighborhoods: fullAddress?.[1],
    snippet: (
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
    )?.textContent
      .trim()
      .slice(0, -6),
    serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
      return {
        ...result,
        [`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
      };
    }, {}),
    thumbnail: bestResolutionThumbnail,
  });
} else {
  organicResults.push({
    title: result.querySelector("h3 a").textContent,
    link,
    reviewsLink: `${link}#reviews`,
    categories: categories.filter((el) => el),
    price: result.querySelector("p > span:nth-child(2)").textContent,
    rating,
    reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
    address: fullAddress?.[0],
    neighborhoods: fullAddress?.[1],
    snippet: (
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
    )?.textContent
      .trim()
      .slice(0, -6),
    serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
      return {
        ...result,
        [`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
      };
    }, {}),
    thumbnail: bestResolutionThumbnail,
  });
}

接下来，编写一个函数来控制浏览器并获取信息：

async function getOrganicResults() {
  ...
}

首先，在此功能中，我们需要使用带有当前options的puppeteer.launch({options})方法来定义browser，例如headless: true和args: ["--no-sandbox", "--disable-setuid-sandbox"]。

这些选项表示我们使用 headless 模式，并传入一个参数（arguments）数组，用于允许在在线 IDE 中启动浏览器进程。然后我们打开一个新的 page：

const browser = await puppeteer.launch({
  headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
});

const page = await browser.newPage();

接下来，为了适应较慢的网络连接，我们用 setDefaultNavigationTimeout() 方法把默认的导航等待时间（30 秒）改为 60000 毫秒（1 分钟），用 goto() 方法访问 URL，并定义用于保存结果的数组：

await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);

// Two separate accumulators, matching what the pagination loop pushes into.
const adsResults = [];
const organicResults = [];

接下来，我们使用 while 循环：在循环中用 waitForSelector() 方法等待选择器加载完成，用 push() 方法和展开语法（...）把当前页面的结果添加到 adsResults 和 organicResults 数组；如果页面上存在“下一页”按钮（用 $() 方法检查）且结果数量少于 resultsLimit，就点击该按钮（click() 方法）并等待 3 秒，否则停止循环（使用 break）。

while (true) {
  await page.waitForSelector("[data-testid='serp-ia-card']");
  const resultsFromPage = await getResultsFromPage(page);
  adsResults.push(...resultsFromPage.adsResults);
  organicResults.push(...resultsFromPage.organicResults);
  const isNextPage = await page.$("a[aria-label='Next']");
  if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;
  await page.click("a[aria-label='Next']");
  // page.waitForTimeout() is deprecated and absent from newer Puppeteer releases; use a plain timer.
  await new Promise((resolve) => setTimeout(resolve, 3000));
}

最后，我们关闭浏览器，然后返回收到的数据：

await browser.close();

return { adsResults, organicResults };

现在我们可以启动我们的解析器：

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

输出

{
  "adsResults": [
    {
        "title":"Pizza Hut",
        "link":"https://www.yelp.com/adredir?ad_business_id=LuJTYRXHOuNBvmH2q_Pnhw&campaign_id=6a6Jneapwf8y0J5-2OI0UQ&click_origin=search_results&placement=above_search&placement_slot=0&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fpizza-hut-seattle-5&request_id=ed15d90f0b858297&signature=9a5b8ff57fd33b93fa11a7ccd12cfbf4268e7325970c0ac43238160bf2fe4f50&slot=0",
        "reviewsLink":"https://www.yelp.com/biz/pizza-hut-seattle-5#reviews",
        "categories":[
            {
              "title":"Chicken Wings",
              "link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
            },
            {
              "title":"Pizza",
              "link":"https://www.yelp.com/search?cflt=pizza&find_loc=Seattle%2C+WA"
            },
            {
              "title":"Fast Food",
              "link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
            }
        ],
        "address":"",
        "snippet":"“We ordered a pizza and wings for carry out tonight with special instructions for how to cook the wings. When my husband picked up the order, the wings weren't right. I want to give…",
        "serviceOptions":{
            "Delivery":true,
            "Takeout":true
        },
        "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/c-BNJn-PnEQedrtt4NPvYw/1000s.jpg"
    },
    ...and other results
  ],
  "organicResults": [
    {
        "title":"KFC",
        "link":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
        "reviewsLink":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc#reviews",
        "categories":[
            {
                "title":"Fast Food",
                "link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
            },
            {
                "title":"Chicken Wings",
                "link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
            },
            {
                "title":"Chicken Shop",
                "link":"https://www.yelp.com/search?cflt=chickenshop&find_loc=Seattle%2C+WA"
            },
            {
                "title":"more",
                "link":"https://www.yelp.com/biz/kfc-seattle-18?hrid=wRKhf8md_ru2OgAz1mrpRg&osq=kfc"
            }
        ],
        "price":"$",
        "rating":"2",
        "reviews":"54",
        "address":"KFC - Taco Bell",
        "neighborhoods":"Lower Queen Anne",
        "snippet":"“I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
        "serviceOptions":{
            "Delivery":true,
            "Takeout":true,
            "Curbside Pickup":true
        },
        "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/1000s.jpg"
    },
    ...and other results
  ]
}

使用来自 SerpApi 的 Yelp 自然结果和广告结果 API

本节是为了显示DIY解决方案与我们的解决方案之间的比较。

最大的区别是您不需要从头开始创建解析器并维护它。

请求也有可能在某个时候被 Yelp 阻止，我们会在后端处理这种情况，因此您无需自己弄清楚如何应对，也无需考虑该使用哪种验证码解决方案或代理提供商。

首先，我们需要安装 google-search-results-nodejs：

npm i google-search-results-nodejs

如果您不需要解释，下面是完整的代码示例：

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com

// Stop paginating once ads + organic results reach this count.
const resultsLimit = 50; // hardcoded limit for demonstration purpose

// Request parameters passed to search.json(); getResults also writes the
// pagination offset into this object as `params.start`.
const params = {
  engine: "yelp", // search engine
  device: "desktop", // device used to get the results: "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", // where the search should originate from
  find_desc: "kfc", // the query you want to search
};

/**
 * Promise adapter for the callback-based `search.json` call.
 *
 * @returns {Promise<object>} resolves with the value `search.json` passes to
 *   its callback for the current module-level `params`.
 */
const getJson = () =>
  new Promise((deliver) => {
    search.json(params, deliver);
  });

/**
 * Fetches result pages one after another and accumulates their ads and organic
 * results. Paging stops when a response has no `organic_results` or when the
 * combined number of collected results reaches `resultsLimit`.
 *
 * Side effect: advances `params.start` by 10 after every page that had results.
 *
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>}
 */
const getResults = async () => {
  const adsResults = [];
  const organicResults = [];

  do {
    const { organic_results: organic, ads_results: ads } = await getJson();
    if (!organic) break;

    if (ads) {
      adsResults.push(...ads);
    }
    organicResults.push(...organic);

    // First page has no offset yet, so a missing/zero start becomes 10.
    params.start = (params.start || 0) + 10;
  } while (adsResults.length + organicResults.length < resultsLimit);

  return { adsResults, organicResults };
};

// Run the search and print the full result tree; report failures instead of
// leaving an unhandled promise rejection.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

代码说明

首先，我们需要从 google-search-results-nodejs 库中声明 SerpApi，并使用您在 SerpApi 的 API 密钥创建新的 search 实例：

const SerpApi = require("google-search-results-nodejs");
// Read the key from the environment, as in the full example; a bare API_KEY identifier is not defined anywhere.
const search = new SerpApi.GoogleSearch(process.env.API_KEY); // your API key from serpapi.com

接下来，我们编写用于提出请求的必要参数，并设置要接收多少结果（resultsLimit常数）：

const resultsLimit = 50; // hardcoded limit for demonstration purpose

const params = {
  engine: "yelp", // search engine
  device: "desktop", //Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", //Parameter defines from where you want the search to originate.
  find_desc: "kfc", // Parameter defines the query you want to search
};

接下来，我们从Serpapi库中包装搜索方法，以便进一步处理搜索结果：

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

最后，我们声明从页面获取数据并将其返回的函数 getResults：

const getResults = async () => {
  ...
};

在这个函数中，我们声明空数组 adsResults 和 organicResults，并在 while 循环中获取 json，把每一页的 ads_results 和 organic_results 添加到对应的数组，然后设置下一页的起始索引（即 params.start 的值）。如果页面上没有更多结果，或者已获得的结果数量不少于 resultsLimit，我们就停止循环（使用 break），并返回包含结果的对象：

const adsResults = [];
const organicResults = [];
while (true) {
  const json = await getJson();
  if (json.organic_results) {
    if (json.ads_results) {
      adsResults.push(...json.ads_results);
    }
    organicResults.push(...json.organic_results);
    params.start ? (params.start += 10) : (params.start = 10);
  } else break;
  if (adsResults.length + organicResults.length >= resultsLimit) break;
}
return { adsResults, organicResults };

之后，我们运行 getResults 函数，并用 console.dir 方法在控制台打印所有获得的信息；该方法允许通过带有必要参数的对象来更改默认输出选项：

getResults().then((result) => console.dir(result, { depth: null }));

输出

{
  "adsResults": [
    {
      "block_position":"top",
      "place_ids":[
          "ThGZdWIyNOXUeTqMWRmVlw",
          "dudez-woodfired-pizza-seattle"
      ],
      "title":"DUDE’Z woodfired pizza",
      "link":"https://www.yelp.com/adredir?ad_business_id=ThGZdWIyNOXUeTqMWRmVlw&amp;campaign_id=KjSNa2u5Q-4tz8JKZAiYvg&amp;click_origin=search_results&amp;placement=above_search&amp;placement_slot=0&amp;redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fdudez-woodfired-pizza-seattle&amp;request_id=e3869f3c027b5193&amp;signature=bbf93e3aaaae7762d2435d05e5fefee95c31e33c01f9b2b8aad85c660d7d5cfc&amp;slot=0",
      "reviews_link":"https://serpapi.com/search.json?engine=yelp_reviews&place_id=ThGZdWIyNOXUeTqMWRmVlw",
      "categories":[
          {
            "title":"Pizza",
            "link":"https://www.yelp.com/search?cflt=pizza&amp;find_loc=Seattle%2C+WA"
          }
      ],
      "rating":5,
      "reviews":1,
      "neighborhoods":"Cottage Grove",
      "phone":"(360) 803-1616",
      "snippet":"These pizzas are so delicious! The guys really take care of their customers. We ordered the Carne Asada Pizza and the awesome G Pop pizza (think jalapeño poppers in the shape of a…",
      "service_options":{
          "outdoor_seating":false,
          "delivery":true,
          "takeout":true
      },
      "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/cJynIXUZp0OWhSdW3AUoaw/348s.jpg"
    },
    ...and other results
  ],
  "organicResults":
  [
    {
      "position": 1,
      "place_ids": ["UON0MxZGG0cgsU5LYPjJbg", "kfc-seattle-18"],
      "title": "KFC",
      "link": "https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
      "reviews_link": "https://serpapi.com/search.json?engine=yelp_reviews&place_id=UON0MxZGG0cgsU5LYPjJbg",
      "categories": [
        {
          "title": "Fast Food",
          "link": "https://www.yelp.com/search?cflt=hotdogs&amp;find_loc=Seattle%2C+WA"
        },
        {
          "title": "Chicken Wings",
          "link": "https://www.yelp.com/search?cflt=chicken_wings&amp;find_loc=Seattle%2C+WA"
        },
        {
          "title": "Chicken Shop",
          "link": "https://www.yelp.com/search?cflt=chickenshop&amp;find_loc=Seattle%2C+WA"
        }
      ],
      "price": "$",
      "rating": 2,
      "reviews": 54,
      "address": "210 W Mercer St",
      "neighborhoods": "Lower Queen Anne",
      "phone": "(206) 283-7575",
      "snippet": "I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
      "service_options": {
        "delivery": true,
        "takeout": true,
        "curbside_pickup": true
      },
      "thumbnail": "https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/348s.jpg"
    },
    ...and other results
  ]
}

链接

如果您希望本文补充其他内容，或者希望看到用 SerpApi 完成的某些项目，请给我留言。

加入我们的Twitter | YouTube

添加一个 Feature Request💫 或 Bug🐞