使用 Node.js 抓取 Yelp 的自然搜索结果和广告结果
#node #webscraping #serpapi

将要抓取的内容

what

DIY 代码

如果您不需要解释,可以直接查看在线 IDE 中的完整代码示例:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

const serchQuery = "kfc"; // the query you want to search for
const location = "Seattle, WA"; // where the search should originate from
const resultsLimit = 50; // hardcoded limit for demonstration purpose

// These values are interpolated into the query string below, so they are escaped
// with encodeURIComponent. Unlike encodeURI, it also escapes "&", "=", "+", "#",
// "?" and ",", which would otherwise leak into the URL and corrupt or split the
// find_desc / find_loc parameters (e.g. a query such as "fish & chips").
const searchParams = {
  query: encodeURIComponent(serchQuery),
  location: encodeURIComponent(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

/**
 * Extracts the ads and organic results from the search page currently loaded in `page`.
 *
 * The list is walked in document order: a list item with an `h2` heading switches
 * the "ads" mode on when the heading contains "Sponsored" and off otherwise; every
 * result card that follows is stored in the matching array.
 *
 * Optional parts of a card (price, rating, address, snippet, thumbnail, service
 * option icons) are reported as `undefined`/`false` when absent instead of throwing,
 * so one incomplete card no longer aborts the whole page. The title link (`h3 a`)
 * is still required.
 *
 * @param {object} page - page object exposing `evaluate(fn)`; the callback is expected
 *   to run where a global `document` is available.
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>}
 */
async function getResultsFromPage(page) {
  // NOTE(review): with Puppeteer the callback is presumably serialized and executed
  // inside the browser page, so it must not reference anything declared outside of
  // it. That is why every helper lives inside the callback.
  return await page.evaluate(() => {
    const BASE_URL = "https://www.yelp.com";

    // Picks the last `srcset` candidate; the original code treats it as the best resolution.
    const parseThumbnail = (card) => {
      const srcset = card.querySelector(":scope > div > div:nth-child(1) a > img")?.getAttribute("srcset");
      if (!srcset) return undefined;
      const candidates = srcset.split(", ");
      return candidates[candidates.length - 1].split(" ")[0];
    };

    // Category links, without the trailing "more" link that is not a category.
    const parseCategories = (card) =>
      Array.from(card.querySelectorAll("p > span:nth-child(1) > a"))
        .filter((anchor) => anchor.textContent.trim() !== "more")
        .map((anchor) => ({
          title: anchor.textContent,
          link: `${BASE_URL}${anchor.getAttribute("href")}`,
        }));

    // The snippet lives in the 2nd or 3rd details row; the last 6 characters are cut off
    // (assumed to be a trailing "more" suffix, as in the original code; TODO confirm).
    const parseSnippet = (card) =>
      (
        card.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
        card.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
      )?.textContent
        .trim()
        .slice(0, -6);

    // Maps each service option label to whether its icon carries the checkmark class.
    const parseServiceOptions = (card) => {
      const options = {};
      for (const item of card.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")) {
        const label = item.querySelector("span > p")?.textContent;
        if (!label) continue; // an unlabeled entry would otherwise be stored under the key "undefined"
        options[label] = item.querySelector("div > span[role='img']")?.classList.contains("icon--16-checkmark-v2") ?? false;
      }
      return options;
    };

    // Ads link to a redirect URL; the business page is carried in its `redirect_url`
    // query parameter. Parsing the query string does not depend on parameter order.
    const buildReviewsLink = (link, isAd) => {
      if (!isAd) return `${link}#reviews`;
      const queryStart = link.indexOf("?");
      const target = queryStart === -1 ? null : new URLSearchParams(link.slice(queryStart + 1)).get("redirect_url");
      return `${target || link}#reviews`;
    };

    // Builds one result object; organic results additionally carry `price`.
    const parseCard = (card, isAd) => {
      const titleLink = card.querySelector("h3 a");
      const link = `${BASE_URL}${titleLink.getAttribute("href")}`;
      const fullAddress = card.querySelector(`p > span:${isAd ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");
      const rating = card.querySelector("span > div[role='img']")?.getAttribute("aria-label")?.split(" ")[0];
      return {
        title: titleLink.textContent,
        link,
        reviewsLink: buildReviewsLink(link, isAd),
        categories: parseCategories(card),
        ...(isAd ? {} : { price: card.querySelector("p > span:nth-child(2)")?.textContent }),
        rating,
        // Without a rating the second span is not a review counter, so it is not read.
        reviews: rating && card.querySelector("div > span:nth-child(2)")?.textContent,
        address: fullAddress?.[0],
        neighborhoods: fullAddress?.[1],
        snippet: parseSnippet(card),
        serviceOptions: parseServiceOptions(card),
        thumbnail: parseThumbnail(card),
      };
    };

    let isAds = false;
    const adsResults = [];
    const organicResults = [];
    for (const el of document.querySelectorAll("ul > li > div")) {
      const heading = el.querySelector(":scope > h2")?.textContent;
      if (heading) {
        // Section heading: decides how the following cards are classified.
        isAds = heading.includes("Sponsored");
        continue;
      }
      const card = el.querySelector("[data-testid='serp-ia-card']");
      if (!card) continue;
      (isAds ? adsResults : organicResults).push(parseCard(card, isAds));
    }
    return { adsResults, organicResults };
  });
}

/**
 * Launches a browser, opens the search URL and walks the result pages, collecting
 * ads and organic results until there is no "Next" link or the combined number of
 * collected results reaches `resultsLimit`.
 *
 * The browser is always closed, including when navigation or scraping throws.
 *
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>}
 */
async function getOrganicResults() {
  const browser = await puppeteer.launch({
    headless: true, // set to false if you want to see what the browser is doing
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();

    // Allow up to one minute for navigation.
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);

    const adsResults = [];
    const organicResults = [];

    while (true) {
      await page.waitForSelector("[data-testid='serp-ia-card']");
      const resultsFromPage = await getResultsFromPage(page);
      adsResults.push(...resultsFromPage.adsResults);
      organicResults.push(...resultsFromPage.organicResults);
      const isNextPage = await page.$("a[aria-label='Next']");
      if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;
      await page.click("a[aria-label='Next']");
      // Give the next page time to render. A plain timer is used because
      // page.waitForTimeout() is no longer available in current Puppeteer releases.
      await new Promise((resolve) => setTimeout(resolve, 3000));
    }

    return { adsResults, organicResults };
  } finally {
    // Runs on success and on failure, so no browser process is left behind.
    await browser.close();
  }
}

// Run the scraper and print the complete (untruncated) result tree.
getOrganicResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    // Report failures explicitly instead of leaving the rejection unhandled.
    console.error(error);
    process.exitCode = 1;
  });

准备

首先,我们需要创建一个 Node.js* 项目,并添加 npm 包 puppeteer、puppeteer-extra 和 puppeteer-extra-plugin-stealth,以便通过 DevTools Protocol 在无头(headless)或有头(non-headless)模式下控制 Chromium(也可以是 Chrome 或 Firefox,但这里我们只使用默认的 Chromium)。

为此,在我们项目的目录中,打开命令行并输入:

$ npm init -y

,然后:

$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth

*如果您尚未安装 Node.js,可以从 nodejs.org 下载,并按照安装文档进行操作。

注意:您也可以不借助任何扩展直接使用 puppeteer,但我强烈建议将它与 puppeteer-extra 和 puppeteer-extra-plugin-stealth 一起使用,以防止网站检测到您正在使用无头 Chromium 或 web driver。您可以在 Chrome headless tests 网站上进行检查。下面的屏幕截图显示了两者的差异。

stealth

Process

我们需要从 HTML 元素中提取数据。借助 SelectorGadget Chrome 扩展程序,获取合适的 CSS 选择器非常容易:只需在浏览器中单击所需元素,即可得到对应的 CSS 选择器。但是,它并不总是能完美工作,尤其是当网站大量使用 JavaScript 时。

如果您想进一步了解 CSS 选择器,我们在 SerpApi 上有一篇专门的博客文章:Web Scraping with CSS Selectors。

下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。

how

代码说明

从 puppeteer-extra 库中声明用于控制 Chromium 浏览器的 puppeteer,并从 puppeteer-extra-plugin-stealth 库中声明 StealthPlugin,以防止网站检测到您正在使用 web driver:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

接下来,我们“告诉”puppeteer 使用 StealthPlugin,写下要搜索的内容(serchQuery 常量)和搜索位置,设置要接收的结果数量(resultsLimit 常量),并使用 encodeURI 方法生成搜索参数,然后拼出搜索 URL:

puppeteer.use(StealthPlugin());

const serchQuery = "kfc"; //Parameter defines the query you want to search
const location = "Seattle, WA"; //Parameter defines from where you want the search to originate
const resultsLimit = 50; // hardcoded limit for demonstration purpose

const searchParams = {
  query: encodeURI(serchQuery),
  location: encodeURI(location),
};

const URL = `https://www.yelp.com/search?find_desc=${searchParams.query}&find_loc=${searchParams.location}`;

接下来,我们编写一个函数以从页面中获取有机结果:

async function getResultsFromPage(page) {
  ...
}

然后,我们从页面上下文中获取信息(使用 evaluate() 方法),并将其保存在返回的对象中:

return await page.evaluate(() => ({
    ...
}));

接下来,我们需要声明 isAds 变量来区分自然结果和广告结果,然后声明空数组 adsResults 和 organicResults,接着用所有匹配 "ul > li > div" 选择器的元素(querySelectorAll() 方法)创建一个新数组(Array.from() 方法)并对其进行遍历(forEach() 方法):

let isAds = false;
const adsResults = [];
const organicResults = [];
Array.from(document.querySelectorAll("ul > li > div")).forEach((el) => {
    ...
});

由于分区标题和结果放在页面上相同的 HTML 元素中,我们需要获取 title 和 result(使用 querySelector() 方法),并编写一些条件将 isAds 变量设置为 true 或 false(使用 includes() 方法);如果当前元素不是标题而是结果,我们就继续处理:

const title = el.querySelector(":scope > h2")?.textContent;
const result = el.querySelector("[data-testid='serp-ia-card']");
if (title && title.includes("Sponsored")) isAds = true;
if (title && !title.includes("Sponsored")) isAds = false;
if (!title && result) {
    ...
}

要构造返回的结果对象,我们需要定义 link、fullAddress、categories 和 rating,并获取所有分辨率的 thumbnails。然后取最后一个分辨率的链接——它是分辨率最高的:

const link = `https://www.yelp.com${result.querySelector("h3 a").getAttribute("href")}`;
const fullAddress = result.querySelector(`p > span:${isAds ? "last-child" : "nth-child(3)"}`)?.textContent.split(", ");
const thumbnails = result.querySelector(":scope > div > div:nth-child(1) a > img").getAttribute("srcset").split(", ");
const bestResolutionThumbnail = thumbnails[thumbnails.length - 1].split(" ")[0];
const categories = Array.from(result.querySelectorAll("p > span:nth-child(1) > a")).map((el) => {
  if (el.textContent === "more") return null;
  return {
    title: el.textContent,
    link: `https://www.yelp.com${el.getAttribute("href")}`,
  };
});
const rating = result.querySelector("span > div[role='img']")?.getAttribute("aria-label").split(" ")?.[0];

接下来,我们检查当前元素是否属于广告(isAds):如果是,就提取页面的各个部分并将其添加(push() 方法)到 adsResults 数组;否则将结果添加到 organicResults。结果通过以下方法获得:

if (isAds) {
  adsResults.push({
    title: result.querySelector("h3 a").textContent,
    link,
    reviewsLink: `${decodeURIComponent(link.slice(link.indexOf("redirect_url") + 13, link.indexOf("&request_id")))}#reviews`,
    categories: categories.filter((el) => el),
    rating,
    reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
    address: fullAddress?.[0],
    neighborhoods: fullAddress?.[1],
    snippet: (
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
    )?.textContent
      .trim()
      .slice(0, -6),
    serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
      return {
        ...result,
        [`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
      };
    }, {}),
    thumbnail: bestResolutionThumbnail,
  });
} else {
  organicResults.push({
    title: result.querySelector("h3 a").textContent,
    link,
    reviewsLink: `${link}#reviews`,
    categories: categories.filter((el) => el),
    price: result.querySelector("p > span:nth-child(2)").textContent,
    rating,
    reviews: rating && result.querySelector("div > span:nth-child(2)")?.textContent,
    address: fullAddress?.[0],
    neighborhoods: fullAddress?.[1],
    snippet: (
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(2) p") ||
      result.querySelector(":scope > div > div:nth-child(2) > div:nth-child(3) p")
    )?.textContent
      .trim()
      .slice(0, -6),
    serviceOptions: Array.from(result.querySelectorAll(":scope > div > div:nth-child(2) > div:last-child li > div > div")).reduce((result, el) => {
      return {
        ...result,
        [`${el.querySelector("span > p")?.textContent}`]: el.querySelector("div > span[role='img']").classList.contains("icon--16-checkmark-v2"),
      };
    }, {}),
    thumbnail: bestResolutionThumbnail,
  });
}

接下来,编写一个函数来控制浏览器并获取信息:

async function getOrganicResults() {
  ...
}

首先,在此函数中,我们需要使用 puppeteer.launch({options}) 方法并传入相应的 options 来定义 browser,例如 headless: true 和 args: ["--no-sandbox", "--disable-setuid-sandbox"]。

这些选项表示我们使用无头(headless)模式,并传入一个参数数组(args),用于允许在在线 IDE 中启动浏览器进程。然后我们打开一个新的 page:

const browser = await puppeteer.launch({
  headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
});

const page = await browser.newPage();

接下来,为了应对较慢的网络连接,我们使用 .setDefaultNavigationTimeout() 方法将默认的导航超时时间(30 秒)改为 60000 毫秒(1 分钟),使用 .goto() 方法访问 URL,并定义 adsResults 和 organicResults 数组:

await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);

const adsResults = [];
const organicResults = [];

接下来,我们使用 while 循环(while),在其中使用 waitForSelector 方法等待选择器加载完成,并将页面的结果添加到 adsResults 和 organicResults 数组(使用展开语法 spread syntax)。如果页面上存在下一页按钮($ 方法)并且结果数量少于 resultsLimit,我们就单击下一页按钮元素(click() 方法)并等待 3 秒(使用 waitForTimeout 方法);否则停止循环(使用 break)。

while (true) {
  await page.waitForSelector("[data-testid='serp-ia-card']");
  const resultsFromPage = await getResultsFromPage(page);
  adsResults.push(...resultsFromPage.adsResults);
  organicResults.push(...resultsFromPage.organicResults);
  const isNextPage = await page.$("a[aria-label='Next']");
  if (!isNextPage || adsResults.length + organicResults.length >= resultsLimit) break;
  await page.click("a[aria-label='Next']");
  await page.waitForTimeout(3000);
}

最后,我们关闭浏览器,然后返回收到的数据:

await browser.close();

return { adsResults, organicResults };

现在我们可以启动我们的解析器:

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

输出

{
  "adsResults": [
    {
        "title":"Pizza Hut",
        "link":"https://www.yelp.com/adredir?ad_business_id=LuJTYRXHOuNBvmH2q_Pnhw&campaign_id=6a6Jneapwf8y0J5-2OI0UQ&click_origin=search_results&placement=above_search&placement_slot=0&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fpizza-hut-seattle-5&request_id=ed15d90f0b858297&signature=9a5b8ff57fd33b93fa11a7ccd12cfbf4268e7325970c0ac43238160bf2fe4f50&slot=0",
        "reviewsLink":"https://www.yelp.com/biz/pizza-hut-seattle-5#reviews",
        "categories":[
            {
              "title":"Chicken Wings",
              "link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
            },
            {
              "title":"Pizza",
              "link":"https://www.yelp.com/search?cflt=pizza&find_loc=Seattle%2C+WA"
            },
            {
              "title":"Fast Food",
              "link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
            }
        ],
        "address":"",
        "snippet":"“We ordered a pizza and wings for carry out tonight with special instructions for how to cook the wings. When my husband picked up the order, the wings weren't right. I want to give…",
        "serviceOptions":{
            "Delivery":true,
            "Takeout":true
        },
        "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/c-BNJn-PnEQedrtt4NPvYw/1000s.jpg"
    },
    ...and other results
  ],
  "organicResults": [
    {
        "title":"KFC",
        "link":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
        "reviewsLink":"https://www.yelp.com/biz/kfc-seattle-18?osq=kfc#reviews",
        "categories":[
            {
                "title":"Fast Food",
                "link":"https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
            },
            {
                "title":"Chicken Wings",
                "link":"https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
            },
            {
                "title":"Chicken Shop",
                "link":"https://www.yelp.com/search?cflt=chickenshop&find_loc=Seattle%2C+WA"
            },
            {
                "title":"more",
                "link":"https://www.yelp.com/biz/kfc-seattle-18?hrid=wRKhf8md_ru2OgAz1mrpRg&osq=kfc"
            }
        ],
        "price":"$",
        "rating":"2",
        "reviews":"54",
        "address":"KFC - Taco Bell",
        "neighborhoods":"Lower Queen Anne",
        "snippet":"“I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
        "serviceOptions":{
            "Delivery":true,
            "Takeout":true,
            "Curbside Pickup":true
        },
        "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/1000s.jpg"
    },
    ...and other results
  ]
}

使用 SerpApi 的 Yelp Organic 和 Ads Results API

本节是为了显示DIY解决方案与我们的解决方案之间的比较。

最大的区别是您不需要从头开始创建解析器并维护它。

此外,请求有可能在某个时刻被 Yelp 阻止;我们会在后端处理这种情况,因此您无需自己想办法解决,也无需考虑该使用哪个验证码(CAPTCHA)解决方案或代理提供商。

首先,我们需要安装 google-search-results-nodejs:

npm i google-search-results-nodejs

如果您不需要解释,下面是完整的代码示例:

const SerpApi = require("google-search-results-nodejs");
// SerpApi client; the API key (from serpapi.com) is read from the API_KEY environment variable.
const search = new SerpApi.GoogleSearch(process.env.API_KEY);

// Pagination stops once this many results (ads + organic) have been collected; hardcoded for demonstration purposes.
const resultsLimit = 50;

// Request parameters passed to every search.json() call.
const params = {
  engine: "yelp", // search engine
  device: "desktop", // device used to get the results: "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", // where the search should originate from
  find_desc: "kfc", // the query to search for
};

/**
 * Promise adapter around the callback-based `search.json`: resolves with whatever
 * the library passes to its callback for the current `params`.
 *
 * @returns {Promise<object>}
 */
const getJson = () => new Promise((deliver) => search.json(params, deliver));

/**
 * Fetches result pages one after another and accumulates their ads and organic results.
 *
 * After each page that has organic results, `params.start` is advanced by 10 so the
 * next request asks for the following page. Fetching stops when a page has no
 * `organic_results` or when the combined count reaches `resultsLimit`.
 *
 * @returns {Promise<{adsResults: object[], organicResults: object[]}>}
 */
const getResults = async () => {
  const collected = { adsResults: [], organicResults: [] };
  do {
    const { organic_results: organicPage, ads_results: adsPage } = await getJson();
    if (!organicPage) break; // no organic results: nothing more to fetch
    if (adsPage) {
      collected.adsResults.push(...adsPage);
    }
    collected.organicResults.push(...organicPage);
    // Move the offset to the next page (the first advance sets it to 10).
    if (params.start) {
      params.start += 10;
    } else {
      params.start = 10;
    }
  } while (collected.adsResults.length + collected.organicResults.length < resultsLimit);
  return collected;
};

// Fetch the results and print the complete (untruncated) result tree.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    // Report failures explicitly instead of leaving the rejection unhandled.
    console.error(error);
    process.exitCode = 1;
  });

代码说明

首先,我们需要从 google-search-results-nodejs 库中声明 SerpApi,并使用您在 SerpApi 获得的 API 密钥定义新的 search 实例:

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);

接下来,我们编写用于提出请求的必要参数,并设置要接收多少结果(resultsLimit常数):

const resultsLimit = 50; // hardcoded limit for demonstration purpose

const params = {
  engine: "yelp", // search engine
  device: "desktop", //Parameter defines the device to use to get the results. It can be set to "desktop" (default), "tablet", or "mobile"
  find_loc: "Seattle, WA", //Parameter defines from where you want the search to originate.
  find_desc: "kfc", // Parameter defines the query you want to search
};

接下来,我们从Serpapi库中包装搜索方法,以便进一步处理搜索结果:

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

最后,我们声明函数 getResults,它从页面获取数据并将其返回:

const getResults = async () => {
  ...
};

在此函数中,我们需要声明空的 adsResults 和 organicResults 数组,并使用 while 循环获取 json,把每一页的 ads_results 和 organic_results 添加到对应数组中,并设置下一页的起始索引(params.start 的值)。如果页面上没有更多结果,或者已收到的结果数量不少于 resultsLimit,我们就停止循环(使用 break),并返回包含结果的对象:

const adsResults = [];
const organicResults = [];
while (true) {
  const json = await getJson();
  if (json.organic_results) {
    if (json.ads_results) {
      adsResults.push(...json.ads_results);
    }
    organicResults.push(...json.organic_results);
    params.start ? (params.start += 10) : (params.start = 10);
  } else break;
  if (adsResults.length + organicResults.length >= resultsLimit) break;
}
return { adsResults, organicResults };

之后,我们运行 getResults 函数,并使用 console.dir 方法在控制台中打印所有收到的信息;该方法允许您通过带有必要参数的对象来更改默认输出选项:

getResults().then((result) => console.dir(result, { depth: null }));

输出

{
  "adsResults": [
    {
      "block_position":"top",
      "place_ids":[
          "ThGZdWIyNOXUeTqMWRmVlw",
          "dudez-woodfired-pizza-seattle"
      ],
      "title":"DUDE’Z woodfired pizza",
      "link":"https://www.yelp.com/adredir?ad_business_id=ThGZdWIyNOXUeTqMWRmVlw&campaign_id=KjSNa2u5Q-4tz8JKZAiYvg&click_origin=search_results&placement=above_search&placement_slot=0&redirect_url=https%3A%2F%2Fwww.yelp.com%2Fbiz%2Fdudez-woodfired-pizza-seattle&request_id=e3869f3c027b5193&signature=bbf93e3aaaae7762d2435d05e5fefee95c31e33c01f9b2b8aad85c660d7d5cfc&slot=0",
      "reviews_link":"https://serpapi.com/search.json?engine=yelp_reviews&place_id=ThGZdWIyNOXUeTqMWRmVlw",
      "categories":[
          {
            "title":"Pizza",
            "link":"https://www.yelp.com/search?cflt=pizza&find_loc=Seattle%2C+WA"
          }
      ],
      "rating":5,
      "reviews":1,
      "neighborhoods":"Cottage Grove",
      "phone":"(360) 803-1616",
      "snippet":"These pizzas are so delicious! The guys really take care of their customers. We ordered the Carne Asada Pizza and the awesome G Pop pizza (think jalapeño poppers in the shape of a…",
      "service_options":{
          "outdoor_seating":false,
          "delivery":true,
          "takeout":true
      },
      "thumbnail":"https://s3-media0.fl.yelpcdn.com/bphoto/cJynIXUZp0OWhSdW3AUoaw/348s.jpg"
    },
    ...and other results
  ],
  "organicResults":
  [
    {
      "position": 1,
      "place_ids": ["UON0MxZGG0cgsU5LYPjJbg", "kfc-seattle-18"],
      "title": "KFC",
      "link": "https://www.yelp.com/biz/kfc-seattle-18?osq=kfc",
      "reviews_link": "https://serpapi.com/search.json?engine=yelp_reviews&place_id=UON0MxZGG0cgsU5LYPjJbg",
      "categories": [
        {
          "title": "Fast Food",
          "link": "https://www.yelp.com/search?cflt=hotdogs&find_loc=Seattle%2C+WA"
        },
        {
          "title": "Chicken Wings",
          "link": "https://www.yelp.com/search?cflt=chicken_wings&find_loc=Seattle%2C+WA"
        },
        {
          "title": "Chicken Shop",
          "link": "https://www.yelp.com/search?cflt=chickenshop&find_loc=Seattle%2C+WA"
        }
      ],
      "price": "$",
      "rating": 2,
      "reviews": 54,
      "address": "210 W Mercer St",
      "neighborhoods": "Lower Queen Anne",
      "phone": "(206) 283-7575",
      "snippet": "I have tried KFC much in India and it was first time in WA, Usa. It was good taste however not as good as Indian taste of KFC.",
      "service_options": {
        "delivery": true,
        "takeout": true,
        "curbside_pickup": true
      },
      "thumbnail": "https://s3-media0.fl.yelpcdn.com/bphoto/jrHdimlo2BO8wL49HXNDcQ/348s.jpg"
    },
    ...and other results
  ]
}

链接

如果您希望在这篇博客文章中添加其他功能,或者想看到一些使用 SerpApi 完成的项目,请给我留言。


加入我们的Twitter | YouTube

添加 Feature Request 或报告 Bug