使用 Node.js 抓取 Google Flights(谷歌航班)数据
#node #webscraping #google

目前,我们还没有支持从 Google Flights 页面提取数据的 API。这篇博客文章将通过下面提供的 DIY 解决方案,向您展示如何自己动手完成这件事。

将要抓取的内容

what

📌 注意:我向您展示的解决方案仅获取“单程”、“1 人”和“经济舱”选项下的航班结果。

image

如果您不需要解释,可以直接查看在线 IDE 中的完整代码示例(the full code example in the online IDE)。

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

// Search parameters that getFlightsResults() types into the Google Flights form.
const from = "Seattle"; // departure city
const to = "Las Vegas"; // destination city
const leaveDate = "5-15-2023"; // mm-dd-yyyy format

// Start page of Google Flights; the query string carries hl=en-US and curr=USD.
const URL = "https://www.google.com/travel/flights?hl=en-US&curr=USD";

/**
 * Collects the flight result cards (".pIav2d" elements) that are currently
 * rendered on the page and turns each of them into a plain object.
 *
 * The callback passed to `page.evaluate` runs inside the browser page, so it
 * must be self-contained: every helper it needs is declared inside it.
 *
 * @param {object} page - Puppeteer page that already shows the flight results.
 * @returns {Promise<Array<object>>} One object per result card with the keys
 *   thumbnail, companyName, description, duration, airportLeave, airportArive,
 *   layover, emisions, price and priceDescription. A field whose element is
 *   missing is `undefined` (thumbnail falls back to "No thumbnail", layover
 *   to "Nonstop").
 */
async function getFlightsFromPage(page) {
  return await page.evaluate(() =>
    Array.from(document.querySelectorAll(".pIav2d")).map((el) => {
      // Small accessors so that a missing element never throws.
      const textOf = (node) => node?.textContent.trim();
      const labelOf = (selector) => el.querySelector(selector)?.getAttribute("aria-label");

      // The logo URL lives inside a `url(...)` value of the style attribute.
      // Match that value directly instead of cutting at the first ";", which
      // breaks when another declaration precedes it or when "url(" is absent.
      const style = el.querySelector(".EbY4Pc")?.getAttribute("style");
      const urlMatch = style?.match(/url\(([^)]*)\)/);
      const thumbnail = urlMatch?.[1].replaceAll("\\", "") || "No thumbnail";

      // Departure and arrival airports share one selector: [0] = leave, [1] = arrive.
      const airports = el.querySelectorAll(".Ak5kof .sSHqwe .eoY5cb");

      return {
        thumbnail,
        companyName: textOf(el.querySelector(".Ir0Voe .sSHqwe")),
        description: labelOf(".mv1WYe"),
        duration: textOf(el.querySelector(".gvkrdb")),
        airportLeave: textOf(airports[0]),
        airportArive: textOf(airports[1]),
        layover: labelOf(".BbR8Ec .sSHqwe") || "Nonstop",
        // `?.` before replace: the element may exist without an aria-label.
        emisions: labelOf(".V1iAHe > div")?.replace(". Learn more about this emissions estimate", " "),
        price: textOf(el.querySelector(".U3gSDe .YMlIz > span")),
        priceDescription: labelOf(".U3gSDe .JMnxgf > span > span > span"),
      };
    })
  );
}

/**
 * Drives a headless browser through the Google Flights search form and
 * returns the scraped results.
 *
 * Reads the module-level `URL`, `from`, `to` and `leaveDate` values, selects
 * the "One way" ticket type, runs the search, expands the result list when a
 * "more" button (".XsapA") is present and delegates the extraction to
 * `getFlightsFromPage`.
 *
 * The browser is closed in every case, including when a step fails, so a
 * broken selector or a timeout does not leave a browser process behind.
 *
 * @returns {Promise<Array<object>>} Flight objects produced by getFlightsFromPage.
 * @throws {Error} When the page does not expose the two search inputs, or when
 *   any Puppeteer step (navigation, selector wait, click) fails.
 */
async function getFlightsResults() {
  // Timer-based pause between UI actions. `page.waitForTimeout` is not used
  // because it is no longer available in current Puppeteer releases.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch({
    headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  try {
    const page = await browser.newPage();
    await page.setViewport({
      width: 1280,
      height: 720,
    });

    // Navigation timeout: 60000 ms (1 minute).
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);

    await page.waitForSelector(".e5F5td");
    const inputs = await page.$$(".e5F5td");
    if (inputs.length < 2) {
      throw new Error(`Expected 2 search inputs (".e5F5td"), found ${inputs.length}`);
    }

    // type "from"
    await inputs[0].click();
    await pause(1000);
    await page.keyboard.type(from);
    await page.keyboard.press("Enter");
    // type "to"
    await inputs[1].click();
    await pause(1000);
    await page.keyboard.type(to);
    await pause(1000);
    await page.keyboard.press("Enter");
    await pause(1000);
    // type "Leave date"
    await page.click(".rIZzse .d5wCYc");
    await pause(1000);
    await page.keyboard.type(leaveDate);
    await pause(1000);
    await page.keyboard.press("Enter");
    await pause(1000);
    // choose "One way"
    await page.click(".UGrfjc .VfPpkd-RLmnJb");
    await pause(1000);
    await page.click(".VfPpkd-qPzbhe-JNdkSc > li:last-child");
    await pause(1000);
    // press "Done"
    await page.click(".A8nfpe .akjk5c  .VfPpkd-vQzf8d");
    await pause(1000);
    await page.keyboard.press("Enter");
    // press "Search"
    await pause(1000);
    await page.keyboard.press("Enter");

    await page.waitForSelector(".pIav2d");

    // Expand the list when the page offers a "more" button.
    const moreButton = await page.$(".XsapA");
    if (moreButton) {
      await moreButton.click();
      await pause(2000);
    }

    return await getFlightsFromPage(page);
  } finally {
    await browser.close();
  }
}

// Run the scraper: print the flights on success; on failure report the error
// and make the process exit with a non-zero code instead of leaving an
// unhandled promise rejection.
getFlightsResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });

准备

首先,我们需要创建一个 Node.js* 项目,并添加 npm 包 puppeteer、puppeteer-extra 和 puppeteer-extra-plugin-stealth,以便通过 DevTools Protocol 在 headless(无头)或非无头模式下控制 Chromium(也可以是 Chrome 或 Firefox,但这里我们只使用 Chromium)。

为此,在我们项目的目录中,打开命令行并输入:

$ npm init -y

,然后:

$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth

*如果您还没有安装 Node.js,可以从 nodejs.org 下载(download it from nodejs.org),并按照安装文档(documentation)进行安装。

📌 注意:您也可以只使用 puppeteer 而不加任何扩展,但我强烈建议将 puppeteer-extra 与 puppeteer-extra-plugin-stealth 搭配使用,以防止网站检测到您正在使用无头 Chromium 或 web driver。您可以在 Chrome headless tests website 上进行检查。下面的屏幕截图显示了两者的差异。

stealth

至此,我们已经完成了项目的 Node.js 环境设置,接下来进入分步代码讲解。

Process

我们需要从 HTML 元素中提取数据。借助 SelectorGadget Chrome extension,获取合适的 CSS 选择器非常容易:只需在浏览器中单击所需的元素,它就会给出对应的 CSS 选择器。但是,它并不总是能完美工作,尤其是当网站大量使用 JavaScript 时。

如果您想进一步了解 CSS 选择器,SerpApi 上有一篇专门的博客文章:Web Scraping with CSS Selectors。

下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。

how

代码说明

从 puppeteer-extra 库中声明用于控制 Chromium 浏览器的 puppeteer,并从 puppeteer-extra-plugin-stealth 库中声明 StealthPlugin,以防止网站检测到您正在使用 web driver:

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

接下来,我们对puppeteer“说”使用StealthPlugin,写出出发城(from)和目的地(to)和出发日期(leaveDate)和搜索URL:

puppeteer.use(StealthPlugin());

const from = "Seattle";
const to = "Las Vegas";
const leaveDate = "5-15-2023"; // mm-dd-yyyy format

const URL = `https://www.google.com/travel/flights?hl=en-US&curr=USD`;

接下来,我们编写一个函数,用来从页面中获取航班信息:

async function getFlightsFromPage(page) {
  ...
}

在这个函数中,我们将使用以下方法和属性来获取所需的信息:

首先,我们需要从页面上获取缩略图 URL。为此,我们从 ".EbY4Pc" 选择器对应的元素中读取包含该 URL 的 style 属性。然后,我们找到与 "url(" 匹配的 startIndex 和与 ";" 匹配的 endIndex,并据此截取出缩略图网址:

const thumbnailString = el.querySelector(".EbY4Pc")?.getAttribute("style");
const startIndex = thumbnailString?.indexOf("url(");
const endIndex = thumbnailString?.indexOf(";");
const thumbnail = thumbnailString?.slice(startIndex + 4, endIndex - 1).replaceAll("\\", "") || "No thumbnail";

接下来,我们需要获取中转(layover)信息(如果存在的话):

const layover = el.querySelector(".BbR8Ec .sSHqwe")?.getAttribute("aria-label");

然后,我们从页面中获取并返回所有航班信息:

return {
  thumbnail,
  companyName: el.querySelector(".Ir0Voe .sSHqwe")?.textContent.trim(),
  description: el.querySelector(".mv1WYe")?.getAttribute("aria-label"),
  duration: el.querySelector(".gvkrdb")?.textContent.trim(),
  airportLeave: el.querySelectorAll(".Ak5kof .sSHqwe .eoY5cb")[0]?.textContent.trim(),
  airportArive: el.querySelectorAll(".Ak5kof .sSHqwe .eoY5cb")[1]?.textContent.trim(),
  layover: layover || "Nonstop",
  emisions: el.querySelector(".V1iAHe > div")?.getAttribute("aria-label").replace(". Learn more about this emissions estimate", " "),
  price: el.querySelector(".U3gSDe .YMlIz > span")?.textContent.trim(),
  priceDescription: el.querySelector(".U3gSDe .JMnxgf > span > span > span")?.getAttribute("aria-label"),
};

接下来,我们编写一个函数来控制浏览器,并获取航班结果:

async function getFlightsResults() {
  ...
}

首先,在这个函数中,我们需要使用 puppeteer.launch({options}) 方法并传入当前的 options(例如 headless: true 和 args: ["--no-sandbox", "--disable-setuid-sandbox"])来定义 browser。

这些选项表示我们使用 headless(无头)模式,并传入一个 arguments 数组,用来允许在线 IDE 启动浏览器进程。然后我们打开一个新的 page,并把视口大小设置为 1280x720 像素:

const browser = await puppeteer.launch({
  headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
});

const page = await browser.newPage();
page.setViewport({
  width: 1280,
  height: 720,
});

接下来,我们使用 setDefaultNavigationTimeout() 方法把默认的导航超时时间(30 秒)改为 60000 毫秒(1 分钟),以适应较慢的网络连接,然后使用 goto() 方法访问 URL:

await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);

然后,我们等待 ".e5F5td" 选择器加载完成(waitForSelector() 方法),获取 from 和 to 两个输入框,依次点击它们(click() 方法)并输入出发城市和目的地城市(keyboard.type() 方法),按下 “Enter” 键(keyboard.press() 方法),然后输入 leaveDate 并选择“单程”(One way)选项:

await page.waitForSelector(".e5F5td");
const inputs = await page.$$(".e5F5td");
// type "from"
await inputs[0].click();
await page.waitForTimeout(1000);
await page.keyboard.type(from);
await page.keyboard.press("Enter");
// type "to"
await inputs[1].click();
await page.waitForTimeout(1000);
await page.keyboard.type(to);
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");
await page.waitForTimeout(1000);
// type "Leave date"
await page.click(".rIZzse .d5wCYc");
await page.waitForTimeout(1000);
await page.keyboard.type(leaveDate);
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");
await page.waitForTimeout(1000);
// choose "One way"
await page.click(".UGrfjc .VfPpkd-RLmnJb");
await page.waitForTimeout(1000);
await page.click(".VfPpkd-qPzbhe-JNdkSc > li:last-child");
await page.waitForTimeout(1000);
// press "Done"
await page.click(".A8nfpe .akjk5c  .VfPpkd-vQzf8d");
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");

填写完所有字段后,我们按下 “Enter” 键开始搜索,并等待航班结果加载。如果页面上有“显示更多”按钮,就点击它,然后把航班结果保存到 flights 常量中:

// press "Search"
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");

await page.waitForSelector(".pIav2d");

const moreButton = await page.$(".XsapA");
if (moreButton) {
  await moreButton.click();
  await page.waitForTimeout(2000);
}

const flights = await getFlightsFromPage(page);

最后,我们关闭浏览器,然后返回收到的数据:

await browser.close();

return flights;

现在我们可以启动解析器:

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

输出

[
   {
      "thumbnail":"https://www.gstatic.com/flights/airline_logos/70px/AA.png",
      "companyName":"American",
      "description":"Leaves Seattle-Tacoma International Airport at 11:55 PM on Monday, May 15 and arrives at Harry Reid International Airport at 5:54 PM on Tuesday, May 16.",
      "duration":"17 hr 59 min",
      "airportLeave":"Seattle-Tacoma International Airport",
      "airportArive":"Harry Reid International Airport",
      "layover":"Layover (1 of 1) is a 11 hr 4 min layover at Dallas/Fort Worth International Airport in Dallas.",
      "emisions":"Carbon emissions estimate: 315 kilograms. +184% emissions ",
      "price":"$318"
   },
   {
      "thumbnail":"https://www.gstatic.com/flights/airline_logos/70px/AS.png",
      "companyName":"Alaska",
      "description":"Leaves Seattle-Tacoma International Airport at 7:10 PM on Monday, May 15 and arrives at Harry Reid International Airport at 4:36 PM on Tuesday, May 16.",
      "duration":"21 hr 26 min",
      "airportLeave":"Seattle-Tacoma International Airport",
      "airportArive":"Harry Reid International Airport",
      "layover":"Layover (1 of 1) is a 17 hr 42 min overnight layover at San Francisco International Airport in San Francisco.",
      "emisions":"Carbon emissions estimate: 176 kilograms. +59% emissions ",
      "price":"$323"
   }
   ... and other flights results
]

如果您希望在这篇博客文章中补充其他功能,或者想看到一些使用 SerpApi 制作的项目,欢迎给我留言(write me a message)。


加入我们的Twitter | YouTube

提交 Feature Request💫 或 Bug🐞