目前，我们还没有支持从 Google Flights 页面提取数据的 API。这篇博客文章将通过下面提供的 DIY 解决方案，向您展示如何自己动手实现。
将要抓取的内容
📌注意：我向您展示的解决方案仅获取“单程”、“1 人”和“经济舱”选项的航班结果。
如果您不需要解释，可以直接查看在线 IDE 中的完整代码示例：
// puppeteer-extra wraps Puppeteer so that plugins can be registered on it.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
// Register the stealth plugin before any browser is launched.
puppeteer.use(StealthPlugin());
// Search parameters: these strings are typed into the search form by getFlightsResults().
const from = "Seattle"; // typed into the first ".e5F5td" input
const to = "Las Vegas"; // typed into the second ".e5F5td" input
const leaveDate = "5-15-2023"; // mm-dd-yyyy format
// Start page opened with page.goto(); the query string presumably selects the English UI and USD prices.
const URL = `https://www.google.com/travel/flights?hl=en-US&curr=USD`;
/**
 * Collects every flight card (".pIav2d") currently rendered on the page.
 *
 * The callback passed to `page.evaluate` runs inside the browser context, so it
 * must be self-contained: it cannot reference variables from this module.
 *
 * @param {object} page - Puppeteer page (anything exposing `evaluate(fn)`).
 * @returns {Promise<Array<object>>} One plain object per flight card with the keys
 *   `thumbnail`, `companyName`, `description`, `duration`, `airportLeave`,
 *   `airportArive`, `layover`, `emisions`, `price` and `priceDescription`.
 *   Fields whose element or attribute is missing are `undefined`, except
 *   `thumbnail` ("No thumbnail") and `layover` ("Nonstop").
 */
async function getFlightsFromPage(page) {
  return await page.evaluate(() =>
    Array.from(document.querySelectorAll(".pIav2d")).map((el) => {
      // The logo URL lives inside the inline style as `url(...)`. Match that token
      // directly instead of slicing up to the first ";" of the whole style string,
      // which breaks when another declaration precedes it or "url(" is absent.
      const thumbnailString = el.querySelector(".EbY4Pc")?.getAttribute("style");
      const urlMatch = thumbnailString?.match(/url\(([^)]*)\)/);
      const thumbnail =
        urlMatch?.[1]
          .replaceAll("\\", "") // the style value may carry CSS escapes such as "https\:\/\/"
          .trim()
          .replace(/^["']|["']$/g, "") || "No thumbnail";

      const layover = el.querySelector(".BbR8Ec .sSHqwe")?.getAttribute("aria-label");

      // Query the airport nodes once: index 0 is departure, index 1 is arrival.
      const airports = el.querySelectorAll(".Ak5kof .sSHqwe .eoY5cb");

      return {
        thumbnail,
        companyName: el.querySelector(".Ir0Voe .sSHqwe")?.textContent.trim(),
        description: el.querySelector(".mv1WYe")?.getAttribute("aria-label"),
        duration: el.querySelector(".gvkrdb")?.textContent.trim(),
        airportLeave: airports[0]?.textContent.trim(),
        airportArive: airports[1]?.textContent.trim(),
        layover: layover || "Nonstop",
        // getAttribute() returns null when the attribute is missing, so guard the
        // .replace() call as well as the element lookup.
        emisions: el
          .querySelector(".V1iAHe > div")
          ?.getAttribute("aria-label")
          ?.replace(". Learn more about this emissions estimate", " "),
        price: el.querySelector(".U3gSDe .YMlIz > span")?.textContent.trim(),
        priceDescription: el.querySelector(".U3gSDe .JMnxgf > span > span > span")?.getAttribute("aria-label"),
      };
    })
  );
}
/**
 * Launches a browser, fills in the Google Flights search form ("from", "to",
 * leave date, "One way"), runs the search and returns the scraped flights.
 *
 * The browser is always closed, even when one of the steps throws.
 *
 * @returns {Promise<Array<object>>} Flights as returned by getFlightsFromPage().
 * @throws Rejects with whatever error Puppeteer raises (e.g. a selector timeout).
 */
async function getFlightsResults() {
  const browser = await puppeteer.launch({
    headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    // setViewport returns a promise; await it so the size is applied before navigating.
    await page.setViewport({
      width: 1280,
      height: 720,
    });
    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);
    await page.waitForSelector(".e5F5td");
    const inputs = await page.$$(".e5F5td");
    // NOTE(review): page.waitForTimeout appears to have been removed in recent
    // Puppeteer releases - confirm against the installed version.
    // type "from"
    await inputs[0].click();
    await page.waitForTimeout(1000);
    await page.keyboard.type(from);
    await page.keyboard.press("Enter");
    // type "to"
    await inputs[1].click();
    await page.waitForTimeout(1000);
    await page.keyboard.type(to);
    await page.waitForTimeout(1000);
    await page.keyboard.press("Enter");
    await page.waitForTimeout(1000);
    // type "Leave date"
    await page.click(".rIZzse .d5wCYc");
    await page.waitForTimeout(1000);
    await page.keyboard.type(leaveDate);
    await page.waitForTimeout(1000);
    await page.keyboard.press("Enter");
    await page.waitForTimeout(1000);
    // choose "One way"
    await page.click(".UGrfjc .VfPpkd-RLmnJb");
    await page.waitForTimeout(1000);
    await page.click(".VfPpkd-qPzbhe-JNdkSc > li:last-child");
    await page.waitForTimeout(1000);
    // press "Done"
    await page.click(".A8nfpe .akjk5c .VfPpkd-vQzf8d");
    await page.waitForTimeout(1000);
    await page.keyboard.press("Enter");
    // press "Search"
    await page.waitForTimeout(1000);
    await page.keyboard.press("Enter");
    await page.waitForSelector(".pIav2d");
    // The ".XsapA" button is optional: click it only when it is present.
    const moreButton = await page.$(".XsapA");
    if (moreButton) {
      await moreButton.click();
      await page.waitForTimeout(2000);
    }
    return await getFlightsFromPage(page);
  } finally {
    // Release the browser process regardless of success or failure.
    await browser.close();
  }
}
// Run the scraper and print the flights; report failures instead of leaving
// the promise rejection unhandled.
getFlightsResults()
  .then(console.log)
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
准备
首先，我们需要创建一个 Node.js* 项目，并添加 npm 包 puppeteer、puppeteer-extra 和 puppeteer-extra-plugin-stealth，以便通过 DevTools Protocol 在 headless（无头）或非无头模式下控制 Chromium（也可以是 Chrome 或 Firefox，但这里我们只使用 Chromium）。
为此,在我们项目的目录中,打开命令行并输入:
$ npm init -y
,然后:
$ npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
*如果您尚未安装 Node.js，可以从 nodejs.org 下载，并按照安装文档进行安装。
📌注意：您也可以在不使用任何扩展的情况下使用 puppeteer，但我强烈建议将它与 puppeteer-extra 和 puppeteer-extra-plugin-stealth 一起使用，以防止网站检测到您正在使用无头 Chromium 或 web driver。您可以在 Chrome headless tests 网站上进行检查。下面的屏幕截图显示了差异。
至此，我们已完成项目的 Node.js 环境设置，接下来进入分步代码讲解。
Process
我们需要从 HTML 元素中提取数据。借助 SelectorGadget Chrome 扩展，获取合适的 CSS 选择器非常容易：只需在浏览器中单击所需的元素，即可得到对应的 CSS 选择器。但是，它并不总是能完美工作，尤其是当网站大量使用 JavaScript 时。
如果您想了解更多有关它们的信息,我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。
下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。
代码说明
从 puppeteer-extra 库声明用于控制 Chromium 浏览器的 puppeteer，并从 puppeteer-extra-plugin-stealth 库声明 StealthPlugin，以防止网站检测到您正在使用 web driver：
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
接下来，我们让 puppeteer 使用 StealthPlugin，并写下出发城市（from）、目的地（to）、出发日期（leaveDate）以及搜索 URL：
puppeteer.use(StealthPlugin());
const from = "Seattle";
const to = "Las Vegas";
const leaveDate = "5-15-2023"; // mm-dd-yyyy format
const URL = `https://www.google.com/travel/flights?hl=en-US&curr=USD`;
接下来，我们编写一个从页面中获取航班信息的函数：
async function getFlightsFromPage(page) {
...
}
在此功能中,我们将使用下一个方法和属性获取必要的信息:
首先，我们需要从页面上获取缩略图 URL。为此，我们从 ".EbY4Pc" 选择器对应的元素中获取包含该 URL 的 style 属性。然后，我们找到与 "url(" 匹配的 startIndex 和与 ";" 匹配的 endIndex，并截取出缩略图 URL：
const thumbnailString = el.querySelector(".EbY4Pc")?.getAttribute("style");
const startIndex = thumbnailString?.indexOf("url(");
const endIndex = thumbnailString?.indexOf(";");
const thumbnail = thumbnailString?.slice(startIndex + 4, endIndex - 1).replaceAll("\\", "") || "No thumbnail";
接下来，如果航班有中转，我们需要获取中转（经停）信息：
const layover = el.querySelector(".BbR8Ec .sSHqwe")?.getAttribute("aria-label");
然后,我们从页面中获取并返回所有航班信息:
return {
thumbnail,
companyName: el.querySelector(".Ir0Voe .sSHqwe")?.textContent.trim(),
description: el.querySelector(".mv1WYe")?.getAttribute("aria-label"),
duration: el.querySelector(".gvkrdb")?.textContent.trim(),
airportLeave: el.querySelectorAll(".Ak5kof .sSHqwe .eoY5cb")[0]?.textContent.trim(),
airportArive: el.querySelectorAll(".Ak5kof .sSHqwe .eoY5cb")[1]?.textContent.trim(),
layover: layover || "Nonstop",
emisions: el.querySelector(".V1iAHe > div")?.getAttribute("aria-label").replace(". Learn more about this emissions estimate", " "),
price: el.querySelector(".U3gSDe .YMlIz > span")?.textContent.trim(),
priceDescription: el.querySelector(".U3gSDe .JMnxgf > span > span > span")?.getAttribute("aria-label"),
};
接下来,我们编写一个函数来控制浏览器,并从每个类别中获取信息:
async function getFlightsResults() {
...
}
首先，在这个函数中，我们需要使用 puppeteer.launch({options}) 方法并传入当前的 options（例如 headless: true 和 args: ["--no-sandbox", "--disable-setuid-sandbox"]）来定义 browser。
这些选项表示我们使用 headless 模式，并传入一个参数（arguments）数组，用于允许在线 IDE 启动浏览器进程。然后我们打开一个新的 page，并将视口大小设置为 1280x720 像素：
const browser = await puppeteer.launch({
headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
page.setViewport({
width: 1280,
height: 720,
});
接下来，我们使用 .setDefaultNavigationTimeout() 方法将默认的导航超时时间（30 秒）改为 60000 毫秒（1 分钟），以适应较慢的网络连接，然后使用 .goto() 方法访问 URL：
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
然后，我们等待 ".e5F5td" 选择器加载完成（.waitForSelector() 方法），获取 from 和 to 两个输入框，依次单击它们并输入 from 和 to 城市（keyboard.type() 方法），按下 “Enter” 键，然后输入 leaveDate 并选择“单程”（One way）选项：
await page.waitForSelector(".e5F5td");
const inputs = await page.$$(".e5F5td");
// type "from"
await inputs[0].click();
await page.waitForTimeout(1000);
await page.keyboard.type(from);
await page.keyboard.press("Enter");
// type "to"
await inputs[1].click();
await page.waitForTimeout(1000);
await page.keyboard.type(to);
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");
await page.waitForTimeout(1000);
// type "Leave date"
await page.click(".rIZzse .d5wCYc");
await page.waitForTimeout(1000);
await page.keyboard.type(leaveDate);
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");
await page.waitForTimeout(1000);
// choose "One way"
await page.click(".UGrfjc .VfPpkd-RLmnJb");
await page.waitForTimeout(1000);
await page.click(".VfPpkd-qPzbhe-JNdkSc > li:last-child");
await page.waitForTimeout(1000);
// press "Done"
await page.click(".A8nfpe .akjk5c .VfPpkd-vQzf8d");
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");
填写完所有字段后，我们按下 “Enter” 键，并等待航班结果加载。如果存在“显示更多”按钮，则单击它，然后将航班结果保存到 flights 常量中：
// press "Search"
await page.waitForTimeout(1000);
await page.keyboard.press("Enter");
await page.waitForSelector(".pIav2d");
const moreButton = await page.$(".XsapA");
if (moreButton) {
await moreButton.click();
await page.waitForTimeout(2000);
}
const flights = await getFlightsFromPage(page);
最后,我们关闭浏览器,然后返回收到的数据:
await browser.close();
return flights;
现在我们可以启动解析器:
$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file
输出
[
{
"thumbnail":"https://www.gstatic.com/flights/airline_logos/70px/AA.png",
"companyName":"American",
"description":"Leaves Seattle-Tacoma International Airport at 11:55 PM on Monday, May 15 and arrives at Harry Reid International Airport at 5:54 PM on Tuesday, May 16.",
"duration":"17 hr 59 min",
"airportLeave":"Seattle-Tacoma International Airport",
"airportArive":"Harry Reid International Airport",
"layover":"Layover (1 of 1) is a 11 hr 4 min layover at Dallas/Fort Worth International Airport in Dallas.",
"emisions":"Carbon emissions estimate: 315 kilograms. +184% emissions ",
"price":"$318"
},
{
"thumbnail":"https://www.gstatic.com/flights/airline_logos/70px/AS.png",
"companyName":"Alaska",
"description":"Leaves Seattle-Tacoma International Airport at 7:10 PM on Monday, May 15 and arrives at Harry Reid International Airport at 4:36 PM on Tuesday, May 16.",
"duration":"21 hr 26 min",
"airportLeave":"Seattle-Tacoma International Airport",
"airportArive":"Harry Reid International Airport",
"layover":"Layover (1 of 1) is a 17 hr 42 min overnight layover at San Francisco International Airport in San Francisco.",
"emisions":"Carbon emissions estimate: 176 kilograms. +59% emissions ",
"price":"$323"
}
... and other flights results
]
如果您希望在这篇博客文章中添加其他功能，或者想看到一些使用 SerpApi 制作的项目，请给我留言。
添加 Feature Request💫 或 Bug🐞