将要抓取的内容
完整代码
如果您不需要解释,可以直接查看在线 IDE 中的完整代码示例(the full code example in the online IDE)。
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
// Query settings for the Google Play Books page that will be scraped.
const searchParams = {
  hl: "en", // language to use for the Google search
  gl: "us", // country to use for the Google search
  device: "phone", // search device. Options: phone, tablet, tv, chromebook
  category: null, // full list of supported categories: https://serpapi.com/google-play-books-categories
};

// Target page: the "/category/<id>" path segment is only present when a category is set.
const URL = `https://play.google.com/store/books${
  searchParams.category ? `/category/${searchParams.category}` : ""
}?hl=${searchParams.hl}&gl=${searchParams.gl}&device=${searchParams.device}`;
/**
 * Scrolls the window to the bottom of the given container repeatedly until the
 * container's scrollHeight stops changing between two consecutive checks.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate` is used).
 * @param {string} scrollContainer - CSS selector of the element whose scrollHeight is tracked.
 * @param {number} [delayMs=4000] - Pause after each scroll before re-measuring, in milliseconds.
 * @returns {Promise<void>} Resolves once the height no longer changes.
 */
async function scrollPage(page, scrollContainer, delayMs = 4000) {
  // The selector is passed as an argument instead of being interpolated into a
  // code string, so selectors containing quotes cannot break the evaluated script.
  const readHeight = () =>
    page.evaluate((selector) => document.querySelector(selector).scrollHeight, scrollContainer);

  // Plain timer-based pause: `page.waitForTimeout` is not available in recent Puppeteer releases.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  let lastHeight = await readHeight();
  while (true) {
    await page.evaluate(
      (selector) => window.scrollTo(0, document.querySelector(selector).scrollHeight),
      scrollContainer
    );
    await pause(delayMs);
    const newHeight = await readHeight();
    if (newHeight === lastHeight) {
      break; // nothing new was loaded by the last scroll
    }
    lastHeight = newHeight;
  }
}
/**
 * Extracts the books shown on the page, grouped by the category heading of
 * each "section .oVnAB" block.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate` is used).
 * @returns {Promise<Object<string, Array<object>>>} Map of category title to a list of
 *   `{ title, link, rating, originalPrice, price, thumbnail, bookId }` records.
 *   Fields whose element or attribute is absent are left undefined; `rating` falls
 *   back to "No rating".
 */
async function getBooksFromPage(page) {
  // Everything inside this callback runs in the page context, so helpers must be
  // defined inside it rather than in the surrounding Node scope.
  return page.evaluate(() => {
    const BASE_URL = "https://play.google.com";

    const parseBook = (book) => {
      const href = book.querySelector(".Si6A0c")?.getAttribute("href");
      // Without an href there is no meaningful link or id to report.
      const link = href ? `${BASE_URL}${href}` : undefined;
      // Take only the value of the `id` query parameter, not whatever follows it.
      const bookId = href?.match(/[?&]id=([^&#]+)/)?.[1];
      const ratingLabel = book.querySelector(".LrNMN[aria-label]")?.getAttribute("aria-label");
      return {
        title: book.querySelector(".hP61id div:first-child")?.getAttribute("title"),
        link,
        // Characters 6..8 of the label hold the number (e.g. "Rated 4.7 stars ...").
        // NOTE(review): this offset presumably only fits the English (hl=en) label; confirm for other locales.
        rating: parseFloat(ratingLabel?.slice(6, 9)) || "No rating",
        originalPrice: book.querySelector(".LrNMN .SUZt4c")?.textContent.trim(),
        price: book.querySelector(".LrNMN .VfPpfd")?.textContent.trim(),
        // Drops the last three characters of srcset (presumably a density suffix such as " 2x").
        thumbnail: book.querySelector(".TjRVLb img")?.getAttribute("srcset")?.slice(0, -3),
        bookId,
      };
    };

    const categories = Array.from(document.querySelectorAll("section .oVnAB"), (block) => {
      const categoryTitle = block.querySelector(".kcen6d").textContent.trim();
      const books = Array.from(block.parentElement.querySelectorAll(".ULeU3b"), parseBook);
      return [categoryTitle, books];
    });
    // A later block with the same title replaces the earlier one.
    return Object.fromEntries(categories);
  });
}
/**
 * Launches a browser, opens the target URL, scrolls until all content is loaded
 * and returns the scraped books grouped by category.
 *
 * @returns {Promise<object>} The result of `getBooksFromPage` for the loaded page.
 */
async function getMainPageInfo() {
  const browser = await puppeteer.launch({
    headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    page.setDefaultNavigationTimeout(60000); // allow up to 1 minute for navigation
    await page.goto(URL);
    await page.waitForSelector(".oVnAB");
    await scrollPage(page, ".T4LgNb");
    return await getBooksFromPage(page);
  } finally {
    // Always release the browser process, even when navigation or scraping fails.
    await browser.close();
  }
}
// Run the scraper and print the full result; report failures instead of leaving the rejection unhandled.
getMainPageInfo()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
准备
首先,我们需要创建一个 Node.js* 项目,并添加 npm 包 puppeteer、puppeteer-extra 和 puppeteer-extra-plugin-stealth,以便通过 DevTools Protocol 在有头或无头(headless)模式下控制 Chromium(也可以是 Chrome 或 Firefox,但这里我们只使用 Chromium)。
在使用我们的项目的目录中,打开命令行并输入npm init -y
,然后是npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth
。
*如果您没有安装node.js,则可以download it from nodejs.org并遵循安装documentation。
📌注意:您也可以只使用 puppeteer 而不使用任何扩展,但我强烈建议将它与 puppeteer-extra 和 puppeteer-extra-plugin-stealth 一起使用,以防止网站检测到您正在使用无头 Chromium 或 web driver。您可以在 Chrome headless tests website 上进行检查。下面的屏幕截图显示了差异。
Process
首先,我们需要滚动所有书籍列表,直到没有更多的清单加载,这是下面描述的困难部分。
下一步是在滚动完成后从HTML元素中提取数据。通过SelectorGadget Chrome extension,获得合适的CSS选择器的过程非常容易,该过程能够通过单击浏览器中的所需元素来获取CSS选择器。但是,它并不总是完美地工作,尤其是当JavaScript大量使用该网站时。
如果您想了解更多有关它们的信息,我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。
下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。
代码说明
从 puppeteer-extra 库中声明用于控制 Chromium 浏览器的 puppeteer,并从 puppeteer-extra-plugin-stealth 库中声明 StealthPlugin,以防止网站检测到您正在使用 web driver:
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
接下来,我们“说” puppeteer
使用StealthPlugin
,编写必要的请求参数,并使用ternary operator搜索URL(URL可能会有所不同,具体取决于是否指定了类别):
const searchParams = {
hl: "en", // Parameter defines the language to use for the Google search
gl: "us", // parameter defines the country to use for the Google search
device: "phone", // parameter defines the search device. Options: phone, tablet, tv, chromebook
category: null, // you can see the full list of supported categories on https://serpapi.com/google-play-books-categories
};
const URL = searchParams.category
? `https://play.google.com/store/books/category/${searchParams.category}?hl=${searchParams.hl}&gl=${searchParams.gl}&device=${searchParams.device}`
: `https://play.google.com/store/books?hl=${searchParams.hl}&gl=${searchParams.gl}&device=${searchParams.device}`;
如果将category
参数设置为null,则意味着我们使用默认类别(eBooks),而URL将看起来像这样:
"https://play.google.com/store/books?hl=en&gl=US";
否则,URL看起来像这样:
"https://play.google.com/store/books/category/coll_1665?hl=en&gl=US";
The full list of supported categories看起来像:
{
"coll_1665": {
"section": "Books",
"html_structure": "rows",
"category": "Arts & entertainment"
},
"subj_Art___Humor.AH_Art": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Art"
},
"subj_Art___Humor.AH_Drama": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Drama"
},
"subj_Art___Humor.AH_Humor": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Humor"
},
"subj_Art___Humor.AH_Music": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Music"
},
...and other categories
}
接下来,我们编写一个函数,通过滚动页面来加载所有书籍:
async function scrollPage(page, scrollContainer) {
...
}
在此函数中,首先我们需要获得 scrollContainer 的高度(使用 evaluate() 方法)。然后,我们使用 while 循环,在其中滚动 scrollContainer,等待 4 秒(使用 waitForTimeout() 方法),并获得新的 scrollContainer 高度。
接下来,我们检查 newHeight 是否等于 lastHeight:如果相等,就停止循环;否则,我们把 newHeight 的值赋给 lastHeight 变量,并重复上述步骤,直到页面滚动到末尾:
let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
while (true) {
await page.evaluate(`window.scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
await page.waitForTimeout(4000);
let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
if (newHeight === lastHeight) {
break;
}
lastHeight = newHeight;
}
接下来,我们编写一个函数以从页面获取书籍数据:
async function getBooksFromPage(page) {
...
}
在此函数中,我们从页面上下文中获取信息并将其保存在返回的对象中。接下来,我们需要使用 "section .oVnAB" 选择器(querySelectorAll() 方法)获取所有 HTML 元素。然后,我们使用 reduce() 方法(它允许用结果构造对象)迭代由 Array.from() 方法构建的数组:
const books = await page.evaluate(() => {
const mainPageInfo = Array.from(document.querySelectorAll("section .oVnAB")).reduce((result, block) => {
...
}, {});
return mainPageInfo;
});
return books;
最后,我们需要获得每个类别的 categoryTitle,以及该类别中每本书(通过 querySelectorAll() 方法选出)的 title、link、rating、originalPrice、price、thumbnail 和 bookId(bookId 是从 link 中截取出来的)。为此,我们使用 querySelector()、getAttribute()、textContent、trim()、slice()、indexOf() 和 parseFloat() 等方法。
在每个迭代步骤中,我们返回上一步的结果(使用 spread 语法),并添加一个以 categoryTitle 常量的值命名的新类别:
const categoryTitle = block.querySelector(".kcen6d").textContent.trim();
const books = Array.from(block.parentElement.querySelectorAll(".ULeU3b")).map((book) => {
const link = `https://play.google.com${book.querySelector(".Si6A0c")?.getAttribute("href")}`;
const bookId = link.slice(link.indexOf("?id=") + 4);
return {
title: book.querySelector(".hP61id div:first-child")?.getAttribute("title"),
link,
rating: parseFloat(book.querySelector(".LrNMN[aria-label]")?.getAttribute("aria-label").slice(6, 9)) || "No rating",
originalPrice: book.querySelector(".LrNMN .SUZt4c")?.textContent.trim(),
price: book.querySelector(".LrNMN .VfPpfd")?.textContent.trim(),
thumbnail: book.querySelector(".TjRVLb img")?.getAttribute("srcset").slice(0, -3),
bookId,
};
});
return {
...result,
[categoryTitle]: books,
};
接下来,编写一个函数来控制浏览器并获取信息:
async function getMainPageInfo() {
...
}
首先,在此功能中,我们需要使用带有当前options
的puppeteer.launch({options})
方法来定义browser
,例如headless: true
和args: ["--no-sandbox", "--disable-setuid-sandbox"]
。
这些选项意味着我们使用 headless 模式,并传入一个参数(arguments)数组,用来允许在在线 IDE 中启动浏览器进程。然后我们打开一个新的 page:
const browser = await puppeteer.launch({
headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
接下来,为了应对较慢的网络连接,我们使用 setDefaultNavigationTimeout() 方法把默认的导航等待时间(30 秒)改为 60000 毫秒(1 分钟),使用 goto() 方法访问 URL,并使用 waitForSelector() 方法等待选择器加载完成:
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector(".oVnAB");
最后,我们等待页面滚动完成,把书籍数据保存到 books 常量中,关闭浏览器,然后返回收到的数据:
await scrollPage(page, ".T4LgNb");
const books = await getBooksFromPage(page);
await browser.close();
return books;
现在我们可以启动我们的解析器:
$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file
输出
{
"Start a new series":[
{
"title":"Magic Lessons: The Prequel to Practical Magic",
"link":"https://play.google.com/store/books/details/Alice_Hoffman_Magic_Lessons?id=sejNDwAAQBAJ",
"rating":4.7,
"price":"$12.99",
"thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/sejNDwAAQBAJ?fife=w512-h512",
"bookId":"sejNDwAAQBAJ"
},
... and other results
],
"New to rent":[
{
"title":"The Forever War",
"link":"https://play.google.com/store/books/details/Joe_Haldeman_The_Forever_War?id=SUFOBQAAQBAJ",
"rating":4.5,
"originalPrice":"$2.99",
"price":"$2.69",
"thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/SUFOBQAAQBAJ?fife=w512-h512",
"bookId":"SUFOBQAAQBAJ"
},
... and other results
],
... and other categories
}
使用 SerpApi 的 Google Play Books Store API
本节是为了显示DIY解决方案与我们的解决方案之间的比较。
最大的区别是您不需要从头开始创建解析器并维护它。
另外,请求有可能在某个时候被 Google 阻止,我们会在后端处理这种情况,因此您无需自己弄清楚如何应对,也无需考虑该使用哪个验证码服务或代理提供商。
首先,我们需要安装 google-search-results-nodejs:
npm i google-search-results-nodejs
这是full code example,如果您不需要说明:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com
// Request parameters passed to search.json() for the Google Play Books search.
const params = {
engine: "google_play", // search engine to use
gl: "us", // country to use for the Google search
hl: "en", // language to use for the Google search
store: "books", // type of Google Play store
store_device: "phone", // search device. Options: phone, tablet, tv, chromebook, watch, car
// To get books from a single category, uncomment the "books_category" parameter below.
// books_category: "audiobooks", // full list of supported categories: https://serpapi.com/google-play-books-categories
};
/**
 * Promise adapter around the callback-style `search.json` call.
 *
 * @returns {Promise<object>} Resolves with whatever `search.json` passes to its callback.
 */
const getJson = () =>
  new Promise((onResult) => {
    search.json(params, onResult);
  });
/**
 * Fetches the search response and reshapes it into a map of category title to books.
 *
 * @returns {Promise<Object<string, Array<object>>>} Each book has `title`, `link`,
 *   `rating` ("No rating" when absent), `price`, `thumbnail`, `movieId`, and
 *   `originalPrice` only when the response provides `original_price`.
 * @throws {Error} When the response carries an `error` field.
 */
const getResults = async () => {
  const json = await getJson();
  // Surface a failure reported in the payload instead of crashing on a missing `organic_results` below.
  if (json?.error) {
    throw new Error(`SerpApi request failed: ${json.error}`);
  }
  const categories = json?.organic_results ?? [];
  const entries = categories.map(({ title: categoryTitle, items = [] }) => {
    const books = items.map((book) => {
      const { title, link, rating = "No rating", original_price, price, thumbnail, product_id } = book;
      const returnedBook = {
        title,
        link,
        rating,
        price,
        thumbnail,
        // NOTE(review): the key is named "movieId" although it holds a book's product id;
        // kept as-is so existing consumers of the output keep working.
        movieId: product_id,
      };
      if (original_price) returnedBook.originalPrice = original_price;
      return returnedBook;
    });
    return [categoryTitle, books];
  });
  // A later category with the same title replaces the earlier one.
  return Object.fromEntries(entries);
};
// Run the request and print the full result; report failures instead of leaving the rejection unhandled.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
代码说明
首先,我们需要从 google-search-results-nodejs 库中声明 SerpApi,并使用 SerpApi 的 API 密钥定义新的 search 实例:
const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);
接下来,我们编写发起请求所需的参数(如果要设置搜索类别,则需要取消 books_category 参数的注释):
const params = {
engine: "google_play", // search engine
gl: "us", // parameter defines the country to use for the Google search
hl: "en", // parameter defines the language to use for the Google search
store: "books", // parameter defines the type of Google Play store
store_device: "phone", // parameter defines the search device. Options: phone, tablet, tv, chromebook, watch, car
// if you need to find books from one of the categories you need to uncomment the "books_category" parameter
// books_category: "audiobooks", // you can see the full list of supported categories on https://serpapi.com/google-play-books-categories
};
The full list of supported categories看起来像:
{
"coll_1665": {
"section": "Books",
"html_structure": "rows",
"category": "Arts & entertainment"
},
"subj_Art___Humor.AH_Art": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Art"
},
"subj_Art___Humor.AH_Drama": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Drama"
},
"subj_Art___Humor.AH_Humor": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Humor"
},
"subj_Art___Humor.AH_Music": {
"section": "Books",
"html_structure": "list",
"category": "Arts & entertainment",
"subcategory": "Music"
},
...and other categories
}
接下来,我们从Serpapi库中包装搜索方法,以便进一步处理搜索结果:
const getJson = () => {
return new Promise((resolve) => {
search.json(params, resolve);
});
};
最后,我们声明从页面获取数据并返回的函数 getResults:
const getResults = async () => {
...
};
首先,在此函数中,我们获得带有结果的 json,然后需要迭代所接收 json 中的 organic_results 数组。为此,我们使用 reduce() 方法(它允许用结果构造对象)。在每个迭代步骤中,我们返回上一步的结果(使用 spread 语法),并添加一个以 categoryTitle 常量的值命名的新类别:
const json = await getJson();
const booksResults = json.organic_results.reduce((result, category) => {
...
return {
...result,
[categoryTitle]: books,
};
}, {});
return booksResults;
接下来,我们解构 category 元素,把 title 重命名为 categoryTitle 常量,然后迭代 items 数组,以获取该类别中的所有书籍。为此,我们需要解构 book 元素,为 rating 设置默认值 "No rating",并返回这个常量:
const { title: categoryTitle, items } = category;
const books = items.map((book) => {
const { title, link, rating = "No rating", original_price, price, thumbnail, product_id } = book;
const returnedBook = {
title,
link,
rating,
price,
thumbnail,
movieId: product_id,
};
if (original_price) returnedBook.originalPrice = original_price;
return returnedBook;
});
之后,我们运行 getResults 函数,并使用 console.dir 方法在控制台中打印所有接收到的信息,该方法允许您通过带有必要参数的对象来更改默认输出选项:
getResults().then((result) => console.dir(result, { depth: null }));
输出
{
"New releases":[
{
"title":"The Golden Enclaves: A Novel",
"link":"https://play.google.com/store/books/details/Naomi_Novik_The_Golden_Enclaves?id=7qBSEAAAQBAJ",
"rating":4.7,
"price":"$13.99",
"thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/7qBSEAAAQBAJ?fife=w256-h256",
"movieId":"7qBSEAAAQBAJ"
},
... and other results
],
"Advice for a better life":[
{
"title":"How to Stop Feeling Like Sh*t: 14 Habits that Are Holding You Back from Happiness",
"link":"https://play.google.com/store/books/details/Andrea_Owen_How_to_Stop_Feeling_Like_Sh_t?id=ekfiDQAAQBAJ",
"rating":4.4,
"price":"$11.99",
"thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/ekfiDQAAQBAJ?fife=w256-h256",
"movieId":"ekfiDQAAQBAJ"
},
... and other results
],
... and other categories
}
链接
如果您想查看一些用serpapi制定的项目,write me a message。
添加 Feature Request💫 或 Bug🐞