使用Node.js抓取Google Play图书-DEV365 开发者社区

将要抓取的内容

完整代码

如果您不需要解释，可以直接查看the full code example in the online IDE。

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

puppeteer.use(StealthPlugin());

// Settings that determine which Google Play Books page is requested.
const searchParams = {
  hl: "en", // language to use for the Google search
  gl: "us", // country to use for the Google search
  device: "phone", // search device; options: phone, tablet, tv, chromebook
  category: null, // optional category id; full list of supported categories: https://serpapi.com/google-play-books-categories
};

// The "/category/<id>" path segment is only present when a category is configured;
// the query string is the same in both cases.
const URL = [
  "https://play.google.com/store/books",
  searchParams.category ? `/category/${searchParams.category}` : "",
  `?hl=${searchParams.hl}&gl=${searchParams.gl}&device=${searchParams.device}`,
].join("");

/**
 * Scrolls the window to the bottom of `scrollContainer` over and over until
 * the container's `scrollHeight` stops growing between two consecutive reads.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate` is used).
 * @param {string} scrollContainer - CSS selector of the element whose height is tracked.
 * @param {number} [delayMs=4000] - Pause after each scroll so newly requested
 *   content has time to load before the height is read again.
 * @returns {Promise<void>} Resolves once the height no longer changes.
 * @throws {Error} If no element matches `scrollContainer`.
 */
async function scrollPage(page, scrollContainer, delayMs = 4000) {
  // The selector is passed as an argument rather than interpolated into a
  // script string, so quotes inside the selector cannot break the evaluated code.
  const readHeight = async () => {
    const height = await page.evaluate(
      (selector) => document.querySelector(selector)?.scrollHeight ?? null,
      scrollContainer
    );
    if (height === null) {
      throw new Error(`scrollPage: no element matches selector "${scrollContainer}"`);
    }
    return height;
  };

  let lastHeight = await readHeight();
  while (true) {
    await page.evaluate(
      (selector) => window.scrollTo(0, document.querySelector(selector).scrollHeight),
      scrollContainer
    );
    // A plain timer keeps the pause independent of `page.waitForTimeout`,
    // which is deprecated and absent from newer Puppeteer releases.
    await new Promise((resolve) => setTimeout(resolve, delayMs));
    const newHeight = await readHeight();
    if (newHeight === lastHeight) {
      break;
    }
    lastHeight = newHeight;
  }
}

/**
 * Extracts every book listed on the currently loaded page, grouped by the
 * heading text of the block it appears under.
 *
 * @param {object} page - Puppeteer page (only `page.evaluate` is used).
 * @returns {Promise<Object<string, object[]>>} Map of category title to an
 *   array of `{ title, link, rating, originalPrice, price, thumbnail, bookId }`.
 *   Fields whose element or attribute is absent are `undefined`
 *   (`rating` falls back to the string "No rating").
 */
async function getBooksFromPage(page) {
  return page.evaluate(() => {
    // Everything in this callback runs inside the page, so helpers must be
    // declared here rather than in the surrounding Node.js scope.
    const parseBook = (book) => {
      const href = book.querySelector(".Si6A0c")?.getAttribute("href");
      const ratingLabel = book.querySelector(".LrNMN[aria-label]")?.getAttribute("aria-label");
      return {
        title: book.querySelector(".hP61id div:first-child")?.getAttribute("title"),
        // Without an href there is no meaningful link; avoid emitting "...comundefined".
        link: href ? `https://play.google.com${href}` : undefined,
        // Characters 6..8 of the label hold the numeric rating (e.g. "Rated 4.7 ...").
        rating: parseFloat(ratingLabel?.slice(6, 9)) || "No rating",
        originalPrice: book.querySelector(".LrNMN .SUZt4c")?.textContent.trim(),
        price: book.querySelector(".LrNMN .VfPpfd")?.textContent.trim(),
        // Drop the trailing 3-character density descriptor (e.g. " 2x") from srcset.
        thumbnail: book.querySelector(".TjRVLb img")?.getAttribute("srcset")?.slice(0, -3),
        // Take only the value of the `id` query parameter, not everything after it.
        bookId: href?.match(/[?&]id=([^&#]+)/)?.[1],
      };
    };

    const categoryEntries = [];
    for (const block of document.querySelectorAll("section .oVnAB")) {
      const categoryTitle = block.querySelector(".kcen6d")?.textContent.trim();
      // A block without a heading cannot be keyed; skip it instead of aborting the whole scrape.
      if (!categoryTitle) {
        continue;
      }
      const books = Array.from(block.parentElement.querySelectorAll(".ULeU3b"), (book) => parseBook(book));
      categoryEntries.push([categoryTitle, books]);
    }
    // As before, a later block with the same title replaces an earlier one.
    return Object.fromEntries(categoryEntries);
  });
}

/**
 * Launches a headless browser, opens the configured Google Play Books URL,
 * scrolls until no more content loads and returns the scraped books.
 *
 * @returns {Promise<object>} The value produced by `getBooksFromPage`.
 * @throws Propagates any navigation, selector-wait or scraping error; the
 *   browser is closed in every case.
 */
async function getMainPageInfo() {
  const browser = await puppeteer.launch({
    headless: true, // set to false to watch what the browser is doing
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });

  // try/finally guarantees the browser process is shut down even when a step
  // below throws (e.g. a navigation or selector timeout).
  try {
    const page = await browser.newPage();

    page.setDefaultNavigationTimeout(60000);
    await page.goto(URL);

    await page.waitForSelector(".oVnAB");

    await scrollPage(page, ".T4LgNb");

    return await getBooksFromPage(page);
  } finally {
    await browser.close();
  }
}

// Run the scraper and print the full nested result. Failures are reported
// explicitly and signalled through the exit code instead of being left as an
// unhandled promise rejection.
getMainPageInfo()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error("Failed to scrape Google Play Books:", error);
    process.exitCode = 1;
  });

准备

首先，我们需要创建一个Node.js*项目，并添加npm包puppeteer、puppeteer-extra和puppeteer-extra-plugin-stealth，以便通过DevTools Protocol在有头或无头（headless）模式下控制Chromium（也可以是Chrome或Firefox，但现在我们只使用Chromium）。

在使用我们的项目的目录中，打开命令行并输入npm init -y，然后是npm i puppeteer puppeteer-extra puppeteer-extra-plugin-stealth。

*如果您没有安装node.js，则可以download it from nodejs.org并遵循安装documentation。

📌注意：您也可以在不使用任何扩展的情况下使用puppeteer，但我强烈建议将puppeteer-extra与puppeteer-extra-plugin-stealth一起使用，以防止网站检测到您正在使用无头Chromium或web driver。您可以在Chrome headless tests website上进行检查。下面的屏幕截图显示了差异。

Process

首先，我们需要滚动所有书籍列表，直到没有更多的清单加载，这是下面描述的困难部分。

下一步是在滚动完成后从HTML元素中提取数据。通过SelectorGadget Chrome extension，获得合适的CSS选择器的过程非常容易，该过程能够通过单击浏览器中的所需元素来获取CSS选择器。但是，它并不总是完美地工作，尤其是当JavaScript大量使用该网站时。

如果您想了解更多有关它们的信息，我们在Serpapi上有专门的Web Scraping with CSS Selectors博客文章。

下面的GIF说明了使用Selectorgadget选择结果的不同部分的方法。

代码说明

从puppeteer-extra库中声明用于控制Chromium浏览器的puppeteer，并从puppeteer-extra-plugin-stealth库中声明StealthPlugin，以防止网站检测到您正在使用web driver：

const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");

接下来，我们让puppeteer使用StealthPlugin，编写必要的请求参数，并使用三元运算符（ternary operator）构造搜索URL（URL会因是否指定了类别而有所不同）：

const searchParams = {
  hl: "en", // Parameter defines the language to use for the Google search
  gl: "us", // parameter defines the country to use for the Google search
  device: "phone", // parameter defines the search device. Options: phone, tablet, tv, chromebook
  category: null, // you can see the full list of supported categories on https://serpapi.com/google-play-books-categories
};

const URL = searchParams.category
  ? `https://play.google.com/store/books/category/${searchParams.category}?hl=${searchParams.hl}&gl=${searchParams.gl}&device=${searchParams.device}`
  : `https://play.google.com/store/books?hl=${searchParams.hl}&gl=${searchParams.gl}&device=${searchParams.device}`;

如果将category参数设置为null，则意味着我们使用默认类别（eBooks），而URL将看起来像这样：

"https://play.google.com/store/books?hl=en&gl=us&device=phone";

否则，URL看起来像这样：

"https://play.google.com/store/books/category/coll_1665?hl=en&gl=us&device=phone";

仅在需要应用类别时才更改URL：

The full list of supported categories看起来像：

{
  "coll_1665": {
    "section": "Books",
    "html_structure": "rows",
    "category": "Arts & entertainment"
  },
  "subj_Art___Humor.AH_Art": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Art"
  },
  "subj_Art___Humor.AH_Drama": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Drama"
  },
  "subj_Art___Humor.AH_Humor": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Humor"
  },
  "subj_Art___Humor.AH_Music": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Music"
  },
  ...and other categories
}

接下来，我们编写一个函数以滚动页面以加载所有文章：

async function scrollPage(page, scrollContainer) {
  ...
}

在此函数中，首先，我们需要获取scrollContainer的高度（使用page.evaluate()方法）。然后，我们使用while循环，在循环中滚动到scrollContainer的底部，等待4秒（使用page.waitForTimeout()方法），并获取新的scrollContainer高度。

接下来，我们检查newHeight是否等于lastHeight：如果相等，则停止循环；否则，将newHeight的值赋给lastHeight变量，并重复上述步骤，直到页面滚动到底部：

let lastHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
while (true) {
  await page.evaluate(`window.scrollTo(0, document.querySelector("${scrollContainer}").scrollHeight)`);
  await page.waitForTimeout(4000);
  let newHeight = await page.evaluate(`document.querySelector("${scrollContainer}").scrollHeight`);
  if (newHeight === lastHeight) {
    break;
  }
  lastHeight = newHeight;
}

接下来，我们编写一个函数以从页面获取书籍数据：

async function getBooksFromPage(page) {
  ...
}

在此函数中，我们从页面上下文中获取信息并将其保存在返回的对象中。首先，我们需要使用"section .oVnAB"选择器（document.querySelectorAll()方法）获取所有HTML元素。然后，我们使用reduce()方法（它可以把结果累积成一个对象）遍历由Array.from()方法构建的数组：

const books = await page.evaluate(() => {
  const mainPageInfo = Array.from(document.querySelectorAll("section .oVnAB")).reduce((result, block) => {
      ...
    }, {});

    return mainPageInfo;
});
return books;

最后，我们需要获取每个类别块（block）的categoryTitle，以及该类别中每本书的title、link、rating、originalPrice、price、thumbnail和bookId（bookId可以从link中截取）。为此，我们使用querySelector()、querySelectorAll()、getAttribute()、textContent、trim()、slice()、indexOf()和parseFloat()等方法。

在每个迭代步骤中，我们返回上一步的结果（使用展开语法...result），并添加一个以categoryTitle常量为名称的新类别：

const categoryTitle = block.querySelector(".kcen6d").textContent.trim();
const books = Array.from(block.parentElement.querySelectorAll(".ULeU3b")).map((book) => {
  const link = `https://play.google.com${book.querySelector(".Si6A0c")?.getAttribute("href")}`;
  const bookId = link.slice(link.indexOf("?id=") + 4);
  return {
    title: book.querySelector(".hP61id div:first-child")?.getAttribute("title"),
    link,
    rating: parseFloat(book.querySelector(".LrNMN[aria-label]")?.getAttribute("aria-label").slice(6, 9)) || "No rating",
    originalPrice: book.querySelector(".LrNMN .SUZt4c")?.textContent.trim(),
    price: book.querySelector(".LrNMN .VfPpfd")?.textContent.trim(),
    thumbnail: book.querySelector(".TjRVLb img")?.getAttribute("srcset").slice(0, -3),
    bookId,
  };
});
return {
  ...result,
  [categoryTitle]: books,
};

接下来，编写一个函数来控制浏览器并获取信息：

async function getMainPageInfo() {
  ...
}

首先，在此功能中，我们需要使用带有当前options的puppeteer.launch({options})方法来定义browser，例如headless: true和args: ["--no-sandbox", "--disable-setuid-sandbox"]。

这些选项表示我们使用headless模式，并传入一个参数（arguments）数组，用于允许在在线IDE中启动浏览器进程。然后我们打开一个新的page：

const browser = await puppeteer.launch({
  headless: true, // if you want to see what the browser is doing, you need to change this option to "false"
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
});

const page = await browser.newPage();

接下来，为了适应较慢的网络连接，我们使用page.setDefaultNavigationTimeout()方法将默认的导航超时时间（30秒）改为60000毫秒（1分钟），使用page.goto()方法访问URL，并使用page.waitForSelector()方法等待选择器加载完成：

await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector(".oVnAB");

最后，我们等待页面滚动完成，将书籍数据保存到books常量中，关闭浏览器，然后返回收到的数据：

await scrollPage(page, ".T4LgNb");

const books = await getBooksFromPage(page);

await browser.close();

return books;

现在我们可以启动我们的解析器：

$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file

输出

{
   "Start a new series":[
      {
         "title":"Magic Lessons: The Prequel to Practical Magic",
         "link":"https://play.google.com/store/books/details/Alice_Hoffman_Magic_Lessons?id=sejNDwAAQBAJ",
         "rating":4.7,
         "price":"$12.99",
         "thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/sejNDwAAQBAJ?fife=w512-h512",
         "bookId":"sejNDwAAQBAJ"
      },
      ... and other results
   ],
   "New to rent":[
      {
         "title":"The Forever War",
         "link":"https://play.google.com/store/books/details/Joe_Haldeman_The_Forever_War?id=SUFOBQAAQBAJ",
         "rating":4.5,
         "originalPrice":"$2.99",
         "price":"$2.69",
         "thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/SUFOBQAAQBAJ?fife=w512-h512",
         "bookId":"SUFOBQAAQBAJ"
      },
      ... and other results
   ],
   ... and other categories
}

使用SerpApi的Google Play Books API

本节是为了显示DIY解决方案与我们的解决方案之间的比较。

最大的区别是您不需要从头开始创建解析器并维护它。

另外，Google可能在某个时候阻止请求；我们在后端处理这种情况，因此您无需自己研究如何解决，也无需考虑使用哪种验证码破解服务或代理提供商。

首先，我们需要安装google-search-results-nodejs：

npm i google-search-results-nodejs

这是full code example，如果您不需要说明：

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(process.env.API_KEY); //your API key from serpapi.com

// Request parameters passed to `search.json` for the Google Play books search.
const params = {
  engine: "google_play", // search engine
  gl: "us", // country to use for the Google search
  hl: "en", // language to use for the Google search
  store: "books", // type of Google Play store
  store_device: "phone", // search device; options: phone, tablet, tv, chromebook, watch, car
  // To fetch books from a single category, uncomment the "books_category" parameter below.
  // books_category: "audiobooks", // you can see the full list of supported categories on https://serpapi.com/google-play-books-categories
};

// Adapts the callback-style `search.json` call into a promise that resolves
// with the value handed to the callback.
const getJson = () =>
  new Promise((resolve) => {
    search.json(params, (data) => resolve(data));
  });

/**
 * Fetches the search response via `getJson` and regroups it as a map of
 * category title to an array of simplified book objects.
 *
 * @returns {Promise<Object<string, object[]>>} Books grouped by category title.
 * @throws {Error} If the response does not contain an `organic_results` array;
 *   the message includes the response's `error` field when one is present.
 */
const getResults = async () => {
  const json = await getJson();
  // Fail with a descriptive message instead of an opaque TypeError when the
  // response carries no results.
  if (!Array.isArray(json?.organic_results)) {
    const reason = json?.error ?? "response has no organic_results";
    throw new Error(`SerpApi request failed: ${reason}`);
  }

  // A category without `items` yields an empty list rather than throwing.
  const categoryEntries = json.organic_results.map(({ title: categoryTitle, items = [] }) => {
    const books = items.map((book) => {
      const { title, link, rating = "No rating", original_price, price, thumbnail, product_id } = book;
      const returnedBook = {
        title,
        link,
        rating,
        price,
        thumbnail,
        // NOTE(review): this holds the book's product id; the key name
        // `movieId` is kept unchanged so the existing output shape is preserved.
        movieId: product_id,
      };
      if (original_price) {
        returnedBook.originalPrice = original_price;
      }
      return returnedBook;
    });
    return [categoryTitle, books];
  });

  // As before, a later category with the same title replaces an earlier one.
  return Object.fromEntries(categoryEntries);
};

// Fetch the results and print the full nested structure. Failures are reported
// explicitly and signalled through the exit code instead of being left as an
// unhandled promise rejection.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error("Failed to fetch Google Play Books results:", error);
    process.exitCode = 1;
  });

代码说明

首先，我们需要从google-search-results-nodejs库中声明SerpApi，并使用您在SerpApi的API密钥定义新的search实例：

const SerpApi = require("google-search-results-nodejs");
const search = new SerpApi.GoogleSearch(API_KEY);

接下来，我们编写发出请求所需的参数（如果要设置搜索类别，则需要取消注释books_category参数）：

const params = {
  engine: "google_play", // search engine
  gl: "us", // parameter defines the country to use for the Google search
  hl: "en", // parameter defines the language to use for the Google search
  store: "books", // parameter defines the type of Google Play store
  store_device: "phone", // parameter defines the search device. Options: phone, tablet, tv, chromebook, watch, car
  // if you need to find books from one of the categories you need to uncomment the "books_category" parameter
  // books_category: "audiobooks", // you can see the full list of supported categories on https://serpapi.com/google-play-books-categories
};

The full list of supported categories看起来像：

{
  "coll_1665": {
    "section": "Books",
    "html_structure": "rows",
    "category": "Arts & entertainment"
  },
  "subj_Art___Humor.AH_Art": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Art"
  },
  "subj_Art___Humor.AH_Drama": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Drama"
  },
  "subj_Art___Humor.AH_Humor": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Humor"
  },
  "subj_Art___Humor.AH_Music": {
    "section": "Books",
    "html_structure": "list",
    "category": "Arts & entertainment",
    "subcategory": "Music"
  },
  ...and other categories
}

接下来，我们从Serpapi库中包装搜索方法，以便进一步处理搜索结果：

const getJson = () => {
  return new Promise((resolve) => {
    search.json(params, resolve);
  });
};

最后，我们声明getResults函数，它从页面获取数据并将其返回：

const getResults = async () => {
  ...
};

首先，在此函数中，我们获取带有结果的json，然后需要遍历所接收json中的organic_results数组。为此，我们使用reduce()方法（它可以把结果累积成一个对象）。在每个迭代步骤中，我们返回上一步的结果（使用展开语法...result），并添加一个以categoryTitle常量为名称的新类别：

  const json = await getJson();
  const booksResults = json.organic_results.reduce((result, category) => {
    ...
    return {
      ...result,
      [categoryTitle]: books,
    };
  }, {});
  return booksResults;

接下来，我们解构category元素，将title重命名为categoryTitle常量，然后迭代items数组以获取该类别中的所有书籍。为此，我们需要解构book元素，为rating设置默认值"No rating"，并返回returnedBook常量：

const { title: categoryTitle, items } = category;
const books = items.map((book) => {
  const { title, link, rating = "No rating", original_price, price, thumbnail, product_id } = book;
  const returnedBook = {
    title,
    link,
    rating,
    price,
    thumbnail,
    movieId: product_id,
  };
  if (original_price) returnedBook.originalPrice = original_price;
  return returnedBook;
});

之后，我们运行getResults函数，并使用console.dir方法在控制台中打印所有接收到的信息，该方法允许您通过带有必要参数的对象来更改默认输出选项：

getResults().then((result) => console.dir(result, { depth: null }));

输出

{
   "New releases":[
      {
         "title":"The Golden Enclaves: A Novel",
         "link":"https://play.google.com/store/books/details/Naomi_Novik_The_Golden_Enclaves?id=7qBSEAAAQBAJ",
         "rating":4.7,
         "price":"$13.99",
         "thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/7qBSEAAAQBAJ?fife=w256-h256",
         "movieId":"7qBSEAAAQBAJ"
      },
        ... and other results
   ],
   "Advice for a better life":[
      {
         "title":"How to Stop Feeling Like Sh*t: 14 Habits that Are Holding You Back from Happiness",
         "link":"https://play.google.com/store/books/details/Andrea_Owen_How_to_Stop_Feeling_Like_Sh_t?id=ekfiDQAAQBAJ",
         "rating":4.4,
         "price":"$11.99",
         "thumbnail":"https://books.google.com/books/publisher/content/images/frontcover/ekfiDQAAQBAJ?fife=w256-h256",
         "movieId":"ekfiDQAAQBAJ"
      },
        ... and other results
   ],
   ... and other categories
}

链接

如果您想查看一些用serpapi制定的项目，write me a message。

加入我们的Twitter | YouTube

添加Feature Request💫或Bug🐞