将要抓取的内容
完整代码
如果您不需要解释,可以直接查看在线 IDE 中的完整代码示例(the full code example in the online IDE)。
const cheerio = require("cheerio");
const axios = require("axios");
// ID of the product you want to get the results for.
const productId = "8757849604759505625";

// Options passed to axios.get together with the request URL.
const AXIOS_OPTIONS = {
  // Sending a browser User-Agent header is one way to prevent the request from being blocked.
  headers: {
    "User-Agent":
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
  },
  params: {
    // Language to use for the Google search.
    hl: "en",
    // Country to use for the Google search.
    gl: "us",
  },
};
/**
 * Parses the leading integer of an aria-label such as "1,234,567 reviews".
 *
 * @param {string|undefined} label - attribute value; may be missing
 * @returns {number} the parsed count, or NaN when the label has no leading number
 */
function parseCount(label) {
  // Remove every thousands separator: replacing only the first comma would
  // truncate "1,234,567" to 1234.
  return Number.parseInt(String(label ?? "").replace(/,/g, ""), 10);
}

/**
 * Prefixes a site-relative href with the google.com origin.
 *
 * @param {string|undefined} href - value of an href attribute; may be missing
 * @returns {string} the absolute URL, or "" when there is no href
 */
function toGoogleUrl(href) {
  // Without this guard a missing href would produce "https://www.google.comundefined".
  return href ? `https://www.google.com${href}` : "";
}

/**
 * Requests the offers page of `productId` with `AXIOS_OPTIONS` and extracts
 * the product summary, the seller rows and the related products from the HTML.
 *
 * @returns {Promise<object>} object with `title`, `reviews` (0 when absent),
 *   `rating` (NaN when the aria-label is absent), `onlineSellers` and
 *   `reletedProducts`; the promise rejects when the HTTP request fails
 */
async function getSellersInfo() {
  const { data } = await axios.get(`https://www.google.com/shopping/product/${productId}/offers`, AXIOS_OPTIONS);
  const $ = cheerio.load(data);
  // Trimmed text of whatever `selector` matches below `root`.
  const textOf = (root, selector) => root.find(selector).text().trim();

  return {
    title: $(".BvQan").text().trim(),
    reviews: parseCount($(".HiT7Id > span").attr("aria-label")) || 0,
    rating: Number.parseFloat($(".UzThIf").attr("aria-label")),
    onlineSellers: Array.from($(".sh-osd__offer-row")).map((el) => {
      const row = $(el);
      return {
        // Two alternative selectors are tried for the name and for the link.
        name: textOf(row, ".b5ycib") || textOf(row, ".kjM2Bf"),
        link: toGoogleUrl(row.find(".b5ycib").attr("href") || row.find(".pCKrrc > a").attr("href")),
        basePrice: textOf(row, ".g9WBQb"),
        additionalPrice: {
          shipping: textOf(row, ".SuutWb tr:nth-child(2) td:last-child"),
          tax: textOf(row, ".SuutWb tr:nth-child(3) td:last-child"),
        },
        totalPrice: textOf(row, ".SuutWb tr:last-child td:last-child"),
        // Rows without condition text are reported as "New".
        condition: textOf(row, ".Yy9sbf") || "New",
      };
    }),
    // The key keeps its original spelling so existing consumers of the result still work.
    reletedProducts: Array.from($(".xyjbB")).map((el) => {
      const card = $(el);
      const titleLink = card.find(".YTkbnd");
      return {
        title: titleLink.text().trim(),
        link: toGoogleUrl(titleLink.attr("href")),
        price: textOf(card, ".vzbr7d"),
        reviews: parseCount(card.find(".HiT7Id span").attr("aria-label")) || "No reviews",
        rating: Number.parseFloat(card.find(".UzThIf").attr("aria-label")) || "No rating",
      };
    }),
  };
}
// Run the scraper and print the whole result tree. A failure is reported on
// stderr and through the exit code instead of becoming an unhandled rejection.
getSellersInfo()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(`Failed to get sellers info: ${error?.message ?? error}`);
    process.exitCode = 1;
  });
准备
首先,我们需要创建一个 Node.js* 项目,并添加 npm 软件包 cheerio(用于解析 HTML 标记的各个部分)和 axios(用于向网站发出请求)。
为此,在我们项目的目录中,打开命令行并输入:
$ npm init -y
,然后:
$ npm i cheerio axios
*如果您尚未安装 Node.js,可以从 nodejs.org 下载(download it from nodejs.org),并按照安装文档(documentation)进行操作。
Process
首先,我们需要从 HTML 元素中提取数据。借助 SelectorGadget Chrome extension,获取合适的 CSS 选择器非常容易:只需在浏览器中单击所需的元素,就能得到对应的 CSS 选择器。但是,它并不总是完美地工作,尤其是当网站大量使用 JavaScript 时。
如果您想进一步了解 CSS 选择器,我们在 SerpApi 上有一篇专门的博客文章《Web Scraping with CSS Selectors》。
下面的GIF说明了选择结果不同部分的方法。
代码说明
const cheerio = require("cheerio");
const axios = require("axios");
接下来,我们编写产品 ID 和请求选项:带有 User-Agent 的 HTTP headers(用于模拟“真实”用户访问),以及发出请求所需的参数:
// ID of the product you want to get the results for.
const productId = "8757849604759505625";

// Options passed to axios.get together with the request URL.
const AXIOS_OPTIONS = {
  // Sending a browser User-Agent header is one way to prevent the request from being blocked.
  headers: {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36",
  },
  params: {
    // Language to use for the Google search.
    hl: "en",
    // Country to use for the Google search.
    gl: "us",
  },
};
📌注意:axios 请求默认的 user-agent 是 axios/<axios_version>,因此网站能够识别出这是一个发送请求的脚本,并可能将其阻止。您可以查看自己的 user-agent 是什么(Check what's your user-agent)。
接下来,我们编写一个函数,用于发出请求并返回接收到的数据。我们收到 axios 请求的响应,其中包含 data 键,我们将其解构(destructured)出来,并用 cheerio 对其进行解析:
function getSellersInfo() {
return axios
.get(`https://www.google.com/shopping/product/${productId}/offers`, AXIOS_OPTIONS)
.then(function ({ data }) {
let $ = cheerio.load(data);
...
})
}
接下来,我们需要使用以下方法获取页面的不同部分:
title: $(".BvQan")?.text().trim(),
reviews: parseInt($(".HiT7Id > span")?.attr("aria-label")?.replace(",", "") || 0),
rating: parseFloat($(".UzThIf")?.attr("aria-label")),
onlineSellers: Array.from($(".sh-osd__offer-row")).map((el) => ({
name:
$(el).find(".b5ycib")?.text().trim() ||
$(el).find(".kjM2Bf")?.text().trim(),
link:
`https://www.google.com${$(el).find(".b5ycib")?.attr("href") ||
$(el).find(".pCKrrc > a")?.attr("href")}`,
basePrice: $(el).find(".g9WBQb")?.text().trim(),
additionalPrice: {
shipping: $(el).find(".SuutWb tr:nth-child(2) td:last-child")?.text().trim(),
tax: $(el).find(".SuutWb tr:nth-child(3) td:last-child")?.text().trim(),
},
totalPrice: $(el).find(".SuutWb tr:last-child td:last-child")?.text().trim(),
condition: $(el).find(".Yy9sbf")?.text().trim() || "New",
})),
reletedProducts: Array.from($(".xyjbB")).map((el) => ({
title: $(el).find(".YTkbnd")?.text().trim(),
link: `https://www.google.com${$(el).find(".YTkbnd")?.attr("href")}`,
price: $(el).find(".vzbr7d")?.text().trim(),
reviews:
parseInt($(el).find(".HiT7Id span")?.attr("aria-label")?.replace(",", "")) ||
"No reviews",
rating: parseFloat($(el).find(".UzThIf")?.attr("aria-label")) || "No rating",
})),
现在我们可以启动我们的解析器:
$ node YOUR_FILE_NAME # YOUR_FILE_NAME is the name of your .js file
输出
{
"title":"Apple iPhone 14 Pro Max - 128 GB - Space Black - Unlocked",
"reviews":748,
"rating":4.5,
"onlineSellers":[
{
"name":"AppleOpens in a new window",
"link":"https://www.google.com/url?q=https://www.apple.com/us/shop/go/product/MQ8N3%3Fcppart%3DUNLOCKED%26cid%3Daos-us-seo-pla&sa=U&ved=0ahUKEwj18pvWgp_7AhW6BjQIHasACEEQ2ykIJQ&usg=AOvVaw22XYRR7KYv5JrrFHZOKXPK",
"basePrice":"$1,099.00",
"additionalPrice":{
"shipping":"$0.00",
"tax":"$97.54"
},
"totalPrice":"$1,196.54",
"condition":"New"
},
... and other sellers
],
"reletedProducts":[
{
"title":"iPhone 13 Pro Max 128GB Sierra ...",
"link":"https://www.google.com/shopping/product/10665434407022887951?hl=en&gl=us&prds=epd:17054172175953313994,oid:17054172175953313994,pid:8842852891481692870,rsk:PC_8217023720749633348&sa=X&ved=0ahUKEwj18pvWgp_7AhW6BjQIHasACEEQrRIIbQ",
"price":"$0.00",
"reviews":11327,
"rating":4.5
},
... and other products
]
}
使用 SerpApi 的 Google Product Online Sellers API
本节是为了显示DIY解决方案与我们的解决方案之间的比较。
最大的区别是您不需要从头开始创建解析器并维护它。
Google 也有可能在某个时候阻止请求,我们会在后端处理这种情况,因此您无需自己弄清楚如何应对,也无需弄清楚该使用哪个验证码(CAPTCHA)服务或代理提供商。
首先,我们需要安装 google-search-results-nodejs:
npm i google-search-results-nodejs
这是full code example,如果您不需要说明:
const SerpApi = require("google-search-results-nodejs");
// Create the GoogleSearch instance; your API key from serpapi.com is read from the API_KEY environment variable.
const search = new SerpApi.GoogleSearch(process.env.API_KEY);
// Parameters passed to search.json for the request.
const params = {
  // ID of the product you want to get the results for.
  product_id: "8757849604759505625",
  // Search engine.
  engine: "google_product",
  // Device to use to get the results: "desktop" (default), "tablet", or "mobile".
  device: "desktop",
  // Language to use for the Google search.
  hl: "en",
  // Country to use for the Google search.
  gl: "us",
  // Fetch offers results.
  offers: true,
};
/**
 * Wraps the callback-based `search.json` call in a Promise.
 *
 * @returns {Promise<*>} resolves with the value `search.json` hands to its callback
 */
const getJson = () =>
  new Promise((onResults) => {
    search.json(params, onResults);
  });
/**
 * Awaits `getJson()` and reshapes the response: the members of
 * `product_results` are spread into the result, alongside the online sellers
 * and the related products of a different brand.
 *
 * @returns {Promise<object>} the flattened result object
 */
const getResults = async () => {
  const { product_results, sellers_results, related_products } = await getJson();
  return {
    ...product_results,
    onlineSellers: sellers_results?.online_sellers,
    reletedProducts: related_products?.different_brand,
  };
};
// Fetch the results and print the whole result tree. A failure is reported on
// stderr and through the exit code instead of becoming an unhandled rejection.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(`Failed to get results: ${error?.message ?? error}`);
    process.exitCode = 1;
  });
代码说明
首先,我们需要从 google-search-results-nodejs 库中声明 SerpApi,并使用您在 SerpApi 的 API 密钥定义新的 search 实例:
const SerpApi = require("google-search-results-nodejs");
// Your API key from serpapi.com is read from the API_KEY environment variable.
const search = new SerpApi.GoogleSearch(process.env.API_KEY);
接下来,我们为提出请求的必要参数编写:
// Parameters passed to search.json for the request.
const params = {
  // ID of the product you want to get the results for.
  product_id: "8757849604759505625",
  // Search engine.
  engine: "google_product",
  // Device to use to get the results: "desktop" (default), "tablet", or "mobile".
  device: "desktop",
  // Language to use for the Google search.
  hl: "en",
  // Country to use for the Google search.
  gl: "us",
  // Fetch offers results.
  offers: true,
};
接下来,我们从Serpapi库中包装搜索方法,以便进一步处理搜索结果:
/**
 * Wraps the callback-based `search.json` call in a Promise.
 *
 * @returns {Promise<*>} resolves with the value `search.json` hands to its callback
 */
const getJson = () =>
  new Promise((onResults) => {
    search.json(params, onResults);
  });
最后,我们声明函数 getResults,它从页面获取数据并将其返回:
const getResults = async () => {
...
};
在此函数中,我们获取带有结果的 json,并使用展开语法(spread syntax)返回包含 json 数据的对象:
const json = await getJson();
return {
...json.product_results,
onlineSellers: json.sellers_results?.online_sellers,
reletedProducts: json.related_products?.different_brand
};
之后,我们运行 getResults 函数,并使用 console.dir 方法在控制台中打印所有接收到的信息,该方法允许您通过带有必要参数的对象来更改默认输出选项:
// Fetch the results and print the whole result tree. A failure is reported on
// stderr and through the exit code instead of becoming an unhandled rejection.
getResults()
  .then((result) => console.dir(result, { depth: null }))
  .catch((error) => {
    console.error(`Failed to get results: ${error?.message ?? error}`);
    process.exitCode = 1;
  });
输出
{
"product_id":8757849604759506000,
"title":"Apple iPhone 14 Pro Max - 128 GB - Space Black - Unlocked",
"reviews":748,
"rating":4.5,
"onlineSellers":[
{
"position":1,
"name":"Apple",
"link":"https://www.google.com/url?q=https://www.apple.com/us/shop/go/product/MQ8N3%3Fcppart%3DUNLOCKED%26cid%3Daos-us-seo-pla&sa=U&ved=0ahUKEwiMl-6jip_7AhUZLUQIHTI4DPoQ2ykIJQ&usg=AOvVaw1NkUFFfa7AWk6BcJQut1jp",
"base_price":"$1,099.00",
"additional_price":{
"shipping":"$0.00",
"tax":"$85.17"
},
"total_price":"$1,184.17"
},
... and other sellers
],
"reletedProducts":[
{
"title":"iPhone 13 Pro Max 128GB Sierra ...",
"link":"https://www.google.com/shopping/product/10665434407022887951?hl=en&gl=us&ie=UTF-8&prds=epd:17054172175953313994,oid:17054172175953313994,pid:8842852891481692870,rsk:PC_8217023720749633348&sa=X&ved=0ahUKEwiMl-6jip_7AhUZLUQIHTI4DPoQrhIIaA",
"price":"$0.00"
},
... and other products
]
}
链接
如果您想查看一些使用 SerpApi 制作的项目,请给我留言(write me a message)。
添加一个 Feature Request💫 或 Bug🐞