Vestiaire Collective是法国的豪华时尚转售平台。这是一个流行的网络刮擦目标,因为它是豪华时尚物品最大的二手市场之一。
在本教程中,我们将快速研究如何使用Python刮擦Vestiaire集体。在本指南中,我们将介绍:
- 刮擦Vestiaire集体产品清单数据。
- 使用Vestiaire Collective Sitemaps查找产品清单。
这是一个非常简单的刮板,因为我们将使用hidden web data scraping毫不费力地收集产品和卖方数据。
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
为什么要刮擦Vestiaire集体?
Vestiaire Collective是奢侈品时装物品的主要交流。刮擦该网站的原因很有用:
- 奢侈品市场分析
- 竞争性分析
- 市场预测
有关网络刮擦用途的更多信息,请参见我们的web scraping use case hub。
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
刮擦预览
我们将在Vestiaire Collective上删除整个产品数据集,其中包括:
- 产品详细信息,例如名称,描述和功能。
- 产品媒体(照片,视频)。
- 产品定价。
- 卖方详细信息。
这是一个示例数据集,我们将使用我们的Python刮刀收集:
示例产品数据集
{
"id": "32147447",
"type": "product",
"name": "Sweatshirt",
"price": {
"currency": "CAD",
"cents": 23033,
"formatted": "CDN$230.33"
},
"isLocal": true,
"description": "Worn once anine bing tiger sweatshirt sz M in excellent condition",
"likeCount": 3,
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml",
"sold": false,
"reserved": false,
"negotiable": true,
"inStock": false,
"measurementFormatted": "Size: 8 US",
"receipt": false,
"available": true,
"consignment": false,
"prohibited": false,
"localizedDescription": "Worn once anine bing tiger sweatshirt sz M in excellent condition",
"originalDescription": "Worn once anine bing tiger sweatshirt sz M in excellent condition",
"originalDescriptionLanguage": "en",
"metadata": {
"title": "Sweatshirt Anine Bing Beige size 8 US in Cotton - 32147447",
"description": "Buy your sweatshirt Anine Bing on Vestiaire Collective, the luxury consignment store online. Second-hand Sweatshirt Anine Bing Beige in Cotton available. 32147447",
"keywords": "Anine Bing Cotton Knitwear"
},
"warehouse": {
"name": "Brooklyn",
"localizedName": "Brooklyn"
},
"pictures": [
{
"alt": "Sweatshirt Anine Bing",
"path": "32147447-1_2.jpg"
},
{
"alt": "Buy Anine Bing Sweatshirt online",
"path": "32147447-2_2.jpg"
},
{
"alt": "Luxury Anine Bing Knitwear Women ",
"path": "32147447-3_2.jpg"
},
{
"alt": "Second hand Clothing Women ",
"path": "32147447-4_2.jpg"
},
{
"alt": "Sweatshirt Anine Bing",
"path": "32147447-5_2.jpg"
}
],
"size": {
"id": "7",
"type": "size",
"size": "8",
"standard": "US",
"localizedStandard": "US"
},
"brand": {
"id": "5344",
"type": "brand",
"name": "Anine Bing",
"localizedName": "anine bing",
"url": {
"original": "http://vestiairecollective.com/anine-bing/",
"path": "/anine-bing/",
"url": "http://vestiairecollective.com/anine-bing/"
}
},
"material": {
"id": "2",
"type": "material",
"name": "Cotton",
"localizedName": "Cotton"
},
"color": {
"id": "2",
"type": "color",
"name": "Beige",
"localizedName": "Beige"
},
"condition": {
"id": "",
"type": "condition",
"description": "Very good condition"
},
"universe": {
"id": "1",
"type": "universe",
"name": "Women",
"localizedName": "Women"
},
"category": {
"id": "56",
"type": "category",
"name": "Knitwear",
"localizedName": "Knitwear",
"parent": {
"id": "2",
"type": "category",
"name": "Clothing",
"localizedName": "Clothing"
}
},
"subcategory": {
"id": "17",
"type": "subcategory",
"name": "Sweatshirts",
"localizedName": "Sweatshirts"
},
"season": {
"id": "3",
"type": "season",
"name": "All seasons",
"localizedName": "All seasons"
},
"model": {
"id": "0",
"type": "model",
"name": "",
"localizedName": ""
},
"seller": {
"id": "9797796",
"type": "user",
"firstname": "kate",
"username": "kate9797796",
"hyperwalletActive": false,
"alreadyDepositedAProduct": false,
"mood": "",
"country": "United States",
"countryISO": "US",
"civility": {
"name": "miss",
"localizedName": "miss",
"idGender": 3
},
"language": {
"name": "en",
"localizedName": "en",
"code": "en"
},
"hasWallet": false,
"badges": [
"recommended",
"direct-shipping",
"expert-seller"
],
"statistics": {
"productsWished": 0,
"productsSold": 126,
"productsListed": 585,
"productsBought": 0,
"passRate": 90,
"usuallyShipsWithin": "1-2 days"
},
"sellerRating": {
"badge": "Expert",
"goals": {
"conformity": 1,
"cx": 0,
"shipping": 0.93,
"volume": 32,
"tags": {
"volume": true,
"shipping": true,
"conformity": true
}
},
"goalsThresholds": [
{
"category": "volume",
"max_value": 5,
"thresholds": [
{
"label": "Trusted",
"value": 2
},
{
"label": "Expert",
"value": 5
}
]
},
{
"category": "conformity",
"max_value": 1,
"thresholds": [
{
"label": "Trusted",
"value": 0.8
},
{
"label": "Expert",
"value": 0.9
}
]
},
{
"category": "shipping",
"max_value": 1,
"thresholds": [
{
"label": "Trusted",
"value": 0.8
},
{
"label": "Expert",
"value": 0.9
}
]
}
],
"achievementsGoals": [
{
"category": "volume",
"achievements": [
{
"badge": "Trusted"
},
{
"badge": "Expert"
}
],
"tip": "Achieved"
},
{
"category": "conformity",
"achievements": [
{
"badge": "Trusted"
},
{
"badge": "Expert"
}
],
"tip": "Achieved"
},
{
"category": "shipping",
"achievements": [
{
"badge": "Trusted"
},
{
"badge": "Expert"
}
],
"tip": "Achieved"
}
]
},
"picture": {
"path": "/profil/missing_avatar.gif"
},
"social": {
"nbFollowers": 225,
"nbFollows": 7,
"productsLiked": 331,
"communityRank": 6914,
"followed": false
},
"vacation": {
"active": false
},
"segment": "C2C"
},
"creationDate": "2023-03-30T20:34:48Z",
"meshLinks": {
"topCategory": {
"name": "Women Clothing",
"localizedName": "Women Clothing",
"url": {
"url": "http://vestiairecollective.com//women-clothing/",
"path": "/women-clothing/"
}
},
"category": {
"name": "Knitwear",
"localizedName": "Knitwear",
"url": {
"url": "http://vestiairecollective.com//women-clothing/knitwear/",
"path": "/women-clothing/knitwear/"
}
},
"categoryBrand": {
"name": "Anine Bing Knitwear",
"localizedName": "Anine Bing Knitwear",
"url": {
"url": "http://vestiairecollective.com//women-clothing/knitwear/anine-bing/",
"path": "/women-clothing/knitwear/anine-bing/"
}
},
"categoryBrandModelMaterial": {
"name": "Anine Bing Cotton Knitwear",
"localizedName": "Anine Bing Cotton Knitwear",
"url": {
"url": "http://vestiairecollective.com//women-clothing/knitwear/anine-bing/cotton/",
"path": "/women-clothing/knitwear/anine-bing/cotton/"
}
}
},
"alternateVersions": [
{
"language": "de",
"path": "/damen-kleidung/pullover/anine-bing/beige-baumwolle-anine-bing-pullover-32147447.shtml"
},
{
"language": "x-default",
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml"
},
{
"language": "us",
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml"
},
{
"language": "en",
"path": "/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml"
},
{
"language": "es",
"path": "/mujer-ropa/jerseis-chalecos/anine-bing/jerseis-chalecos-anine-bing-de-algodon-beige-32147447.shtml"
},
{
"language": "fr",
"path": "/vetements-femme/pulls-gilets/anine-bing/pullgilet-anine-bing-en-coton-beige-32147447.shtml"
},
{
"language": "it",
"path": "/donna-abbigliamento/maglioni-gilet/anine-bing/maglioni-gilet-anine-bing-beige-cotone-32147447.shtml"
}
],
"shouldBeGone": false,
"indexation": {
"index": true,
"follow": true,
"crawlPagination": false
},
"buyerFees": [
{
"rateType": "FLAT",
"value": 2500,
"description": "",
"cost": {
"currency": "CAD",
"cents": 2500,
"formatted": "CDN$25"
}
}
],
"dutyAndTax": {
"currency": "CAD",
"cents": 0,
"formatted": "CDN$0"
},
"flags": [
"direct-shipping"
]
}
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
设置
要刮擦这个目标,我们需要一些在Web刮擦中使用的Python软件包。由于我们将使用hidden web data scraping方法,我们需要的只是两个软件包:
可以使用Python的pip
台命令来安装这些软件包:
$ pip install httpx parsel
对于Scrapfly users,每个代码示例也有一个Scrapfly SDK版本。 SDK也可以使用pip
安装:
$ pip install "scrapfly-sdk[all]"
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
刮擦产品数据
让我们开始查看单个产品页面,以及如何使用Python刮擦它。例如,让我们以此产品页面:
/beige-cotton-anine-bing-knitwear-32147447.shtml
我们可以使用CSS selectors或XPath分析页面HTML,但是由于Verstiaire Collective正在使用Next.js JavaScript框架,我们可以直接从页面源提取数据集:
我们可以通过检查页面源并搜索唯一的产品标识符(例如产品名称或ID,使用CTRL+F)来找到这一点。在上面的示例中,我们可以看到它位于<script id="__NEXT_DATA__">
HTML元素下。
这称为隐藏的Web数据刮擦,这是一种从使用JavaScript框架(例如Next.js)的网站上刮下数据的非常简单有效的方法。要刮擦我们必须做的所有事情:
- 检索产品HTML页面。
- 使用CSS Selectors和
parsel
找到隐藏的JSON数据集。 - 使用
json.loads
加载json作为python词典。 - 选择产品字段。
在实用的python中,这看起来像这样:
Python
刮擦
import asyncio
import json

import httpx
from parsel import Selector

# HTTP client with default headers that mimic a real web browser, with HTTP/2
# enabled to reduce the chance of being blocked.
# Fixes vs. original: "Mozilla/5.0" (Chrome UAs are never 4.0) and a valid
# Accept quality value "q=0.9" (the original "q=-1.9" is not a legal q-value).
client = httpx.AsyncClient(
    follow_redirects=True,
    http2=True,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    },
)


def find_hidden_data(html: str) -> dict:
    """Extract the hidden Next.js web dataset (__NEXT_DATA__) from page HTML.

    Raises json.JSONDecodeError/TypeError if the page has no __NEXT_DATA__ script.
    """
    # The selector must be written WITHOUT spaces; the original
    # "script# __NEXT_DATA__ ::text" never matches and returns None.
    data = Selector(html).css("script#__NEXT_DATA__::text").get()
    return json.loads(data)


async def scrape_product(url: str) -> dict:
    """Scrape a single Vestiaire Collective product page for its product dataset."""
    # retrieve page HTML
    response = await client.get(url)
    # find hidden web data
    data = find_hidden_data(response.text)
    # extract only product data from the page dataset
    return data["props"]["pageProps"]["product"]


# example scrape run (guarded so importing this module performs no network I/O):
if __name__ == "__main__":
    print(asyncio.run(scrape_product("https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml")))
import asyncio
import json

from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY", max_concurrency=10)


def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """Extract the hidden Next.js dataset (__NEXT_DATA__) from the scraped page."""
    # The selector must be written WITHOUT spaces; the original
    # "script# __NEXT_DATA__ ::text" never matches and returns None.
    data = result.selector.css("script#__NEXT_DATA__::text").get()
    return json.loads(data)


async def scrape_product(url: str) -> dict:
    """Scrape a single Vestiaire Collective product page for product data."""
    result = await scrapfly.async_scrape(ScrapeConfig(
        url=url,
        cache=True,  # use cache while developing to speed up scraping for repeated script runs
        asp=True,    # Anti-Scraping Protection bypass allows to scrape protected pages
    ))
    data = find_hidden_data(result)
    return data["props"]["pageProps"]["product"]


# example run of 1 product scrape
# (original was missing a closing parenthesis here — a SyntaxError)
if __name__ == "__main__":
    print(asyncio.run(scrape_product("https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml")))
在仅几行Python代码中,我们提取了整个产品数据集,其中包括所有产品详细信息和卖方信息!
接下来,让我们看一下如何使用Vestiaire Collective Sitemaps找到产品清单。
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
寻找产品
Vestiaire Collective拥有广泛的站点套件,可用于查找所有产品清单。因此,要找到产品页面,我们将成为scraping sitemaps。
Vestiaire Collective Sitemaps可用:
/sitemaps/https_sitemap-en.xml
包含分为各种类别的站点地图,例如品牌,新列表,物品类型(服装,鞋子):
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<!-- sitemap url and category clues, this one is for brands -->
<loc>https://www.vestiairecollective.com/sitemaps/https_en-brands-1.xml</loc>
<!-- when the sitemap was updated -->
<lastmod>2023-04-07</lastmod>
</sitemap>
<sitemap>
<loc>https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml</loc>
<lastmod>2023-04-07</lastmod>
</sitemap>
...
</sitemapindex>
这些站点中的每个站点都包含50 000个产品清单。
在我们的示例中,让我们刮擦可以在new_items.xml
站点地点上找到的最新列表。
new_items-1.xml
站点地图包含最新的50_000个项目。让我们看看如何刮擦它:
Python
刮擦
import asyncio
import json
from typing import Dict, List

import httpx
from parsel import Selector

# HTTP client with browser-like defaults; see notes below on the fixed
# User-Agent ("Mozilla/5.0") and Accept quality value ("q=0.9" — the
# original "q=-1.9" is not a legal q-value).
client = httpx.AsyncClient(
    follow_redirects=True,
    http2=True,
    headers={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
    },
)


def find_hidden_data(html: str) -> dict:
    """Extract the hidden Next.js web dataset (__NEXT_DATA__) from page HTML."""
    # The selector must be written WITHOUT spaces; the original
    # "script# __NEXT_DATA__ ::text" never matches and returns None.
    data = Selector(html).css("script#__NEXT_DATA__::text").get()
    return json.loads(data)


async def scrape_product(url: str):
    """Scrape one product page; returns None if the product is no longer available."""
    # retrieve page HTML
    response = await client.get(url)
    # catch products that are no longer available as they redirect to 308
    for redirect in response.history:
        if redirect.status_code == 308:
            print(f"product {redirect.url} is no longer available")
            return None
    # find hidden web data
    data = find_hidden_data(response.text)
    # extract only product data from the page dataset
    return data["props"]["pageProps"]["product"]


async def scrape_sitemap(url: str, max_pages: int = 100) -> List[Dict]:
    """Scrape Vestiaire Collective sitemap for products"""
    # retrieve sitemap
    print(f"scraping sitemap page: {url}")
    response_sitemap = await client.get(url)
    product_urls = Selector(response_sitemap.text).css("url>loc::text").getall()
    print(f"found {len(product_urls)} products in the sitemap: {url}\n scraping the first {max_pages} products")
    # scrape products concurrently using asyncio
    product_scrapes = [asyncio.create_task(scrape_product(url)) for url in product_urls[:max_pages]]
    return await asyncio.gather(*product_scrapes)


# example scrape run (guarded so importing this module performs no network I/O):
if __name__ == "__main__":
    print(asyncio.run(scrape_sitemap("https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml", max_pages=5)))
import asyncio
import json
from typing import Dict, List  # original used List/Dict without importing them

from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

scrapfly = ScrapflyClient(key="YOUR SCRAPFLY KEY", max_concurrency=10)


def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """Extract the hidden Next.js dataset (__NEXT_DATA__) from the scraped page.

    The original snippet called this helper without defining it; it is included
    here so the example is self-contained.
    """
    data = result.selector.css("script#__NEXT_DATA__::text").get()
    return json.loads(data)


async def scrape_sitemap(url: str, max_pages: int = 100) -> List[Dict]:
    """Scrape Vestiaire Collective sitemap for products"""
    print(f"scraping sitemap page: {url}")
    result_sitemap = await scrapfly.async_scrape(ScrapeConfig(url=url, asp=True))
    product_urls = result_sitemap.selector.css("url>loc::text").getall()
    print(f"found {len(product_urls)} products in the sitemap: {url}\n scraping the first {max_pages} products")
    product_pages = [ScrapeConfig(url=url, asp=True) for url in product_urls[:max_pages]]
    products = []
    async for result in scrapfly.concurrent_scrape(product_pages):
        # Vestiaire Collective redirects to product category if product is no longer available (sold, deleted etc.)
        if any(redirect['http_code'] == 308 for redirect in result.context['redirects']):
            print(f"Product page {result.scrape_config.url} is no longer available")
            continue
        data = find_hidden_data(result)
        products.append(data['props']['pageProps']['product'])
    return products


# example scrape: scrape the first 10 newest listings
# (original called a non-existent "scrape_sitemaps" — fixed to scrape_sitemap)
if __name__ == "__main__":
    asyncio.run(scrape_sitemap("https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml", max_pages=10))
上面,我们使用parsel
使用了简单的XML解析来从新列表站点地图中提取URL。然后,我们像上一章一样刮擦每个产品的隐藏网络数据。
站点地图非常适合快速有效地找到刮擦目标。虽然为了进一步扩展我们的刮板,让我们看一下如何避免使用刮夹SDK阻止。
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
避免用刮擦蝇阻塞
Vestiaire集体可能很难大规模刮擦,因为它使用Cloudflare anti-scraping service阻止了网络刮擦。因此,要扩大我们的刮板,我们需要使用代理或other tools to avoid scraper blocking或刮擦API。
刮擦API是扩展网络刮刀并避免被阻止的理想工具。这是我们在本指南中使用的工具的替换,并带有刮刀加电功能,例如:
- Millions of Residential Proxies
- Anti Scraping Protection bypass
- Javascript rendering and headless cloud browsers
- Web dashboard for monitoring and managing scrapers
所有这些工具都可以通过Python SDK轻松访问:
from scrapfly import ScrapeConfig, ScrapflyClient

client = ScrapflyClient(key="YOUR SCRAPFLY KEY")
result = client.scrape(ScrapeConfig(
    url="https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml",
    # enable scraper blocking service bypass
    asp=True,  # original was missing this comma, which is a SyntaxError
    # optional - render javascript using headless browsers:
    render_js=True,
))
print(result.content)
有关Web刮擦的更多信息,请参阅Full Scraper Code部分。
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
常问问题
要结束我们有关如何刮擦Vestiaire Collective的指南,让我们看看一些常见问题。
刮擦Vestiaire集体是合法的吗?
是。我们在本教程中刮擦的所有数据都可以公开获得,这是完全合法的。但是,使用刮擦卖方数据时应引起注意,因为它可以受到欧洲GDPR或版权的保护。
Vestiaire集体可以被爬行吗?
是。爬行是网络刮擦的一种形式,刮板可以在其中发现产品列表,而Vestiaire Collective提供了许多发现点,例如建议,搜索和站点地图。
概括
在这个快速教程中,我们研究了如何使用Python刮擦Vestiaire集体。我们介绍了如何使用隐藏的Web数据刮擦方法来快速从HTML页面提取产品数据集。为了找到产品,我们已经介绍了如何使用站点地图快速按类别收集所有产品列表。
为避免阻塞,我们已经看了查看刮擦API缩放解决方案,该解决方案可用于扩展您的刮擦项目以在几分钟之内收集这样的公共数据集!
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
Get Your FREE API KeyDiscover ScrapFly
<! - kg-card-end:markdown-> <! - kg-card-begin:markdown->
完整的刮板代码
这是使用python和scrapfly python sdk的完整Vestiaire集体产品刮板:
ð此代码仅应用作参考。要大规模从Vestiaire Collective刮下数据,您需要将其调整为您的偏好和环境
import asyncio
import json
import os
from pathlib import Path
from typing import Dict, List

from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

scrapfly = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"], max_concurrency=10)


def find_hidden_data(result: ScrapeApiResponse) -> dict:
    """Extract the hidden Next.js dataset (__NEXT_DATA__) from the scraped page."""
    # The selector must be written WITHOUT spaces; the original
    # "script# __NEXT_DATA__ ::text" never matches and returns None.
    data = result.selector.css("script#__NEXT_DATA__::text").get()
    return json.loads(data)


async def scrape_product(url: str) -> dict:
    """Scrape a single Vestiaire Collective product page for product data."""
    result = await scrapfly.async_scrape(
        ScrapeConfig(
            url=url,
            cache=True,  # use cache while developing to speed up repeated runs
            asp=True,    # Anti-Scraping Protection bypass
        )
    )
    data = find_hidden_data(result)
    return data["props"]["pageProps"]["product"]


async def scrape_sitemap(url: str, max_pages: int = 100) -> List[Dict]:
    """Scrape Vestiaire Collective sitemap for products"""
    print(f"scraping sitemap page: {url}")
    result_sitemap = await scrapfly.async_scrape(ScrapeConfig(url=url, asp=True))
    product_urls = result_sitemap.selector.css("url>loc::text").getall()
    print(f"found {len(product_urls)} products in the sitemap: {url}\n scraping the first {max_pages} products")
    product_pages = [ScrapeConfig(url=url, asp=True) for url in product_urls[:max_pages]]
    products = []
    async for result in scrapfly.concurrent_scrape(product_pages):
        # Vestiaire Collective redirects to product category if product is no longer available (sold, deleted etc.)
        if any(redirect["http_code"] == 308 for redirect in result.context["redirects"]):
            print(f"Product page {result.scrape_config.url} is no longer available")
            continue
        data = find_hidden_data(result)
        products.append(data["props"]["pageProps"]["product"])
    return products


async def example_run():
    """
    this example run will scrape example product and sitemap for 5 newest items
    save them to ./results/product.json and ./results/sitemap.json respectively
    """
    out_dir = Path(__file__).parent / "results"
    out_dir.mkdir(exist_ok=True)
    product = await scrape_product("https://www.vestiairecollective.com/women-clothing/knitwear/anine-bing/beige-cotton-anine-bing-knitwear-32147447.shtml")
    out_dir.joinpath("product.json").write_text(json.dumps(product, indent=2, ensure_ascii=False))
    search = await scrape_sitemap("https://www.vestiairecollective.com/sitemaps/https_en-new_items-1.xml", max_pages=5)
    out_dir.joinpath("sitemap.json").write_text(json.dumps(search, indent=2, ensure_ascii=False))


# Original guard compared against " __main__" (leading space inside the
# string), which never matches, so the example never ran; fixed below.
if __name__ == "__main__":
    asyncio.run(example_run())
<!-- kg-card-end:markdown --><!-- kg-card-begin:html -->
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "FAQPage",
  "mainEntity": [
    {
      "@type": "Question",
      "name": "刮擦Vestiaire Collective是合法的吗?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "是。我们在本教程中刮擦的所有数据都可以公开获得,这是完全合法的。但是,使用刮擦卖方数据时应引起注意,因为它可以受到欧洲GDPR或版权的保护。"
      }
    },
    {
      "@type": "Question",
      "name": "Vestiaire Collective可以被爬行吗?",
      "acceptedAnswer": {
        "@type": "Answer",
        "text": "是。爬行是网络刮擦的一种形式,刮板可以在其中发现产品列表,而Vestiaire Collective提供了许多发现点,例如建议,搜索和站点地图。"
      }
    }
  ]
}
</script>
<!-- kg-card-end:html -->