【发布时间】:2021-07-16 14:52:40
【问题描述】:
我正在尝试从 homedepot 抓取产品信息,例如价格、产品详细信息、规格、图片等。这些信息我都已经能够抓取,但我不知道在产品中选择不同的选项组合时该如何抓取价格,因为价格会随这些选项而变化。有什么方法可以为产品选项的每一种可能组合抓取价格和图片?
要更清楚地解释我的问题,请参阅该产品网址
您可以在图片右侧看到一些选项:每个参数都有多个可选值,选择不同的选项组合时,图片和价格都会随之变化。如果可行,我该如何抓取这些信息?
注意:我正在使用 selenium 和 BeautifulSoup
更新:
到目前为止,这是我在产品页面中抓取选项部分的代码
def scrape_price(self):
    """Copy the price details found in ``self.soup`` into ``self.data``.

    Three price layouts are tried in order; the first container that is
    present decides which one is used:

    1. ``price-format__main-price``: the text of the first two ``<span>``
       elements is concatenated, and a non-empty third span is appended
       after a ``"."``.
    2. ``price-detailed__left-price-row``: the text of the ``<span>`` nested
       inside the ``price-detailed__unit-price`` span is used as-is.
    3. ``pricingReg``: currency, dollars and cents spans are joined as
       ``<currency><dollars>.<cents>`` and ``'Availability'`` is set to
       ``"Available"``; when the container is absent ``'Availability'`` is
       set to ``"Not Available"``.

    Independently of the above, ``"Detailed Price"`` and
    ``"Case Unit Cover"`` are filled in when their containers exist.

    A container whose expected child elements are missing is skipped rather
    than raising ``IndexError``/``AttributeError``; in that case ``'price'``
    is simply not written.

    NOTE(review): ``'Availability'`` is only written when neither of the
    first two layouts is present. This mirrors the existing control flow;
    confirm that it is intended.
    """
    soup = self.soup
    detailed_class = (
        "price-detailed__left-price-wrapper price-detailed__left-price-row"
    )

    main_div = soup.find(
        "div",
        attrs={"class": "price-format__large price-format__main-price"},
    )
    # Looked up once: it drives both the second layout and "Detailed Price".
    detailed_div = soup.find("div", attrs={"class": detailed_class})

    if main_div is not None:
        parts = [span.text for span in main_div.findAll("span")]
        # Currency and dollars are both required; cents are optional.
        if len(parts) >= 2:
            price = parts[0] + parts[1]
            if len(parts) > 2 and parts[2] != "":
                price += "." + parts[2]
            self.data['price'] = price
    elif detailed_div is not None:
        unit_span = detailed_div.find(
            "span", attrs={"class": "price-detailed__unit-price"}
        )
        inner_span = unit_span.find("span") if unit_span is not None else None
        if inner_span is not None:
            self.data['price'] = inner_span.text
    else:
        reg_div = soup.find("div", attrs={"class": "pricingReg"})
        if reg_div is not None:
            pieces = [
                reg_div.find("span", attrs={"class": css_class})
                for css_class in (
                    "price__currency",
                    "price__dollars",
                    "price__cents",
                )
            ]
            if all(piece is not None for piece in pieces):
                currency, dollars, cents = (piece.text for piece in pieces)
                self.data['price'] = currency + dollars + "." + cents
            self.data['Availability'] = "Available"
        else:
            self.data['Availability'] = "Not Available"

    if detailed_div is not None:
        self.data["Detailed Price"] = cleanhtml(detailed_div.text)

    cover_div = soup.find(
        "div", attrs={"class": "price-detailed__unit-cover"}
    )
    if cover_div is not None:
        self.data["Case Unit Cover"] = cover_div.text
def scrape_images(self):
    """Collect product image URLs from ``self.soup`` into ``self.data["images"]``.

    Two gallery layouts are supported:

    * ``<button class="mediagallery__imgblock">`` elements, each expected to
      wrap an ``<img>``;
    * otherwise, a ``styles__ThumbnailList-...`` container holding
      ``styles__ThumbnailInner-...`` divs.

    In both layouts the ``src`` attribute of every ``<img>`` found is
    collected in document order. Entries without an ``<img>`` are skipped.
    If neither layout is present, ``self.data`` is left unchanged.
    """
    soup = self.soup

    gallery_buttons = soup.findAll(
        "button", attrs={'class': "mediagallery__imgblock"}
    )
    if gallery_buttons:
        images = []
        for button in gallery_buttons:
            img = button.find("img")
            # Same guard as the thumbnail layout below: a button without
            # an image is skipped instead of raising AttributeError.
            if img is not None:
                images.append(img.get("src"))
        self.data["images"] = images
        return

    thumbnail_list = soup.find(
        "div", attrs={"class": "styles__ThumbnailList-sc-10zajq9-5 gyXsdF"}
    )
    if thumbnail_list is None:
        return

    thumbnails = thumbnail_list.findAll(
        "div", attrs={"class": "styles__ThumbnailInner-sc-10zajq9-1 icLycq"}
    )
    images = []
    for thumbnail in thumbnails:
        img = thumbnail.find("img")
        if img is not None:
            images.append(img.get("src"))
    self.data["images"] = images
def scrape_options(self):
    """Collect the selectable product options from ``self.soup``.

    Two page layouts are handled:

    * ``super-sku``: every ``super-sku__inline-attribute`` div yields a dict
      with ``"Label"``/``"Value"`` (the cleaned label text split on ``":"``)
      and ``"Options"`` (either ``{"img", "label"}`` dicts for image
      swatches or plain text for tiles). The list is stored in
      ``self.data["Parameters"]``.
    * ``buybox__super-sku``: every direct child div yields a dict with
      optional ``"Label"``, ``"Value"`` and ``"choices"`` keys. The list is
      stored in ``self.data["options"]``.

    If neither container is present, ``self.data`` is left unchanged.

    Missing pieces no longer raise: a label without ``":"`` gives an empty
    ``"Value"``, a missing label div gives empty ``"Label"``/``"Value"``,
    and a text option without a nested ``<button>`` uses the option
    element's own text.

    NOTE(review): in the ``buybox__super-sku`` layout this assumes the
    choices lookup and the append of each option happen for every child
    div, whether or not a header was found, and that ``"choices"`` is set
    (possibly to an empty list) whenever a choices container exists.
    Confirm against the intended behaviour.
    """
    soup = self.soup

    sku_div = soup.find("div", attrs={"class": "super-sku"})
    if sku_div is not None:
        parameters = []
        attribute_divs = sku_div.findAll(
            "div", attrs={"class": "super-sku__inline-attribute"}
        )
        for param in attribute_divs:
            label_div = param.find("div", attrs={"class": "label"})
            label_text = cleanhtml(label_div.text) if label_div is not None else ""
            # The label is expected to read "<label>:<value>"; tolerate a
            # missing colon instead of raising IndexError.
            pieces = label_text.split(':')
            label = pieces[0]
            val = pieces[1] if len(pieces) > 1 else ""

            option_tags = param.findAll(
                "div", attrs={"class": "super-sku__inline-tile--space"}
            )
            if len(option_tags) == 0:
                option_tags = param.findAll(
                    "button", attrs={"class": "super-sku__inline-swatch"}
                )

            options = []
            for option_tag in option_tags:
                img = option_tag.find("img")
                if img is not None:
                    opt = {
                        "img": img.get("src"),
                        "label": img.get("title"),
                    }
                else:
                    # Swatches collected by the fallback above are already
                    # <button> elements, so there may be no nested button.
                    button = option_tag.find("button")
                    opt = (button if button is not None else option_tag).text
                options.append(opt)

            parameters.append({
                "Label": label,
                "Value": val,
                "Options": options,
            })
        self.data["Parameters"] = parameters
        return

    buybox_div = soup.find("div", attrs={"class": "buybox__super-sku"})
    if buybox_div is None:
        return

    options = []
    for option_div in buybox_div.find_all("div", recursive=False):
        option = {}

        # Two header variants exist; the "HeaderRow" one takes precedence.
        header_row = option_div.find(
            "div", attrs={"class": "styles__HeaderRow-fb29x6-1"}
        )
        header_plain = option_div.find(
            "div", attrs={"class": "styles__Header-sc-1gql1zk-0"}
        )
        header_div = header_row if header_row is not None else header_plain
        if header_div is not None:
            label_span = header_div.find(
                "span", attrs={"class": "styles__Label-sc-1gql1zk-1"}
            )
            if label_span is not None:
                option["Label"] = label_span.text
            value_span = header_div.find(
                "span", attrs={"class": "styles__Value-sc-1gql1zk-2"}
            )
            if value_span is not None:
                option["Value"] = value_span.text

        image_wrapper = option_div.find(
            "div",
            attrs={"class": "DefaultTemplate__FixedSizeChoiceImageWrapper-rpf825-0"},
        )
        tile_wrapper = option_div.find(
            "div", attrs={"class": "styles__TileSelectWrapper-jw86q8-1"}
        )
        list_wrapper = option_div.find(
            "div", attrs={"class": "product_sku_Overlay_ListBoxes"}
        )
        swatch_wrapper = option_div.find(
            "div", attrs={"class": "product_sku_Overlay_ColorSwtHolder"}
        )

        # Precedence: image choices, drop-down list, colour swatches, tiles.
        if image_wrapper is not None:
            choices = []
            for choice in image_wrapper.findAll(
                "div", attrs={"class": "styles__ChoiceImage-kykx13-4"}
            ):
                img = choice.find("img")
                if img is not None:
                    choices.append({
                        "img": img.get("src"),
                        "value": img.get("alt"),
                    })
            option["choices"] = choices
        elif list_wrapper is not None:
            choices = []
            for choice in list_wrapper.findAll(
                "span", attrs={"class": "drop-down__hover-effect"}
            ):
                link = choice.find("a")
                if link is not None:
                    choices.append(link.text)
            option["choices"] = choices
        elif swatch_wrapper is not None:
            choices = []
            for choice in swatch_wrapper.findAll(
                "li", attrs={"class": "styles__SwatchRoot-sc-1kr5yl9-1"}
            ):
                img = choice.find("img")
                if img is not None:
                    choices.append({
                        "img": img.get("src"),
                        "value": img.get("title"),
                    })
            option["choices"] = choices
        elif tile_wrapper is not None:
            option["choices"] = [
                tile.text
                for tile in tile_wrapper.findAll(
                    "div", attrs={"class": "styles__TileDiv-jw86q8-0"}
                )
            ]

        options.append(option)
    self.data["options"] = options
现在我想知道如何获取这些选项的每种组合的价格
【问题讨论】:
-
请给我们看看你的代码,并说明你具体做了什么、遇到了什么错误等等。
-
我没有遇到任何错误。我已经抓取了页面中的标题、价格、图片、规格、品牌等所有数据,甚至包括选项列表。我的问题是,我还想为每一种选项组合抓取图片和价格,但我不知道该如何实现,所以才来征求意见。
-
再次,为此我们必须查看您的代码。
-
用代码更新了问题。所以你能帮我告诉我如何完成我需要的吗?
-
我会试试的。就算我帮不上忙,也许其他人可以。
标签: python selenium web-scraping beautifulsoup scrape