【发布时间】:2021-01-19 14:11:48
【问题描述】:
我有以下代码
import pandas as pd
import requests
from bs4 import BeautifulSoup
import datetime
import time
# Listing pages to scrape; add further category URLs to this list.
url_list = [
    'https://www.coolmod.com/componentes-pc-procesadores?f=375::No',
    # 'https://www.coolmod.com/componentes-pc-placas-base?f=55::ATX||prices::3-300',
]


def _clean_text(text, strip=True):
    """Return *text* with tabs, newlines and carriage returns removed.

    When *strip* is true, leading and trailing whitespace is stripped
    before the control characters are removed.
    """
    if strip:
        text = text.strip()
    return text.replace('\t', '').replace('\n', '').replace('\r', '')


def _first_div_text(item, class_names, default, strip=True):
    """Return the cleaned text of the first matching ``div`` inside *item*.

    Each entry of *class_names* is looked up in order with ``item.find``.
    ``find`` returns ``None`` when nothing matches, so the result is checked
    explicitly instead of relying on ``AttributeError``; when no candidate
    matches, *default* is returned.
    """
    for class_name in class_names:
        tag = item.find('div', class_=class_name)
        if tag is not None:
            return _clean_text(tag.text, strip=strip)
    return default


# Request headers and store label are the same for every URL, so they are
# built once instead of on every loop iteration.
headers = {
    'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'es-ES, es;q=0.5',
}
store = 'Coolmod'

# One dict per product; turned into a DataFrame after all URLs are processed.
df_list = []
for url in url_list:
    print(url)
    r = requests.get(url, headers=headers)
    print(r.status_code)
    soup = BeautifulSoup(r.content, 'html.parser')
    # Each product card on the listing page is a div with this class string.
    items = soup.find_all(
        'div',
        class_='col-xs-12 col-sm-6 col-sm-6 col-md-6 col-lg-3 col-product col-custom-width',
    )
    extraction_date = datetime.datetime.today().replace(microsecond=0)
    for item in items:
        product_name = _clean_text(item.find('div', class_='product-name').text)
        # The original used two sibling `except AttributeError` clauses, so
        # the "No price" default was unreachable and a card lacking both
        # price divs raised. Try both class variants, then use the default.
        price = _first_div_text(
            item,
            (
                'margin-top-20 mod-product-price text-big',
                'mod-product-price text-big',
            ),
            default="No price",
        )
        # Availability text is not stripped, matching the original behaviour.
        availability = _first_div_text(
            item,
            ('product-availability cat-product-availability',),
            default="No info",
            strip=False,
        )
        product_info = {
            'product_name': product_name,
            'price': price,
            'availability': availability,
            'store': store,
            'date_extraction': extraction_date,
        }
        df_list.append(product_info)
    # NOTE(review): the pause is assumed to be per URL (not per product);
    # the original indentation was lost, so confirm the intended nesting.
    time.sleep(3)
df = pd.DataFrame(df_list)
print(df)
这段代码运行正常,并返回了符合预期的数据框。问题在于它只能获取前二十条记录;页面上有一个“显示更多”按钮,每点击一次才会再加载接下来的二十个产品,依此类推。
我查看并检查了网页源代码,但找不到通过代码与该按钮交互的方法。
任何想法或建议将不胜感激。
问候。
【问题讨论】:
标签: python-3.x pandas web-scraping beautifulsoup python-requests