【发布时间】:2021-09-30 15:48:07
【问题描述】:
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import grequests
import pandas as pd
# STEP 1: Create List of URLs from main archive page
def get_urls():
    """Build the list of catalogue (archive) page URLs to scrape.

    Returns:
        list[str]: URLs for catalogue pages 1 and 2, logged as they are added.
    """
    urls = []
    x = 1
    while x < 3:
        urls.append(f'http://books.toscrape.com/catalogue/page-{x}.html')
        print(f'Getting page url: {x}', urls)
        x += 1
    return urls
# STEP 2: Async Load HTML Content from page range in step 1
def get_data(urls):
    """Fire all page requests concurrently and collect the responses.

    Args:
        urls: iterable of page URL strings to fetch.

    Returns:
        The list produced by ``grequests.map`` — one response per URL
        (entries may be None for failed requests).
    """
    pending = []
    for link in urls:
        pending.append(grequests.get(link))
    print('AsyncRequest object > reqs:', pending)
    responses = grequests.map(pending)
    print('Status Code > resp (info on page):', responses, '\n')
    return responses
# Step 3: Extract title, price, url, thumb from async variable resp containing html elements of all scraped pages.
def parse(resp):
    """Extract product records from the fetched catalogue pages.

    Args:
        resp: iterable of HTTP response objects (as returned by
            ``grequests.map``); None entries (failed requests) are skipped.

    Returns:
        list[dict]: one dict per product with keys
        ``title``, ``price``, ``single_url``, ``thumbnail``.
    """
    productlist = []
    for r in resp:
        # grequests.map yields None for requests that failed — guard
        # against it instead of crashing on r.text.
        if r is None:
            continue
        # Resolve relative links against the page they actually came from,
        # rather than a hard-coded base URL.
        base_url = r.request.url
        sp = BeautifulSoup(r.text, 'lxml')
        items = sp.find_all('article', {'class': 'product_pod'})
        for item in items:
            anchor = item.find('h3').find('a')
            product = {
                # The <h3> text is truncated by the site ("A Light in the ...");
                # the anchor's title attribute carries the full book title.
                # Fall back to the visible text if the attribute is absent.
                'title': anchor.get('title', anchor.text.strip()) if anchor is not None
                         else item.find('h3').text.strip(),
                'price': item.find('p', {'class': 'price_color'}).text.strip(),
                # urljoin normalises relative hrefs (including '../' segments
                # in the thumbnail src) against the catalogue page URL.
                'single_url': urljoin(base_url, item.find('a').attrs['href']),
                'thumbnail': urljoin(base_url, item.find('img', {'class': 'thumbnail'}).attrs['src']),
            }
            productlist.append(product)
            print('Added: ', product)
    return productlist
# Run the full pipeline: build URLs, fetch pages, parse products, write CSV.
urls = get_urls()                        # (Step 1) archive page URLs
resp = get_data(urls)                    # (Step 2) fetch all pages concurrently
products = parse(resp)                   # (Step 3) extract product records
df = pd.DataFrame(products)
df.to_csv('books.csv', index=False)      # persist without the row index
上述脚本使用 grequests 和 BeautifulSoup,异步抓取网站 https://books.toscrape.com/ 的主存档(catalogue)页面。
在存档页面中,它会提取以下图书信息:
- 标题
- 价格
- 单一产品网址
- 缩略图网址
问题
我需要一种方法,进一步从每个单品页面中提取 UPC 等信息,并将这些信息关联回主列表 productlist。
单一产品页面示例:https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html
【问题讨论】:
标签: python web-scraping beautifulsoup grequests