【发布时间】:2020-03-09 05:12:25
【问题描述】:
αԋɱҽԃ αмєяιcαη 帮助我构建了这段代码,用于从动态加载评论的页面中抓取评论。然后,我尝试对其进行调整,使其不仅可以抓取评论正文,还可以抓取评论者的姓名、日期和评级,并让代码将提取的数据保存到 Excel 文件中。但我没有这样做。有人可以帮我正确调整代码吗?
这是来自αԋɱҽԃ αмєяιcαη的代码
import requests
from bs4 import BeautifulSoup
import math
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
for item in range(1, num):
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"comment-body\"'):
print(com.text[5:com.text.find(r"\n", 3)])
Main()
这是我调整的代码,但出现了我无法解决的错误
import requests
from bs4 import BeautifulSoup
import math
import pandas as pd
df = pd.DataFrame()
def PageNum():
r = requests.get(
"https://boxes.mysubscriptionaddiction.com/box/boxycharm?ratings=true#review-update-create")
soup = BeautifulSoup(r.text, 'html.parser')
num = int(
soup.find("a", class_="show-more-reviews").text.split(" ")[3][1:-1])
if num % 3 == 0:
return (num / 3) + 1
else:
return math.ceil(num / 3) + 2
def Main():
num = PageNum()
headers = {
'X-Requested-With': 'XMLHttpRequest'
}
with requests.Session() as req:
for item in range(1, num):
names = []
headers = []
bodies = []
ratings = []
published = []
updated = []
reported = []
dateElements = []
print(f"Extracting Page# {item}")
r = req.get(
f"https://boxes.mysubscriptionaddiction.com/get_user_reviews?box_id=105&page={item}", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
for com in soup.findAll("div", class_=r'\"user-review\"'):
names.append(article.find('div', attrs={'class': 'name'}).text.strip())
try:
bodies.append(article.find('div', attrs={'class': 'comment-body'}).text.strip())
except:
bodies.append('NA')
try:
ratings.append(article.find('meta', attrs={'itemprop': 'ratingValue'})['content'])
except:
ratings.append('NA')
dateElements.append(article.find('div', attrs={'class': 'comment-date'}).text.strip())
print(com.text[5:com.text.find(r"\n", 3)])
temp_df = pd.DataFrame(
{'User Name': names, 'Body': bodies, 'Rating': ratings, 'Published Date': dateElements})
df = df.append(temp_df, sort=False).reset_index(drop=True)
Main()
df.to_csv('Allure10.csv', index=False, encoding='utf-8')
print ('excel done')
【问题讨论】:
-
你在哪里定义文章?
标签: python pandas web-scraping beautifulsoup element