【发布时间】:2020-10-29 00:57:40
【问题描述】:
这是我关于网络抓取的案例研究。 我在最终代码中遇到问题 'NoneType' 对象没有属性 'text' 所以我尝试使用 'getattr' 函数来修复它,但它不起作用。
'''
import requests
from bs4 import BeautifulSoup
url = 'https://www.birdsnest.com.au/womens/dresses'
source = requests.get(url)
soup = BeautifulSoup(source.content, 'lxml')
'''
productlist= soup.find_all('div', id='items')
'''
productlinks = []
for item in productlist:
for link in item.find_all('a',href=True):
productlinks.append(url + link['href'])
print(len(productlinks))
'''
productlinks = []
for x in range(1,28):
source = requests.get(f'https://www.birdsnest.com.au/womens/dresses?_lh=1&page={x}')
soup = BeautifulSoup(source.content, 'lxml')
for item in productlist:
for link in item.find_all('a',href=True):
productlinks.append(url + link['href'])
print(productlinks)
'''
for link in productlinks:
source = requests.get(link)
soup = BeautifulSoup(source.content, 'lxml')
name = soup.find('h1',class_='item-heading__name').text.strip()
price = soup.find('p',class_='item-heading__price').text.strip()
feature = soup.find('div',class_='tab-accordion__content active').text.strip()
sum = {
'name':name,
'price':price,
'feature':feature
}
print(sum)
'''
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-7-d4d46558690d> in <module>()
3 soup = BeautifulSoup(source.content, 'lxml')
4
----> 5 name = soup.find('h1',class_='item-heading__name').text.strip()
6 price = soup.find('p',class_='item-heading__price').text.strip()
7 feature = soup.find('div',class_='tab-accordion__content active').text.strip()
AttributeError: 'NoneType' object has no attribute 'text'
---------------------------------------------------------------------------
所以我尝试用这种方法修复,但没有奏效。
for link in productlinks:
source = requests.get(link)
soup = BeautifulSoup(source.content, 'lxml')
name = getattr(soup.find('h1',class_='item-heading__name'),'text',None)
price = getattr(soup.find('p',class_='item-heading__price'),'text',None)
feature = getattr(soup.find('div',class_='tab-accordion__content active'),'text',None)
sum = {
'name':name,
'price':price,
'feature':feature
}
print(sum)
这是输出。它只显示“Nonetype”
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
{'name': None, 'price': None, 'feature': None}
【问题讨论】:
-
这是
productlinks的第一个链接:birdsnest.com.au/womens/dresses/brands/honeysuckle-beach/…这个链接被屏蔽了。 -
好吧,如果您转到 productlinks 列表中的某个链接,它们会指向一个空白页面
标签: python selenium web-scraping beautifulsoup google-colaboratory