【问题标题】:Newbie: Python "AttributeError: 'NoneType' object has no attribute 'text' " when scraping Tripadvisor Reviews新手:在抓取 Tripadvisor 点评时,Python “AttributeError: 'NoneType' object has no attribute 'text'”
【发布时间】:2019-04-16 08:36:35
【问题描述】:

作为一个完整的新手,我正在尝试收集一些 Tripadvisor 评论。

我正在使用来自Susanli2016 的代码。

它对一个link 有效(尽管删除了属性“语言”),但它不适用于任何其他链接(对于example。)

我收到错误消息:

> Traceback (most recent call last):
>           File "<pyshell#27>", line 4, in <module>
>             items = scrape(url)
>           File "<pyshell#12>", line 11, in scrape
>             items = parse(session, url + '?filterLang=' + lang)
>           File "<pyshell#15>", line 12, in parse
>             num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
>         AttributeError: 'NoneType' object has no attribute 'text'

我在此处附上代码以及我所做的更改,以防有人可以帮助我。

非常感谢! 西尔维娅

--

我替换了原来的:

 num_reviews = soup.find('span', class_='reviews_header_count').text # get text

 num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text

使用原始代码我得到了错误 ValueError: int() 以 10 为底的无效文字:'5.695'

(其中 5.695 是页面中的评论数)

--

特此完整代码:

import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io
def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
         f.write(content)
         webbrowser.open(filename)


def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200: # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')


def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''

    r = session.post(url, data=params)

    if show:
        display(r.content, 'temp.html')

    if r.status_code != 200: # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):

    # create session to keep all cookies (etc.) between requests
    session = requests.Session()

    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })


    items = parse(session, url + '?filterLang=' + lang)

    return items

 def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''

    print('[parse] url:', url)

    soup = get_soup(session, url)

    if not soup:
        print('[parse] no soup:', url)
        return

    num_reviews = soup.find('span', class_='hotels-hotel-review-community-content-TabBar__tabCount--37DbH').text # get text
    num_reviews = num_reviews[1:-1] 
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews) # convert text into integer
print('[parse] num_reviews ALL:', num_reviews)

    url_template = url.replace('.html', '-or{}.html')
    print('[parse] url_template:', url_template)

    items = []

    offset = 0

    while(True):
        subpage_url = url_template.format(offset)

        subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break

        items += subpage_items

        if len(subpage_items) < 5:
            break

        offset += 5

    return items

 def get_reviews_ids(soup):

    items = soup.find_all('div', attrs={'data-reviewid': True})

    if items:
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def get_more(session, reviews_ids):

    url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'

    payload = {
        'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
        #'contextChoice': 'DETAIL_HR', # ???
        'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
        'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
        'haveCsses': 'apg-Hotel_Review-in',
        'Action': 'install',
    }

    soup = post_soup(session, url, payload)

   return soup

 def parse_reviews(session, url):
    '''Get all reviews from one page'''

    print('[parse_reviews] url:', url)

    soup =  get_soup(session, url)

    if not soup:
        print('[parse_reviews] no soup:', url)
        return

   hotel_name = soup.find('h1', id='HEADING').text

    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return

    soup = get_more(session, reviews_ids)

    if not soup:
        print('[parse_reviews] no soup:', url)
        return

    items = []

    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):

        badgets = review.find_all('span', class_='badgetext')
        if len(badgets) > 0:
            contributions = badgets[0].text
        else:
            contributions = '0'

        if len(badgets) > 1:
            helpful_vote = badgets[1].text
        else:
            helpful_vote = '0'
        user_loc = review.select_one('div.userLoc strong')
        if user_loc:
            user_loc = user_loc.text
        else:
            user_loc = ''

        bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        bubble_rating = bubble_rating[1].split('_')[-1]

        item = {
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
       }

        items.append(item)
        print('\n--- review ---\n')
        for key,val in item.items():
            print(' ', key, ':', val)

    print()

    return items

def write_in_csv(items, filename='results.csv',
                  headers=['hotel name', 'review title', 'review body',
                           'review date', 'contributions', 'helpful vote',
                           'user name' , 'user location', 'rating'],
              mode='w'):

    print('--- CSV ---')

    with io.open(filename, mode, encoding="utf-8") as csvfile:
        csv_file = csv.DictWriter(csvfile, headers)

        if mode == 'w':
            csv_file.writeheader()

        csv_file.writerows(items)


 DB_COLUMN   = 'review_body'
 DB_COLUMN1 = 'review_date'
 start_urls = [
    'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]

headers = [ 
    DB_COLUMN, 
    DB_COLUMN1, 
]

 lang = 'it'


 for url in start_urls:

    # get all reviews for 'url' and 'lang'
    items = scrape(url)

    if not items:
        print('No reviews')
    else:
        # write in CSV
        filename = url.split('Reviews-')[1][:-5]
        print('filename:', filename)
        write_in_csv(items, filename + '.csv', headers, mode='w')

【问题讨论】:

  • 你确定类hotels-hotel-review-community-content-TabBar__tabCount--37DbH存在吗?
  • TripAdvisor 有an API;您可能应该考虑使用它而不是尝试抓取他们的网站。
  • 旅行顾问的 API 没有维护,或者他们只是不响应请求(一年多前我请求访问,没有消息)。
  • 我现在还不确定。但是,它也不适用于原始代码。我可以确认一下 cglacet 关于 TA 的 API 的说法。

标签: python web-scraping sentiment-analysis tripadvisor


【解决方案1】:

我意识到问题出在源代码上。

hotel_name = soup.find('h1', id='HEADING').text

在源网站中找不到目标 ID。我将其替换为:

    hotel_name = soup.find('h1', class_='heading').text

我希望它可以帮助别人!

【讨论】:

    猜你喜欢
    • 2019-06-04
    • 1970-01-01
    • 2022-12-15
    • 2017-10-20
    • 2021-06-12
    • 2022-01-10
    • 1970-01-01
    • 2022-10-25
    • 2021-02-05
    相关资源
    最近更新 更多