【问题标题】:Scrapy Spider scraping same thing multiple times and missing other itemsScrapy Spider 多次抓取相同的东西并丢失其他物品
【发布时间】:2023-03-09 21:40:01
【问题描述】:

当我在steam平台上运行我的scrapy spider刮cmets时,它丢失了很多cmets并且多次刮取相同的cmets。我的代码有什么问题?

import scrapy
from scrapy import Request, FormRequest
from scrapy.item import Item, Field
from scrapy.loader import ItemLoader
import json
from scrapy import Selector
import re


class Workshop_Item(Item):
    app_id = Field()
    workshop_id = Field()
    game = Field()
    workshop_name = Field()
    user = Field()
    comment = Field()
    user_level = Field()
    date_posted = Field()
    user_location = Field()
    number_of_badges = Field()
    user_join_date = Field()
    is_author = Field()
    user_experience = Field()


class Workshop_Comment_Spider(scrapy.Spider):
    name = "comments"
    with open("output/workshop_comment_links.txt") as f:
        urls = [line.rstrip("\n") for line in f]
    start_urls = urls

    def parse(self, response):
        if int(max(response.css('span.tabCount::text').getall())) > 0 and "profiles" in response.css('a.commentthread_author_link::attr(href)').get():
            contributor_id = re.search(r'Public_(.*?)_' , response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        elif int(max(response.css('span.tabCount::text').getall())) > 0:
            contributor_id = re.search(r'Public_(.*?)_' , response.css('div.commentthread_footer a::attr(id)').get()).group(1)
        workshop_id_number = response.css('form.smallForm > input::attr(value)').get()

        if int(max(response.css('span.tabCount::text').getall())) > 50:

            comment_number = max(response.css('span.tabCount::text').getall())

            url = f'https://steamcommunity.com/comment/PublishedFile_Public/render/{contributor_id}/{workshop_id_number}/'

            data = {
                "start": "1",
                "totalcount": comment_number,
                "count": comment_number,
                "sessionid": "d880ab2338b70926db0a9591",
                "extended_data": "{\"contributors\":[\"" + contributor_id +"\",{}],\"appid\":289070,\"sharedfile\":{\"m_parentsDetails\":null,\"m_parentBundlesDetails\":null,\"m_bundledChildren\":[],\"m_ownedBundledItems\":[]},\"parent_item_reported\":false}",
                "feature2": "-1"
            }

            app_id = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            game = response.css(".apphub_AppName::text").get()
            workshop_id = response.css('form.smallForm input::attr(value)').get()
            workshop_name = response.css(".workshopItemTitle::text").get()

            yield FormRequest(url, formdata=data, callback=self.parse_paginated_comments, meta={'app_id': app_id, 'game': game, 'workshop_id': workshop_id, 'workshop_name': workshop_name})


        else:
            for comment in response.css(".commentthread_comment"):
                item = Workshop_Item()
                item['is_author'] = False

                if "authorbadge" in comment.get():
                    item['is_author'] = True

                item['app_id'] = response.css('div.apphub_HeaderTop a::attr(data-appid)').get()
                item['workshop_id'] = response.css('form.smallForm input::attr(value)').get()
                item['game'] = response.css(".apphub_AppName::text").get()
                item['workshop_name'] = response.css(".workshopItemTitle::text").get()
                item['user'] = comment.css("bdi::text").get()
                item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
                item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
                item['user_level'] = -1
                user_profile = comment.css(".commentthread_author_link::attr(href)").get()
                request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
                yield request

    def parse_user_info(self, response):
        item = response.meta['item']
        if response.css('.profile_private_info'):
            item['user_level'] = 'private'
            item['user_location'] = 'private'
            item['number_of_badges'] = 'private'
            item['user_join_date'] = 'private'
            item['user_experience'] = 'private'
            return item
        else:
            item['user_level'] = response.css(".friendPlayerLevelNum::text").get()

        if response.css('.header_real_name') and response.css("img.profile_flag"):
            item['user_location'] = response.css('.header_real_name::text').getall()[2].strip()
        else:
            item['user_location'] = 'NA'

        if response.css("div.profile_badges span.profile_count_link_total::text"):
            item['number_of_badges'] = response.css("div.profile_badges span.profile_count_link_total::text").get().strip()
        else:
            item['number_of_badges'] = 'NA'

        user_badge_page = response.css("div.profile_header_badgeinfo_badge_area > a::attr(href)").get() + "/1"
        request = Request(user_badge_page, callback=self.parse_badge_info, meta={'item': item})
        yield request

    def parse_badge_info(self, response):
        item = response.meta['item']
        if response.css("div.badge_description"):
            item['user_join_date'] = response.css("div.badge_description::text").get().strip()
        experience_page = response.css('a.whiteLink.persona_name_text_content::attr(href)').get() + "/badges"
        request = Request(experience_page, callback=self.parse_experience_page, meta={'item': item})
        yield request
        
    def parse_experience_page(self, response):
        item = response.meta['item']
        if response.css('span.profile_xp_block_xp'):
            item['user_experience'] = response.css('span.profile_xp_block_xp::text').get()
        return item

    def parse_paginated_comments(self, response):
        app_id = response.meta['app_id']
        game = response.meta['game']
        workshop_id = response.meta['workshop_id']
        workshop_name = response.meta['workshop_name']

        jsonresponse = json.loads(response.body.decode("utf-8"))
        sel = Selector(text=jsonresponse['comments_html'])
        for comment in sel.css(".commentthread_comment"):
            item = Workshop_Item()
            item['is_author'] = False


            if "authorbadge" in comment.get():
                item['is_author'] = True

            item['app_id'] = app_id #sel.css('div.apphub_HeaderTop a::attr(data-appid)').get()
            item['workshop_id'] = workshop_id #sel.css('form.smallForm input::attr(value)').get()
            item['game'] = game #sel.css(".apphub_AppName::text").get()
            item['workshop_name'] = workshop_name #sel.css(".workshopItemTitle::text").get()
            item['user'] = comment.css("bdi::text").get()
            item['comment'] = ",".join(comment.css(".commentthread_comment_text::text").getall()).replace('\n', ' ').replace('\t', '').replace('\r', ' ')
            item['date_posted'] = comment.css(".commentthread_comment_timestamp::attr(title)").get()
            item['user_level'] = -1
            user_profile = sel.css(".commentthread_author_link::attr(href)").get()
            request = Request(user_profile, callback=self.parse_user_info, meta={'item': item})
            yield request

我正在从页面上抓取评论,然后转到用户的个人资料以收集用户数据。如果页面有分页(>50 cmets),我将发送一个 post 请求来检索包含所有 cmets 的 html 的 json,然后将其抓取。

【问题讨论】:

  • 首先您可以使用print() 查看变量中的内容以及执行的代码部分。它被称为"print debuging"

标签: python scrapy


【解决方案1】:

修复它,问题就在这里:

  def parse_paginated_comments(self, response):
        app_id = response.meta['app_id']
        game = response.meta['game']
        workshop_id = response.meta['workshop_id']
        workshop_name = response.meta['workshop_name']

        jsonresponse = json.loads(response.body.decode("utf-8"))
        sel = Selector(text=jsonresponse['comments_html'])
        for comment in sel.css(".commentthread_comment"):
            item = Workshop_Item()
            item['is_author'] = False

我需要改变

for comment in sel.css(".commentthread_comment"):

for comment in comment.css(".commentthread_comment"):

我需要添加

DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'

到 settings.py 文件。

【讨论】:

    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 2022-01-18
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2014-10-06
    • 1970-01-01
    • 2020-10-03
    相关资源
    最近更新 更多