【Question Title】: Scrapy Spider error processing
【Posted】: 2015-10-02 22:19:33
【Question】:

I have been using Scrapy for over a year, along with a script that someone else wrote for me. It ran fine for more than a year, until 6-8 weeks ago, when it started giving me the following error whenever I try a download. Does anyone have any ideas?

I am running it on Ubuntu 14.04 LTS.

Command: scrapy crawl googleplay

2015-08-30 13:10:31-0400 [googleplay] ERROR: Spider error processing <GET https://accounts.google.com/ServiceLogin?continue=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&followup=https%3A%2F%2Fplay.google.com%2Fstore%2Fapps%2Fcategory%2FGAME&passive=1209600&service=googleplay>
    Traceback (most recent call last):
      File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 800, in runUntilCurrent
        call.func(*call.args, **call.kw)
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 595, in _tick
        taskObj._oneWorkUnit()
      File "/usr/lib/python2.7/dist-packages/twisted/internet/task.py", line 472, in _oneWorkUnit
        result = self._iterator.next()
      File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 57, in <genexpr>
        work = (callable(elem, *args, **named) for elem in iterable)
    --- <exception caught here> ---
      File "/usr/lib/pymodules/python2.7/scrapy/utils/defer.py", line 96, in iter_errback
        yield next(it)
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/offsite.py", line 23, in process_spider_output
        for x in result:
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/referer.py", line 22, in <genexpr>
        return (_set_referer(r) for r in result or ())
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/urllength.py", line 33, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spidermiddleware/depth.py", line 50, in <genexpr>
        return (r for r in result or () if _filter(r))
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 73, in _parse_response
        for request_or_item in self._requests_to_follow(response):
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/spiders/crawl.py", line 52, in _requests_to_follow
        links = [l for l in rule.link_extractor.extract_links(response) if l not in seen]
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 129, in extract_links
        links = self._extract_links(body, response.url, response.encoding, base_url)
      File "/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py", line 29, in _extract_links
        self.feed(response_text)
      File "/usr/lib/python2.7/sgmllib.py", line 104, in feed
        self.goahead(0)
      File "/usr/lib/python2.7/sgmllib.py", line 174, in goahead
        k = self.parse_declaration(i)
      File "/usr/lib/python2.7/markupbase.py", line 98, in parse_declaration
        decltype, j = self._scan_name(j, i)
      File "/usr/lib/python2.7/markupbase.py", line 392, in _scan_name
        % rawdata[declstartpos:declstartpos+20])
      File "/usr/lib/python2.7/sgmllib.py", line 111, in error
        raise SGMLParseError(message)
    sgmllib.SGMLParseError: expected name token at '<!\\\\])/g,"\\\\$1").rep'

Here is my GooglePlay spider (after updating it), along with the error message I am now getting:

import string
import requests
from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from scraper.items import ApkItem
from play import parse_app

class GooglePlaySpider(CrawlSpider):
        name = 'googleplay'
        start_urls = [
                'https://play.google.com/store/apps'
        ]
        rules = (
                Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
                Rule(SgmlLinkExtractor(allow=('/store/apps/category/.*', )), callback='parse_category', follow=True),
                Rule(SgmlLinkExtractor(allow=('/store/search\?.*', )), callback='parse_search', follow=True),
        )

        def parse_category_group(self, response):
                sel = Selector(response)
                category_groups = sel.xpath('//div[@class="padded-content3 app-home-nav"]')

                for category_group in category_groups:

                        category_group_name = category_group.xpath('h2/a/text()').extract()

                        categories = category_group.xpath('ul/li')
                        for category in categories:
                                category_name = category.xpath('a/text()').extract()
                                category_url = category.xpath('a/@href').extract()[0]

                chars = string.ascii_uppercase + string.digits
                for x in chars:
                        yield Request('https://play.google.com/store/search?q=' + x + '&c=apps', callback=self.parse_search)

                for x in chars:
                        for y in chars:
                                yield Request('https://play.google.com/store/search?q=' + x + y + '&c=apps', callback=self.parse_search)

                for x in chars:
                        for y in chars:
                                for z in chars:
                                        yield Request('https://play.google.com/store/search?q=' + x + y + z + '&c=apps', callback=self.parse_search)        

                return

        def parse_category(self, response):
                base_path = response.url.split('?')[0]  

                if '/collection/' in response.url:
                        sel = Selector(response)
                        apps = sel.xpath('//a[@class="title"]')
                        has_app = False

                        for app in apps:
                                has_app = True
                                app_name = app.xpath('text()').extract()
                                app_url = app.xpath('@href').extract()
                                yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)

                        if has_app:
                                m = re.match(r'(.*)\?start=(\d+)&num=24', response.url)
                                if m is None:
                                        start_number = 24                  
                                else:
                                        start_number = int(m.group(2)) + 24
                                yield Request(base_path + '?start=' + str(start_number) + '&num=24', callback=self.parse_category)

                return

        def parse_search(self, response):
                m = re.match(r'(.*)&start=(\d+)&num=24', response.url)
                if m is None:
                        base_path = response.url
                        start_number = 24                  
                else:
                        start_number = int(m.group(2)) + 24
                        base_path = m.group(1)

                sel = Selector(response)
                apps = sel.xpath('//a[contains(@href,"/store/apps/details")]')
                has_app = False

                for app in apps:
                        has_app = True
                        app_url = app.xpath('@href').extract()
                        yield Request('https://play.google.com' + app_url[0], meta={'come_from': self.name}, callback=parse_app)

                if has_app:
                        yield Request(base_path + '&start=' + str(start_number) + '&num=24', callback=self.parse_search)

                return

**** Error ****

Traceback (most recent call last):
  File "/usr/bin/scrapy", line 4, in <module>
    execute()
  File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 143, in execute
    _run_print_help(parser, _run_command, cmd, args, opts)
  File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 89, in _run_print_help
    func(*a, **kw)
  File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
    cmd.run(args, opts)
  File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 47, in run
    crawler = self.crawler_process.create_crawler()
  File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 87, in create_crawler
    self.crawlers[name] = Crawler(self.settings)
  File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 25, in __init__
    self.spiders = spman_cls.from_crawler(self)
  File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 35, in from_crawler
    sm = cls.from_settings(crawler.settings)
  File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 31, in from_settings
    return cls(settings.getlist('SPIDER_MODULES'))
  File "/usr/lib/pymodules/python2.7/scrapy/spidermanager.py", line 22, in __init__
    for module in walk_modules(name):
  File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 68, in walk_modules
    submod = import_module(fullpath)
  File "/usr/lib/python2.7/importlib/__init__.py", line 37, in import_module
    __import__(name)
  File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 12, in <module>
    class GooglePlaySpider(CrawlSpider):
  File "/home/darwin/ProjectKrutz/scraper/scraper/spiders/googlePlaySpider.py", line 18, in GooglePlaySpider
    Rule(SgmlLinkExtractor(allow=('/store/apps$', )), callback='parse_category_group', follow=True),
NameError: name 'SgmlLinkExtractor' is not defined

【Comments】:

  • Perhaps you need to use the lxml extractor.

Tags: python scrapy scrapy-spider


【Solution 1】:

The problem is that SgmlLinkExtractor has trouble with comments, and the error message is telling you that it ran into something that looks like one: <!
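
The failure can be reproduced without Scrapy at all. Below is a minimal Python 2 sketch (the input string is hypothetical, modeled on the snippet quoted in the traceback) showing the plain sgmllib parser, which SgmlLinkExtractor is built on, raising the same SGMLParseError:

# Minimal Python 2 repro: sgmllib treats "<!" as the start of an SGML
# declaration and then requires a name token, which inline JavaScript
# (here, a regex literal inside a <script> block) does not provide.
import sgmllib

parser = sgmllib.SGMLParser()
try:
    # Hypothetical input modeled on the traceback snippet, not the
    # actual Google Play markup.
    parser.feed('<script>s.replace(/(<!\\])/g, "\\$1");</script>')
except sgmllib.SGMLParseError as exc:
    print 'sgmllib failed: %s' % exc   # "expected name token at ..."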

So the solution is to change the spider you are using and replace SgmlLinkExtractor with either

from scrapy.contrib.linkextractors.htmlparser import HtmlParserLinkExtractor

or

from scrapy.contrib.linkextractors.lxmlhtml import LxmlParserLinkExtractor

Of course, these are only the import statements; you also have to change the Rule definitions that use the link extractor so that they use one of these extractors instead.
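
For example, here is a minimal sketch of the changed head of such a spider, assuming a Scrapy 0.24-era install. Note that the public, Rule-compatible class in the lxmlhtml module is LxmlLinkExtractor, which accepts the same allow= patterns as SgmlLinkExtractor:

# Sketch only: the extractor class must be swapped inside the rules
# as well, not just on the import line.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

class GooglePlaySpider(CrawlSpider):
    name = 'googleplay'
    start_urls = ['https://play.google.com/store/apps']

    rules = (
        Rule(LxmlLinkExtractor(allow=('/store/apps$', )),
             callback='parse_category_group', follow=True),
        Rule(LxmlLinkExtractor(allow=('/store/apps/category/.*', )),
             callback='parse_category', follow=True),
        Rule(LxmlLinkExtractor(allow=('/store/search\?.*', )),
             callback='parse_search', follow=True),
    )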

Without the code I cannot tell you more precisely which parts to change.

【Discussion】:

  • Thanks for the reply. Which files would it be best to show you? As I said in my original post, I just find it strange that it worked for so long (it collected over 100k APK files) and then simply stopped, with no changes to the code.
  • My guess is that the site changed over time, and that killed the spider. The spider itself would be enough to show. But you can also swap the parts yourself: simply replace SgmlLinkExtractor with one of the two extractors above (see the verification sketch below).
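
Note that in the updated spider above, only the import line was changed: the rules tuple still references SgmlLinkExtractor, which is exactly what the NameError traceback reports, since the class body executes at import time. The posted code also calls re.match() without importing re, which will be the next NameError once the extractor name is fixed. To confirm the swap works without running a full crawl, one can exercise the lxml-based extractor directly against a fake response. This is a sketch assuming a Scrapy 0.24-era install; the HTML body is hypothetical, modeled on the snippet from the traceback:

# Feed the lxml-based extractor a page containing the "<!" pattern
# that makes sgmllib raise SGMLParseError; lxml parses it leniently.
from scrapy.http import HtmlResponse
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor

body = ('<html><body>'
        '<script>s.replace(/(<!\\])/g, "\\$1");</script>'
        '<a href="/store/apps/category/GAME">Games</a>'
        '</body></html>')
response = HtmlResponse(url='https://play.google.com/store/apps',
                        body=body)

extractor = LxmlLinkExtractor(allow=('/store/apps/category/.*', ))
print [link.url for link in extractor.extract_links(response)]
# Expected: ['https://play.google.com/store/apps/category/GAME']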