【问题标题】:How to scrape multiple pages with scrapy?（如何用 scrapy 抓取多个页面？）
【发布时间】:2018-05-09 12:36:49
【问题描述】:

我正在尝试抓取包含多个页面的表格。使用以下代码打印第一页数据:

import scrapy
from scrapy.http.request import Request
from indicators.items import EducationIndicators

class mySpider(scrapy.Spider):
    # Question snippet 1: scrapes only the FIRST page of the UNESCO table.
    name = "education2"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    def parse(self, response):
        # Issues a second Request for the same URL as start_urls and routes
        # it to parse_table.
        # NOTE(review): this extra Request is redundant — parse() already
        # receives the response for the start URL and could parse it directly.
        return Request(
            url='http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            callback=self.parse_table
        )

    def parse_table(self,response):
        """Yield one EducationIndicators item per row of the data table."""
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item =  EducationIndicators()
            # First cell holds the country name; remaining cells the values.
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract() 
            print(item)
            yield item

我已经编写了下一个代码来下载所有页面。它基于我读过的其他帖子:

import scrapy
from scrapy.http.request import Request
from scrapy.spiders import CrawlSpider,Rule
from indicators.items import EducationIndicators
from scrapy.linkextractors import LinkExtractor
from lxml import html

class mySpider(CrawlSpider):
    # Question snippet 2: attempt to follow the "Next" pagination link.
    name = "education3"
    allowed_domains = ["data.un.org"]
    start_urls = (
        'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
        )

    # Follow the pagination link (id="linkNextB") and parse each page reached.
    # NOTE(review): rule callbacks fire only for pages reached via extracted
    # links — the start URL itself is not handed to parse_table, which is
    # presumably why "nothing" appears to be scraped. Confirm against the
    # CrawlSpider docs.
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow= True),)

    def parse_table(self,response):
        """Yield one EducationIndicators item per row of the data table."""
        sel = response.selector
        for tr in sel.xpath('//*[@id="divData"]/div/table/tr'):
            item =  EducationIndicators()
            # First cell holds the country name; remaining cells the values.
            item['country'] = tr.xpath('td[1]/text()').extract_first()
            item['years'] = tr.xpath('td[position()>1]/text()').extract() 
            print(item)
            yield item

当我尝试打印所有页面时,我什么也得不到。谁能帮我知道是什么错误?

【问题讨论】:

    标签: scrapy web-crawler


    【解决方案1】:

    Scrapy 首先需要 parse 回调。参见 Scrapy doc

    import scrapy
    from scrapy.http.request import Request
    from scrapy.spiders import CrawlSpider,Rule
    from indicators.items import EducationIndicators
    from scrapy.linkextractors import LinkExtractor
    from lxml import html
    
    class mySpider(CrawlSpider):
        # Answer variant 1: let the pagination rule call parse() directly.
        # NOTE(review): the Scrapy docs warn against overriding parse() in a
        # CrawlSpider, since CrawlSpider uses it internally to apply its
        # rules — verify that link-following still works with this override.
        name = "education3"
        allowed_domains = ["data.un.org"]
        start_urls = (
            'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            )
    
        # Follow the "Next" link (id="linkNextB") on every page reached.
        rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse", follow= True),)
    
        def parse(self,response):
            """Yield one EducationIndicators item per row of the data table."""
            for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
                item =  EducationIndicators()
                # First cell: country name; remaining cells: per-year values.
                item['country'] = tr.xpath('./td[1]/text()').extract_first()
                item['years'] = tr.xpath('./td[position()>1]/text()').extract() 
                print(item)
                yield item
    

    或者只是用其他回调重写 start_requests 方法：

    import scrapy
    from scrapy.http.request import Request
    from scrapy.spiders import CrawlSpider,Rule
    from indicators.items import EducationIndicators
    from scrapy.linkextractors import LinkExtractor
    from lxml import html
    
    class mySpider(CrawlSpider):
        # Answer variant 2: override start_requests() so the start URL is
        # also routed to parse_table() rather than the default parse().
        name = "education3"
        allowed_domains = ["data.un.org"]
        start_urls = (
            'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            )
    
        # Follow the "Next" link (id="linkNextB"); parse each followed page.
        # NOTE(review): responses delivered to parse_table bypass
        # CrawlSpider's internal rule processing — confirm the pagination
        # links are actually followed from those responses.
        rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//*[@id="linkNextB"]',)), callback="parse_table", follow= True),)
    
        def start_requests(self):
            # Send each start URL to parse_table instead of the default
            # parse(), so the first page's rows are extracted too.
            for url in self.start_urls:
                yield Request(url, callback=self.parse_table)
    
    
        def parse_table(self,response):
            """Yield one EducationIndicators item per row of the data table."""
            for tr in response.xpath('//*[@id="divData"]/div/table/tr'):
                item =  EducationIndicators()
                # First cell: country name; remaining cells: per-year values.
                item['country'] = tr.xpath('./td[1]/text()').extract_first()
                item['years'] = tr.xpath('./td[position()>1]/text()').extract() 
                print(item)
                yield item
    

    下面是爬取所有页面的代码:

    import scrapy
    from scrapy.http.request import Request
    from scrapy.spiders import CrawlSpider,Rule
    from indicators.items import EducationIndicators
    from scrapy.linkextractors import LinkExtractor
    from lxml import html
    
    from w3lib.url import add_or_replace_parameter
    
    class mySpider(CrawlSpider):
        """Read the page count from the first page, then request every page
        of the table through the site's paging API endpoint."""

        name = "education3"
        allowed_domains = ["data.un.org"]
        start_urls = (
            'http://data.un.org/Data.aspx?d=UNESCO&f=series%3ANER_1',
            )

        # API endpoint serving a single page of the table; the 'Page' query
        # parameter is rewritten per request in parse().
        api_url = 'http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=series:NER_1&DataMartId=UNESCO'

        def parse(self, response):
            # The total number of pages is printed inside the element with
            # id="spanPageCountB"; fall back to 0 when no digits are found.
            count_node = response.xpath('//*[@id="spanPageCountB"]/text()')
            page_count = int(count_node.re_first(r'\d+', '0'))
            for page_number in range(1, page_count + 1):
                page_url = add_or_replace_parameter(self.api_url, 'Page', page_number)
                yield Request(url=page_url, callback=self.parse_table)

        def parse_table(self, response):
            # One item per table row: first cell = country, rest = years.
            for row in response.xpath('//table/tr'):
                item = EducationIndicators()
                item['country'] = row.xpath('./td[1]/text()').extract_first()
                item['years'] = row.xpath('./td[position()>1]/text()').extract()
                print(item)
                yield item
    

    【讨论】:

    • 这样我只收到第一页。我尝试使用 CrawlSpider 来获取多个页面,但我读过我应该定义另一种方法,而不是使用 parse,因为它在内部使用它
    • 尝试使用他们的 API,http://data.un.org/Handlers/DataHandler.ashx?Service=page&Page=2&DataFilter=series:NER_1&DataMartId=UNESCO&UserQuery=&c=2,3,5,7,9,10&s=ref_area_name:asc,time_period:desc&RequestId=607。这是一个Page 参数。
    • 如何更改url中的页数以抓取下一页?请求 ID 也发生了变化,但我不知道这是否重要,或者 url 是否以第一个 "," 结尾
    • 我已经尝试更改 url(页码),但是当我执行它时只打印最后一页
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2021-12-13
    • 2018-07-28
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多