分页抓取博客园新闻:先从列表页分析"下一页"按钮的标签结构。

相关代码:

# -*- coding: utf-8 -*-
import scrapy

from cnblogs.items import ArticleItem

class BlogsSpider(scrapy.Spider):
    """Crawl news articles from news.cnblogs.com.

    Walks the listing pages, follows each article to its detail page,
    and paginates via the "Next >" link at the bottom of each listing.
    """
    name = 'blogs'
    allowed_domains = ['news.cnblogs.com']
    start_urls = ['https://news.cnblogs.com/']

    def parse(self, response):
        """Parse one listing page.

        Yields a Request per article (detail page) and, when a "Next >"
        pager link exists, a Request for the next listing page.

        The view count is rendered by JavaScript on the detail page, so
        it must be scraped here on the listing page and forwarded via
        ``meta`` to ``parse_detail``.
        """
        article_list = response.css('.content')

        for item in article_list:
            # Default to '' so a missing .view node cannot raise
            # TypeError on the slice; [:-3] drops the trailing unit text.
            viewcount = item.css('.view::text').extract_first('')[:-3].strip()
            detailurl = item.css('.news_entry a::attr(href)').extract_first()
            detailurl = response.urljoin(detailurl)
            yield scrapy.Request(url=detailurl, callback=self.parse_detail,
                                 meta={"viewcount": viewcount})

        # Pagination: the last pager anchor reads 'Next >' except on the
        # final page. Default '' guards a page with no pager at all,
        # which previously raised AttributeError on .strip().
        pager_text = response.css(
            '#sideleft > div.pager > a:last-child::text').extract_first('').strip()
        if pager_text == 'Next >':
            # 'next_href' (not 'next') to avoid shadowing the builtin.
            next_href = response.css(
                '#sideleft > div.pager > a:last-child::attr(href)').extract_first()
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse)

    def parse_detail(self, response):
        """Parse an article detail page into an ArticleItem.

        Expects ``response.meta['viewcount']`` set by ``parse``.
        """
        article = ArticleItem()
        article['linkurl'] = response.url
        article['title'] = response.css('#news_title a::text').extract_first()
        article['img'] = response.css('#news_content img::attr(src)').extract_first("default.png")
        # Defaults of '' keep missing nodes from raising AttributeError
        # on .strip() / TypeError on the slice.
        article['source'] = response.css('.news_poster ::text').extract_first('').strip()
        # [3:] drops the fixed 3-character prefix of the time text
        # (e.g. a label before the timestamp) — TODO confirm against page markup.
        article['releasetime'] = response.css('.time::text').extract_first('')[3:].strip()
        article['viewcount'] = response.meta["viewcount"]
        article['content'] = response.css('#news_body').extract_first("")

        yield article
View Code

相关文章:

  • 2021-08-03
  • 2021-09-28
  • 2021-04-13
  • 2021-04-28
  • 2021-10-08
猜你喜欢
  • 2021-11-03
  • 2022-01-05
  • 2021-09-04
  • 2021-11-21
  • 2021-09-20
  • 2021-09-22
  • 2021-08-23
相关资源
相似解决方案