scrapy爬取趣头条

# -*- coding: utf-8 -*-
import scrapy
from ..items import QutoutiaoItem
import json
import re
from ..settings import CATEGORY_INFO, LIST_LIMIT



class QutoutiaoSpider(scrapy.Spider):
    """Crawl the Qutoutiao list API per category, then each article page.

    ``start_urls`` holds one list-API URL (page 1, ``LIST_LIMIT`` records)
    for every entry of ``CATEGORY_INFO``.  ``parse`` turns each list entry
    into a partially filled ``QutoutiaoItem`` and requests the article
    page; ``detail_parse`` adds the article body and its image URLs.
    """

    name = 'qutoutiao'
    # allowed_domains = ['qutoutiao.net']

    # List API endpoint.  It must be bound before the loop below: class-body
    # statements run top to bottom, so a later definition raises NameError.
    BASE_API = 'http://api.1sapp.com/content/outList?'

    # Category definitions and page size come from the project settings.
    categoryInfo = CATEGORY_INFO
    limit = LIST_LIMIT

    # URLs to crawl: the first list page of every category.
    start_urls = []
    for value in categoryInfo:
        url = BASE_API + "cid=%s&tn=1&page=1&limit=%s" % (
            str(value['cid']), str(limit))
        start_urls.append(url)

    def parse(self, response):
        """Parse one list-API response and request every article it lists.

        :param response: response of a list-API URL whose body is JSON with
            the records under ``data.data``
        :return: generator of ``scrapy.Request`` objects for the detail
            pages; the half-filled item travels in ``meta['meta_item']``
        """
        response_url = response.url
        # The category id is not copied from the records, so recover it
        # from the request URL (falls back to 0 when the URL has no cid).
        match = re.search(r'(.*)cid=(\d+)', response_url)
        cid = match.group(2) if match else 0

        try:
            data = json.loads(response.text)['data']['data']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or not the expected {"data": {"data": [...]}} shape.
            self.logger.warning(
                'Unexpected list response from %s', response_url)
            return

        for value in data or []:
            item = QutoutiaoItem()
            item['source_name'] = value['source_name']      # source
            item['title'] = value['title']                  # title
            item['url'] = value['url']                      # detail page URL
            item['introduction'] = value['introduction']    # summary
            item['cover'] = value['cover']                  # cover image(s)
            item['publish_time'] = value['publish_time']    # publish time
            item['cid'] = cid                               # category id

            # Fetch the detail page; the item is completed in detail_parse.
            yield scrapy.Request(url=item['url'], meta={'meta_item': item},
                                 callback=self.detail_parse)

    def detail_parse(self, response):
        """Complete the item carried in ``response.meta`` from a detail page.

        :param response: article page response; ``meta['meta_item']`` holds
            the item created in ``parse``
        :return: generator yielding the completed item
        """
        meta_item = response.meta['meta_item']
        content_selector = response.xpath('//div[@class="content"]')
        # Relative paths ('.//') keep the search inside the content div; an
        # absolute '//img' would match every image in the whole document.
        meta_item['content_images'] = content_selector.xpath(
            './/img/@src|.//img/@data-src').extract()
        # extract_first() avoids the IndexError that extract()[0] raises
        # when the page has no content div; content is '' in that case.
        meta_item['content'] = content_selector.extract_first(default='')
        yield meta_item

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QutoutiaoItem(scrapy.Item):
    """Fields of one scraped Qutoutiao article.

    Every attribute below declares a field that may be assigned on an
    instance with ``item['<name>'] = value``.
    """

    aid = scrapy.Field()             # article id
    source_name = scrapy.Field()     # source
    title = scrapy.Field()           # title
    url = scrapy.Field()             # detail page URL
    introduction = scrapy.Field()    # summary
    cover = scrapy.Field()           # cover image(s)
    publish_time = scrapy.Field()    # publish time
    cid = scrapy.Field()             # category id
    content = scrapy.Field()         # article content
    content_images = scrapy.Field()  # images found in the content

# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class QutoutiaoSpiderMiddleware(object):
    """Pass-through spider middleware that only logs when a spider opens.

    Not all methods need to be defined.  If a method is not defined,
    Scrapy acts as if the spider middleware does not modify the passed
    objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``.

        This method is used by Scrapy to create the middleware instance.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        """Called for each response going through the middleware into the spider.

        Should return None or raise an exception; this one accepts
        everything.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Called with the results the spider returned for ``response``.

        Must return an iterable of Request, dict or Item objects; every
        element of ``result`` is forwarded unchanged.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Called when a spider or ``process_spider_input`` raises.

        Should return either None or an iterable of Response, dict or Item
        objects; this one returns None and does nothing with the exception.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Called with the start requests of the spider.

        Works like ``process_spider_output`` but has no response
        associated.  Must return only requests (not items); every request
        is forwarded unchanged.
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        # Pass the name as a logging argument so the message is only
        # formatted when the INFO level is enabled.
        spider.logger.info('Spider opened: %s', spider.name)


from fake_useragent import UserAgent
import logging


class UserAgent_CookiesMiddleware(object):
    """Downloader middleware that sets a User-Agent and fixed browser headers.

    The User-Agent is read from a ``fake_useragent.UserAgent`` object using
    the attribute named by the ``RANDOM_UA_TYPE`` setting (default
    ``"random"``).
    """

    def __init__(self, crawler):
        """Create the User-Agent source and read ``RANDOM_UA_TYPE``.

        :param crawler: the running crawler; only its ``settings`` are read
        """
        super(UserAgent_CookiesMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware from the crawler."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set the User-Agent, Accept, Origin and Referer headers on ``request``.

        The request is left untouched when the User-Agent source yields an
        empty value.

        :param request: outgoing request whose ``headers`` mapping is updated
        :param spider: the spider issuing the request (unused)
        :return: None, so the request continues through the remaining
            middlewares
        """
        # ua_type names the attribute to read, e.g. "random".
        random_agent = getattr(self.ua, self.ua_type)
        if not random_agent:
            return None

        request.headers['User-Agent'] = random_agent
        request.headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
        request.headers['Origin'] = 'http://home.qutoutiao.net'
        request.headers['Referer'] = 'http://home.qutoutiao.net/pages/home.html'

        # Lazy %-formatting: the message is built only if DEBUG is enabled.
        self.logger.debug('Current UserAgent: %s', random_agent)
        return None

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


import scrapy
from scrapy.utils.project import get_project_settings
from scrapy.pipelines.images import ImagesPipeline
import os
from .qttutils import QttUtils


# Cover image download
class CoverImagePipeline(ImagesPipeline):
    """Download an item's cover images and move them into the data directory."""

    # Value of the IMAGES_STORE project setting; downloaded file paths in
    # ``item_completed`` are joined onto it.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['cover']``.

        Nothing is requested when the field is empty.
        """
        # NOTE(review): assumes ``cover`` is a list of URLs; a plain string
        # would be iterated character by character -- confirm against the API.
        for image_url in item['cover'] or []:
            yield scrapy.Request(url=image_url)

    def item_completed(self, results, item, info):
        """Move the downloaded covers and record their new locations.

        :param results: iterable of ``(ok, data)`` pairs; for successful
            entries ``data['path']`` is the file path below ``IMAGES_STORE``
        :param item: the item being processed; ``item['cover']`` is replaced
            by the list of moved file paths
        :param info: unused
        :return: the updated item
        """
        # Custom storage directory, created on demand.
        store_path = QttUtils.getStorePath()
        cover_images = []
        for ok, result in results:
            if not ok:
                continue
            downloaded = result['path']
            # Keep only the file name; the download sub-directory is dropped.
            new_image = store_path + '/' + os.path.basename(str(downloaded))
            cover_images.append(new_image)
            os.rename(self.IMAGES_STORE + '/' + downloaded, new_image)
        item['cover'] = cover_images
        return item

# Content image download
class ContentImagePipeline(ImagesPipeline):
    """Download the images found in an article body and move them into the data directory."""

    # Value of the IMAGES_STORE project setting; downloaded file paths in
    # ``item_completed`` are joined onto it.
    IMAGE_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['content_images']``.

        Nothing is requested when the field is empty.
        """
        for image_url in item['content_images'] or []:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Move the downloaded images and record their new locations.

        :param results: iterable of ``(ok, data)`` pairs; for successful
            entries ``data['path']`` is the file path below ``IMAGE_STORE``
        :param item: the item being processed; ``item['content_images']`` is
            replaced by the list of moved file paths
        :param info: unused
        :return: the updated item
        """
        # Custom storage directory, created on demand.
        store_path = QttUtils.getStorePath()
        content_images = []
        for ok, result in results:
            if not ok:
                continue
            base_path = result['path']
            # Keep only the file name; the download sub-directory is dropped.
            new_image = store_path + "/" + os.path.basename(str(base_path))
            content_images.append(new_image)
            os.rename(self.IMAGE_STORE + "/" + base_path, new_image)
        item['content_images'] = content_images
        return item

    # # 下载完成 方法一
    # def item_completed(self, results, item, info):
    #     image_info = [(x[\'path\'], x[\'url\']) for ok, x in results if ok]
    #     print(\'---------------------image_info\', image_info)
    #     # 获取自定义存储路径
    #     store_path = QttUtils.getStorePath()
    #     contentImages = []
    #     # 将图片移动到新的路径
    #     if image_info:
    #         for value in image_info:
    #             print(\'----------------value\', value)
    #             image_url = value[0]
    #             image_source = value[1]
    #
    #             file_name = os.path.split(str(image_url))
    #             new_image = store_path + "/" + file_name[1]
    #             contentImages.append((new_image, image_source))
    #             os.rename(self.IMAGE_STORE + "/" + image_url, new_image)
    #     item[\'content_images\'] = contentImages
    #     return item

import json
from .qttutils import QttUtils


class QutoutiaoPipeline(object):
    """Write every item as one UTF-8 JSON line to ``qutoutiao.json``."""

    def __init__(self):
        """Open ``qutoutiao.json`` in the custom storage directory."""
        store_path = QttUtils.getStorePath()
        json_path = store_path + '/' + 'qutoutiao.json'
        # Despite its name, this attribute is the open binary file object;
        # it is closed in close_spider().
        self.filename = open(json_path, 'wb')

    def process_item(self, item, spider):
        """Append ``item`` to the output file as a single JSON line.

        :param item: mapping-like item; converted with ``dict(item)``
        :param spider: unused
        :return: the same item, so later pipelines can process it
        """
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.filename.close()

# -*- coding: utf-8 -*-
# @Time    : 2018-6-1 11:01
# @Author  : Amir
# @Site    : 
# @File    : qttutils.py
# @Software: PyCharm

"""
Utility helpers for the Qutoutiao scraper.
"""

import time
import os
import shutil
from .settings import DATA_PATH

class QttUtils:
    """Filesystem helpers shared by the Qutoutiao pipelines."""

    @staticmethod
    def getStorePath(action='create'):
        """Return today's storage directory, ``DATA_PATH/<year>/<month>/<day>``.

        :param action: ``'create'`` (default) creates the directory tree when
            it is missing; ``'remove'`` deletes it when it exists; any other
            value leaves the filesystem untouched
        :return: the directory path as a string
        """
        # One strftime call builds the year/month/day suffix from local time.
        store_path = DATA_PATH + "/" + time.strftime("%Y/%m/%d", time.localtime())

        if action == 'remove':
            # Delete the directory tree.
            if os.path.exists(store_path):
                shutil.rmtree(store_path)
        elif action == 'create':
            # Create the multi-level directory; tolerate it already existing
            # rather than checking first and creating afterwards.
            try:
                os.makedirs(store_path)
            except OSError:
                if not os.path.isdir(store_path):
                    raise

        return store_path

# -*- coding: utf-8 -*-

# Scrapy settings for QuTouTiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'QuTouTiao'

SPIDER_MODULES = ['QuTouTiao.spiders']
NEWSPIDER_MODULE = 'QuTouTiao.spiders'


# Number of records requested per list-API page
LIST_LIMIT = 10

# Storage paths: DATA_PATH is the root of the dated data directories,
# IMAGES_STORE is where the image pipelines download files
DATA_PATH = r'./data'
IMAGES_STORE = r'./image'

# Categories to crawl: "cid" is sent to the list API, "name" is its label
CATEGORY_INFO = [
    {"cid": 255, "name": "推荐"},  # recommended
    {"cid": 1, "name": "热点"},    # hot topics
    {"cid": 6, "name": "娱乐"},    # entertainment
    {"cid": 5, "name": "养生"},    # wellness
    {"cid": 2, "name": "搞笑"},    # humour
    {"cid": 7, "name": "科技"},    # technology
    {"cid": 8, "name": "生活"},    # lifestyle
    {"cid": 10, "name": "财经"},   # finance
    {"cid": 9, "name": "汽车"},    # cars
]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'QuTouTiao (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'QuTouTiao.middlewares.QutoutiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'QuTouTiao.middlewares.UserAgent_CookiesMiddleware': 299,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE(review): lower numbers run first, so the JSON writer (300) sees each
# item before the image pipelines replace the image URLs with local file
# paths -- confirm that storing the original URLs is intended.
ITEM_PIPELINES = {
    'QuTouTiao.pipelines.QutoutiaoPipeline': 300,
    'QuTouTiao.pipelines.ContentImagePipeline': 301,
    'QuTouTiao.pipelines.CoverImagePipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'