# -*- coding: utf-8 -*-
import json
import re

import scrapy

from ..items import QutoutiaoItem
from ..settings import CATEGORY_INFO, LIST_LIMIT

# List API endpoint. It must be defined before the spider class, because the
# class body reads it while building ``start_urls``.
BASE_API = 'http://api.1sapp.com/content/outList?'


class QutoutiaoSpider(scrapy.Spider):
    """Crawl the list API for every configured category, then each article.

    ``parse`` handles the JSON list responses and schedules one request per
    article; ``detail_parse`` completes the item from the article page.
    """

    name = 'qutoutiao'
    # allowed_domains = ['qutoutiao.net']

    # Categories to crawl and the number of records requested per list call.
    categoryInfo = CATEGORY_INFO
    limit = LIST_LIMIT

    # One list-API URL per category. A comprehension over the module-level
    # settings avoids leaking loop variables into the class namespace.
    start_urls = [
        BASE_API + "cid=%s&tn=1&page=1&limit=%s" % (category['cid'], LIST_LIMIT)
        for category in CATEGORY_INFO
    ]

    def parse(self, response):
        """Turn one list-API response into article detail requests.

        :param response: response of a list-API URL built in ``start_urls``.
        :return: generator of ``scrapy.Request`` objects, each carrying a
            partially filled ``QutoutiaoItem`` in ``meta['meta_item']``.
        """
        # The category id is not read from the payload, so recover it from
        # the request URL. It stays a string when found and is 0 otherwise.
        match = re.search(r'(.*)cid=(\d+)', response.url)
        cid = match.group(2) if match else 0

        data = json.loads(response.text)['data']['data']
        for value in data:
            item = QutoutiaoItem()
            item['source_name'] = value['source_name']
            item['title'] = value['title']
            # Detail page URL
            item['url'] = value['url']
            item['introduction'] = value['introduction']
            # Cover image(s)
            item['cover'] = value['cover']
            item['publish_time'] = value['publish_time']
            item['cid'] = cid

            # Fetch the detail page; the item travels along in the meta dict.
            yield scrapy.Request(
                url=item['url'],
                meta={'meta_item': item},
                callback=self.detail_parse,
            )

    def detail_parse(self, response):
        """Complete the item with the article body and its image URLs.

        :param response: article page response whose ``meta['meta_item']``
            holds the item started in ``parse``.
        :return: generator yielding the completed item, or nothing when the
            page has no ``div.content`` element.
        """
        meta_item = response.meta['meta_item']

        content_selector = response.xpath('//div[@class="content"]')
        content_html = content_selector.extract_first()
        if content_html is None:
            # Report the missing body explicitly instead of failing with a
            # bare IndexError; the item is dropped either way.
            self.logger.warning('No content block found at %s', response.url)
            return

        # The leading "." keeps the query relative to the content div. A bare
        # "//img" would match every image in the whole document.
        meta_item['content_images'] = content_selector.xpath(
            './/img/@src|.//img/@data-src').extract()
        meta_item['content'] = content_html
        yield meta_item
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QutoutiaoItem(scrapy.Item):
    """Container for the fields scraped for a single article."""

    aid = scrapy.Field()             # article id
    source_name = scrapy.Field()     # source / publisher name
    title = scrapy.Field()           # headline
    url = scrapy.Field()             # detail page URL
    introduction = scrapy.Field()    # short summary
    cover = scrapy.Field()           # cover image(s)
    publish_time = scrapy.Field()    # publication time
    cid = scrapy.Field()             # category id
    content = scrapy.Field()         # article body
    content_images = scrapy.Field()  # images found inside the body
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

import logging

from fake_useragent import UserAgent
from scrapy import signals


class QutoutiaoSpiderMiddleware(object):
    """Pass-through spider middleware that only logs when a spider opens.

    Not all methods need to be defined. If a method is not defined, scrapy
    acts as if the spider middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and hook ``spider_opened`` to its signal."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Let every response through to the spider unchanged.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Forward everything the spider produced, unmodified.

        Must return an iterable of Request, dict or Item objects.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Do nothing, leaving the exception to other handlers.

        Should return either None or an iterable of Response, dict or Item
        objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unmodified.

        Must return only requests (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)


class UserAgent_CookiesMiddleware(object):
    """Downloader middleware that sets a User-Agent and fixed headers.

    The User-Agent is read from a ``fake_useragent.UserAgent`` object through
    the attribute named by the ``RANDOM_UA_TYPE`` setting (default
    ``"random"``).
    """

    def __init__(self, crawler):
        super(UserAgent_CookiesMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from the running crawler."""
        return cls(crawler)

    def process_request(self, request, spider):
        """Set the User-Agent and the fixed browser-like headers.

        Returns None, so the request continues through the remaining
        middlewares.
        """
        random_agent = getattr(self.ua, self.ua_type)
        if not random_agent:
            return None

        headers = request.headers
        headers['User-Agent'] = random_agent
        headers['Accept'] = 'application/json, text/javascript, */*; q=0.01'
        headers['Origin'] = 'http://home.qutoutiao.net'
        headers['Referer'] = 'http://home.qutoutiao.net/pages/home.html'
        self.logger.debug('Current UserAgent: ' + random_agent)
        return None
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import json
import logging
import os
import shutil

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

from .qttutils import QttUtils

logger = logging.getLogger(__name__)


def _relocate_images(results, images_store):
    """Move downloaded images into the dated store directory.

    :param results: the ``(ok, info)`` pairs handed to ``item_completed``;
        ``info['path']`` of each successful entry is the file's path
        relative to ``images_store``.
    :param images_store: directory the images pipeline downloaded into.
    :return: list of the new file paths, in download order.
    """
    # Resolved even when nothing was downloaded, because getStorePath()
    # creates the directory as a side effect.
    store_path = QttUtils.getStorePath()
    relocated = []
    for ok, info in results:
        if not ok:
            continue
        relative_path = info['path']
        target = store_path + '/' + os.path.basename(str(relative_path))
        # shutil.move falls back to copy + delete when a plain rename is
        # not possible, e.g. when the two directories are on different
        # filesystems.
        shutil.move(images_store + '/' + relative_path, target)
        logger.debug('Moved image %s -> %s', relative_path, target)
        relocated.append(target)
    return relocated


class CoverImagePipeline(ImagesPipeline):
    """Download the cover images and file them under the dated store path."""

    # Download directory, read once from the project settings.
    IMAGES_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['cover']``."""
        for image_url in item['cover'] or ():
            yield scrapy.Request(url=image_url)

    def item_completed(self, results, item, info):
        """Replace ``item['cover']`` with the relocated local file paths."""
        item['cover'] = _relocate_images(results, self.IMAGES_STORE)
        return item


class ContentImagePipeline(ImagesPipeline):
    """Download in-article images and file them under the dated store path.

    NOTE: an earlier, disabled variant of ``item_completed`` stored
    ``(new_path, source_url)`` pairs instead of bare paths.
    """

    # Download directory, read once from the project settings.
    IMAGE_STORE = get_project_settings().get('IMAGES_STORE')

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['content_images']``."""
        for image_url in item['content_images'] or ():
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        """Replace ``item['content_images']`` with the relocated paths."""
        item['content_images'] = _relocate_images(results, self.IMAGE_STORE)
        return item


class QutoutiaoPipeline(object):
    """Append every item as one JSON line to ``qutoutiao.json``."""

    def __init__(self):
        # The output file lives in the dated store directory.
        store_path = QttUtils.getStorePath()
        json_path = store_path + '/' + 'qutoutiao.json'
        # Despite its name, ``filename`` holds the open binary file handle.
        self.filename = open(json_path, 'wb')

    def process_item(self, item, spider):
        """Serialize the item as UTF-8 JSON, one object per line."""
        text = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.filename.write(text.encode('utf-8'))
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.filename.close()
# -*- coding: utf-8 -*-
# @Time    : 2018-6-1 11:01
# @Author  : Amir
# @Site    :
# @File    : qttutils.py
# @Software: PyCharm

'''
Utility helpers for the Qutoutiao crawler.
'''

import os
import shutil
import time

from .settings import DATA_PATH


class QttUtils:
    """Namespace for helpers shared by the pipelines."""

    @staticmethod
    def getStorePath(action='create'):
        """Return today's storage directory, creating or removing it.

        :param action: ``'create'`` (default) makes sure the directory
            exists; ``'remove'`` deletes it if present; any other value
            leaves the filesystem untouched.
        :return: the path ``DATA_PATH/<year>/<month>/<day>`` as a string.
        """
        # One strftime call yields the same zero-padded year/month/day
        # suffix as formatting the three parts separately.
        store_path = DATA_PATH + time.strftime("/%Y/%m/%d", time.localtime())

        if action == 'remove':
            if os.path.exists(store_path):
                shutil.rmtree(store_path)
        elif action == 'create':
            # exist_ok avoids the check-then-create race: another process
            # may create the directory between an exists() test and
            # makedirs().
            os.makedirs(store_path, exist_ok=True)

        return store_path
# -*- coding: utf-8 -*-

# Scrapy settings for QuTouTiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'QuTouTiao'

SPIDER_MODULES = ['QuTouTiao.spiders']
NEWSPIDER_MODULE = 'QuTouTiao.spiders'

# Number of records requested per list-API call
LIST_LIMIT = 10

# Storage locations
DATA_PATH = './data'
IMAGES_STORE = './image'

# Categories to crawl (the names are the site's own labels)
CATEGORY_INFO = [
    {"cid": 255, "name": "推荐"},
    {"cid": 1, "name": "热点"},
    {"cid": 6, "name": "娱乐"},
    {"cid": 5, "name": "养生"},
    {"cid": 2, "name": "搞笑"},
    {"cid": 7, "name": "科技"},
    {"cid": 8, "name": "生活"},
    {"cid": 10, "name": "财经"},
    {"cid": 9, "name": "汽车"},
]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'QuTouTiao (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'QuTouTiao.middlewares.QutoutiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'QuTouTiao.middlewares.UserAgent_CookiesMiddleware': 299,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE(review): QutoutiaoPipeline has the lowest order value, so it looks like
# the JSON line is written before the two image pipelines replace the image
# URLs with local paths - confirm this ordering is intended.
ITEM_PIPELINES = {
    'QuTouTiao.pipelines.QutoutiaoPipeline': 300,
    'QuTouTiao.pipelines.ContentImagePipeline': 301,
    'QuTouTiao.pipelines.CoverImagePipeline': 302,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'