最终效果如下:
====================== 主程序(spider 文件):======================
# -*- coding: utf-8 -*-
import scrapy
import requests
import json
from qidian.items import QidianItem
class MyqidianSpider(scrapy.Spider):
    """Crawl Qidian category listings down to individual chapter texts.

    Flow: listing page (parse) -> book detail page (get_url) -> chapter-list
    AJAX endpoint (get_zhangjie) -> chapter page (Lett_text). Book/chapter
    metadata is carried between callbacks via request.meta.
    """
    name = 'myqidian'
    allowed_domains = ['qidian.com']
    start_urls = ['http://www.qidian.com/all?chanId=21&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0']

    def parse(self, response):
        """Parse one listing page: request each book's detail page, then paginate."""
        book_nodes = response.xpath('//ul[@class="all-img-list cf"]/li')
        for node in book_nodes:
            book_id = node.xpath('./div[@class="book-img-box"]/a/@data-bid').extract_first()
            href = node.xpath('./div[@class="book-img-box"]/a/@href').extract_first()
            if not (book_id and href):
                continue  # skip malformed listing entries instead of raising IndexError
            # bookId is needed later to build the chapter-list AJAX URL
            yield scrapy.Request('http:' + href, callback=self.get_url,
                                 meta={"bookId": book_id})
        # Pagination: the @data-pagemax attribute holds the last page number.
        # FIX: the original XPath was '//@data-pagemax)' — the stray ')' made
        # the expression invalid, so pagination raised on every page.
        page_max = response.xpath('//@data-pagemax').extract_first()
        if page_max:
            for page_no in range(2, int(page_max) + 1):
                url = ("http://www.qidian.com/all?chanId=21&orderId=&style=1"
                       "&pageSize=20&siteid=1&pubflag=0&hiddenField=0"
                       "&page={}".format(page_no))
                yield scrapy.Request(url, callback=self.parse)

    def get_url(self, response):
        """Parse a book detail page, then request the chapter-list JSON."""
        book_id = response.meta['bookId']
        jsonurl = ('https://book.qidian.com/ajax/book/category'
                   '?_csrfToken=OFmDKzipSh4trLG5YRG79dFXcFYAEZgV0cjNceDd'
                   '&bookId=' + book_id)
        intro = response.xpath('//div[@class="book-intro"]/p/text()').extract_first() or ''
        meta = {
            "bookName": response.xpath('//div[@class="book-info "]/h1/em/text()').extract_first(),
            "writerName": response.xpath('//div[@class="book-info "]/h1/span/a/text()').extract_first(),
            "xinxi": intro.strip(),
        }
        yield scrapy.Request(jsonurl, callback=self.get_zhangjie, meta=meta)

    def get_zhangjie(self, response):
        """Parse the chapter-list JSON: one request per chapter, in volume order."""
        meta = response.meta
        # FIX: the original re-downloaded response.url with requests.get();
        # Scrapy has already fetched this response, so parse it directly.
        data = json.loads(response.text)['data']
        for volume in data.get('vs') or []:            # 'vs' = volumes
            for chapter in volume.get('cs') or []:     # 'cs' = chapters
                cU = chapter.get('cU')                 # chapter URL slug
                if not cU:
                    continue
                curl = 'https://read.qidian.com/chapter/' + cU
                chap_meta = {
                    "bookName": meta['bookName'],
                    "writerName": meta['writerName'],
                    "xinxi": meta['xinxi'],
                    "cN": chapter.get('cN'),    # chapter name
                    "curl": curl,
                    "uT": chapter.get('uT'),    # presumably update time — from chapter JSON
                    "cnt": chapter.get('cnt'),  # presumably word count — from chapter JSON
                }
                yield scrapy.Request(curl, callback=self.Lett_text, meta=chap_meta)

    def Lett_text(self, response):
        """Parse one chapter page into a QidianItem."""
        item = QidianItem()
        meta = response.meta
        for field in ('bookName', 'writerName', 'xinxi', 'cN', 'curl', 'uT', 'cnt'):
            item[field] = meta[field]
        # NOTE(review): the original used the absolute path '//p/text()' inside
        # the per-node loop, which selects every <p> on the whole page; './/p'
        # restricts it to the chapter-content div. The [1:] slice (skip the
        # first paragraph) is kept from the original — confirm it is still
        # wanted against a live page.
        for content in response.xpath('//div[@class="read-content j_readContent"]'):
            paragraphs = content.xpath('.//p/text()').extract()[1:]
            item['text'] = ''.join(paragraphs).strip().replace('\u3000', '')
            yield item
====================== item 文件:======================
import scrapy
class QidianItem(scrapy.Item):
    """Container for one scraped Qidian chapter plus its book's metadata.

    One item is yielded per chapter; the book-level fields are repeated on
    every chapter item.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    bookName = scrapy.Field()    # book title (from the detail page <h1><em>)
    writerName = scrapy.Field()  # author name (from the detail page <h1><span><a>)
    xinxi = scrapy.Field()       # book introduction / synopsis text
    cN = scrapy.Field()          # chapter name ('cN' key in the chapter JSON)
    curl = scrapy.Field()        # full chapter URL
    uT = scrapy.Field()          # 'uT' from chapter JSON — presumably update time
    cnt = scrapy.Field()         # 'cnt' from chapter JSON — presumably word count
    text = scrapy.Field()        # chapter body text, whitespace-normalised
====================== 写入数据库(pipeline 文件):======================
import pymysql
class QidianPipeline(object):
    """Persist each QidianItem as one row in the MySQL table `qidianbook`."""

    def __init__(self):
        # Connection/cursor are created in open_spider, not here, so the
        # pipeline object can be constructed without touching the database.
        self.conn = None
        self.cur = None

    def open_spider(self, spider):
        """Open the MySQL connection once per spider run."""
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='密码',  # TODO(review): move credentials into settings/env, not source
            db='pydata201806',
            charset='utf8'
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item; values are bound as parameters, then committed.

        Table and column names cannot be bound parameters; they come from the
        trusted Item field definitions, not from scraped input. All values go
        through %s placeholders so pymysql escapes them.
        """
        cols, values = zip(*item.items())
        sql = "INSERT INTO `%s` (%s) VALUES (%s)" % (
            'qidianbook',
            ','.join('`%s`' % c for c in cols),  # backtick-quote column names
            ','.join(['%s'] * len(values)),
        )
        try:
            self.cur.execute(sql, values)
            self.conn.commit()
        except pymysql.MySQLError:
            # FIX: roll back the failed transaction instead of leaving the
            # connection in an aborted state; re-raise so Scrapy logs it.
            self.conn.rollback()
            raise
        # FIX: dropped print(self.cur._last_executed) — a private attribute
        # that no longer exists in current pymysql (AttributeError per item).
        return item

    def close_spider(self, spider):
        """Release cursor and connection; tolerate a failed open_spider."""
        if self.cur is not None:
            self.cur.close()
        if self.conn is not None:
            self.conn.close()