工具:
- requests模块
- beautifulsoup4模块
- logging模块
- lxml模块
主要流程:
- 请求url,返回页面
- 解析页面,提取数据
- 保存数据
注:在解析上我写了两种不同方式,一种是用BeautifulSoup,另一种是XPath。源码中的请求头有删改。
源码如下:
import requests
from bs4 import BeautifulSoup
import logging
from lxml import etree
class Spider_Novel():
    """Novel crawler: fetch a novel's table-of-contents page, resolve every
    chapter link, download each chapter and save the whole book to
    ``<title>.txt`` (UTF-8).

    Two parsing strategies are implemented side by side — BeautifulSoup
    (``Parse_title_chapter`` / ``Parse_content``) and lxml XPath
    (``Parse_title_chapter_xpath`` / ``Parse_content_xpath``). ``__call__``
    currently drives the XPath pair.
    """

    def __init__(self, url):
        """
        :param url: table-of-contents (index) URL of the novel
        """
        self.novel_url = url
        # One shared session so the spoofed User-Agent applies to every request.
        self.session = requests.session()
        self.session.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/53.36 (KHTML, like Gecko) Chrome/69.0.34.81 Safari/57.36'}

    def __call__(self, *args, **kwargs):
        """Run the whole crawl.

        Sets up logging, fetches the index page, extracts the title and the
        chapter list, then downloads and saves every chapter.
        """
        self.logger = logging.getLogger('novel')
        self.logger.setLevel(logging.INFO)
        # Only attach a handler once: getLogger returns the same object on
        # every call, so re-adding a handler here would duplicate each log
        # line when the spider is invoked a second time.
        if not self.logger.handlers:
            sh = logging.StreamHandler()
            sh.setLevel(logging.INFO)
            # 'formatter', not 'format' — avoid shadowing the builtin.
            formatter = logging.Formatter(
                '时间:%(asctime)s,'
                '日志级别:%(levelname)s,'
                '日志信息:%(message)s,'
            )
            sh.setFormatter(formatter)
            self.logger.addHandler(sh)
        response = self.Doneload_html(self.novel_url)
        # BeautifulSoup alternative:
        # title, chapter_urls = self.Parse_title_chapter(response)
        title, chapter_urls = self.Parse_title_chapter_xpath(response)
        self.Retain(title, chapter_urls)

    def Doneload_html(self, novel_url):
        """Fetch *novel_url* and return the decoded page body.

        :param novel_url: URL to request
        :return: page body as ``str``
        :raises Exception: re-raises any request/decoding error after logging
        """
        try:
            response = self.session.get(novel_url)
            # The site serves GB-family Chinese text; 'gbk' is a strict
            # superset of gb2312, so every gb2312 page still decodes and
            # rarer characters no longer raise UnicodeDecodeError.
            response.encoding = 'gbk'
            return response.text
        except Exception:
            self.logger.error("返回小说主页面出错")
            raise

    def Parse_title_chapter(self, response):
        """BeautifulSoup parser for the index page.

        :param response: index page HTML text
        :return: ``(title, chapter_urls)`` where *chapter_urls* is a list of
                 ``(chapter name, chapter url)`` tuples
        :raises Exception: re-raises any parse error after logging
        """
        try:
            soup = BeautifulSoup(response, 'lxml')
            title = soup.find_all('div', attrs={'class': 'fenlei'})[0].a.string
            div = soup.find_all('div', class_='booklist first')[0]
            # Materialize a list (the original returned a generator): with a
            # generator, a malformed <li> would raise later, in the caller,
            # escaping this try/except and its log message.
            chapter_urls = [(li.a['title'], li.a['href'])
                            for li in div.find_all('li')]
            return title, chapter_urls
        except Exception:
            self.logger.error("小说主页面解析出错")
            raise

    def Retain(self, title, chapter_urls):
        """Download every chapter and write the book to ``<title>.txt``.

        Best-effort: a chapter that fails to parse is logged and skipped so
        the rest of the book still downloads.

        :param title: novel title, used as the output file name
        :param chapter_urls: iterable of ``(chapter name, chapter url)``
        """
        with open('%s.txt' % title, 'w', encoding='utf-8') as f:
            for name, url in chapter_urls:
                html = self.Doneload_html(url)
                # Keep the try body minimal: only the parse can be skipped;
                # a failed write should still abort the run.
                try:
                    content = self.Parse_content_xpath(html)
                except Exception:
                    self.logger.error("%s解析出错" % name)
                    continue
                f.write(name)
                f.write('\n')
                f.write(content)
                f.write('\n')
                self.logger.info("%s下载成功 此文章地址%s" % (name, url))
        self.logger.info("整本%s下载成功" % title)

    def Parse_content(self, html):
        """BeautifulSoup parser for a chapter page: extract the chapter body.

        :param html: chapter page HTML text
        :return: chapter text with the site's inline-JS remnants and CRLF
                 sequences stripped out
        """
        soup = BeautifulSoup(html, 'lxml')
        content = soup.find_all('div', id='chapter_content')[0].stripped_strings
        # The content div embeds two script calls as text; remove them.
        content = (''.join(content)
                   .replace('screen_content_set();', '')
                   .replace('\r\n', '')
                   .replace('readsidebar(\'1\');', ''))
        return content

    def Parse_title_chapter_xpath(self, response):
        """XPath parser for the index page (same contract as
        ``Parse_title_chapter``).

        :param response: index page HTML text
        :return: ``(title, chapter_urls)`` where *chapter_urls* is a list of
                 ``(chapter name, chapter url)`` tuples
        :raises Exception: re-raises any parse error after logging
        """
        try:
            page = etree.HTML(response)
            title = page.xpath("//div[@class='fenlei']/span/a[1]/text()")[0]
            links = page.xpath("//div[@class='booklist first']/ul/li/a")
            chapter_urls = [(a.xpath("./text()")[0], a.xpath("./@href")[0])
                            for a in links]
            return title, chapter_urls
        except Exception:
            self.logger.error("小说主页面解析出错")
            raise

    def Parse_content_xpath(self, html):
        """XPath parser for a chapter page (same contract as
        ``Parse_content``).

        :param html: chapter page HTML text
        :return: chapter text, whitespace-stripped and joined
        """
        page = etree.HTML(html)
        # NOTE(review): skips the first two text nodes of the content div —
        # presumably layout junk on this site; confirm against a live page.
        content = page.xpath("//div[@id='chapter_content']/text()")[2:]
        return ''.join(piece.strip() for piece in content)
if __name__ == '__main__':
    # Script entry point: crawl the sample novel end-to-end.
    novel_spider = Spider_Novel('http://www.574aw.com/zhongshengzhiminyishiweitian/')
    novel_spider()