小说爬取

这里爬取的是笔趣阁小说圣墟
注释已经很详细
可以用 re,也可以用 xpath。
代码:

import requests
import re
from lxml import etree


class Novel:
    """Scrape the novel 圣墟 chapter by chapter from www.xbiquge.la.

    Chapter links are collected from the book's index page, each chapter's
    text is downloaded, and everything is appended to a single text file.
    """

    # Identify as a regular browser; the site may reject the default
    # python-requests User-Agent (this header was commented out before).
    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
        )
    }
    BASE_URL = "http://www.xbiquge.la"

    def __init__(self, filename="小说——圣墟.txt"):
        # Kept for backward compatibility; not read anywhere in this class.
        self.switch = True
        # Output file; previously hard-coded inside writePage.
        self.filename = filename

    def get_Chapter_url(self, limit=10):
        """Fetch the book's index page and download the first chapters.

        :param limit: number of chapters to download (default 10, matching
            the original script); pass None to download every chapter.
        """
        url = self.BASE_URL + "/13/13959/"
        html = requests.get(url, headers=self.HEADERS, timeout=30)
        # The site serves UTF-8 but does not always declare it in headers.
        html.encoding = "utf-8"
        # Parse the HTML document into an element tree.
        xml = etree.HTML(html.text)
        # Relative hrefs of every chapter, in reading order.
        link_list = xml.xpath('//*[@id="list"]/dl/dd/a/@href')
        chapters = link_list if limit is None else link_list[:limit]
        for link in chapters:
            print(link)
            self.get_novel_content(link)

    def get_novel_content(self, con):
        """Download one chapter from its relative URL `con` and save it.

        :param con: chapter href relative to BASE_URL (e.g. "/13/13959/x.html").
        """
        full_url = self.BASE_URL + con
        print(full_url)
        html = requests.get(full_url, headers=self.HEADERS, timeout=30)
        html.encoding = "utf-8"
        xml = etree.HTML(html.text)
        # The chapter body is the plain text nodes inside <div id="content">.
        txt_list = xml.xpath('//*[@id="content"]/text()')
        # Use a distinct loop name: the original shadowed the `con` parameter.
        for paragraph in txt_list:
            self.writePage(paragraph)
        print("保存成功!!")
        self.writePage("\n\n\n\n\n----------------下一章--------------")

    def writePage(self, content):
        """Append `content` to the output file.

        The file is opened in append mode so successive chapters accumulate;
        the with-statement closes it automatically (the original also called
        f.close() redundantly inside the with-block).
        """
        with open(self.filename, "a", encoding="utf8") as f:
            f.write(content)

if __name__ == "__main__":
    # Script entry point: build a scraper and start downloading chapters.
    Novel().get_Chapter_url()

(原文此处插入图片:运行结果截图)

相关文章: