# coding:utf8
"""Download every chapter listed on a novel's chapter-index page and merge them.

Flow: read the index page, collect the chapter links, download each chapter
into its own numbered ``<n>.txt`` file using a thread pool, then concatenate
those files in order into ``<title>.txt`` and delete the per-chapter files.
"""
import codecs  # not used below; retained in case code outside this view needs it
import multiprocessing
import os
import threading
import time
from multiprocessing.dummy import Pool as ThreadPool

import requests
from lxml import etree

url = 'https://www.biquge5200.cc/79_79883/'  # chapter-index page of the novel to download

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 15
# How many times a chapter page is fetched before it is reported as failed.
MAX_FETCH_ATTEMPTS = 5
# Seconds to pause between two attempts for the same chapter.
RETRY_DELAY = 1
# Number of links at the top of the chapter list that are ignored.
# NOTE(review): presumably the site's "latest chapters" shortcuts that
# duplicate real chapters further down - confirm against the page layout.
LEADING_LINKS_TO_SKIP = 9

# Deletes the characters that are not allowed in Windows file names.
_FILENAME_FORBIDDEN = str.maketrans('', '', '\\/:*?"<>|')
# Guards ``downcount``, which is incremented from several pool threads.
_count_lock = threading.Lock()


def getsource(url):
    """Fetch *url* and return the response body decoded as GBK.

    Returns ``None`` (after printing a notice) when the request fails, so
    callers must check the result before parsing it.
    """
    try:
        # Without a timeout a stalled connection would block a worker forever.
        s = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('访问异常,跳过~!')
        return None
    s.encoding = 'gbk'
    return s.text


def getlist(url):
    """Parse the chapter-index page at *url*.

    Sets the globals ``txtname`` (book title with file-name-unsafe characters
    removed) and ``txtzz`` (first text node of the first ``<p>`` under
    ``#info``, with non-breaking spaces removed), and appends one
    ``'<href>|<title>|<sequence number>'`` entry per chapter to the global
    ``urllist``.

    Raises:
        RuntimeError: the index page could not be fetched or parsed, or it
            contains no book title.
    """
    global txtname, txtzz
    html = getsource(url)
    if not html:
        raise RuntimeError('could not fetch the chapter index: ' + url)
    ehtml = etree.HTML(html)
    if ehtml is None:
        raise RuntimeError('could not parse the chapter index: ' + url)

    u = ehtml.xpath('//*[@id="list"]/dl/dd/a/@href')
    t = ehtml.xpath('//*[@id="list"]/dl/dd/a/text()')

    titles = ehtml.xpath('//*[@id="info"]/h1/text()')
    if not titles:
        raise RuntimeError('no book title found on the chapter index: ' + url)
    txtname = titles[0].translate(_FILENAME_FORBIDDEN)

    authors = ehtml.xpath('//*[@id="info"]/p[1]/text()')
    txtzz = authors[0].replace('\xa0', '') if authors else ''

    # NOTE(review): hrefs and link texts come from two separate queries; a
    # link without text would shift the pairing - confirm this cannot happen.
    chapters = zip(u[LEADING_LINKS_TO_SKIP:], t[LEADING_LINKS_TO_SKIP:])
    for num, (href, title) in enumerate(chapters):
        entry = href + '|' + title + '|' + str(num)
        urllist.append(entry)
        print(entry)


def _fetch_content(chapter_url):
    """Return the cleaned text of one chapter page, or ``''`` if every attempt fails."""
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        html = getsource(chapter_url)
        ehtml = etree.HTML(html) if html else None
        if ehtml is not None:
            # NOTE(review): both space replacements appear as a single ASCII
            # space in the reviewed source; the original literals may have
            # been wider (e.g. indentation or full-width spaces) - confirm.
            content = ehtml.xpath('string(//*[@id="content"])').replace(' ', '\r\n').replace(' ', '\r\n').replace(
                '\xa0', '').replace('\ufffd', '').replace('\u266a', '').replace('readx;', '')
            if content:
                return content
        if attempt < MAX_FETCH_ATTEMPTS:
            time.sleep(RETRY_DELAY)
    return ''


def downtxt(url):
    """Download one chapter described by a ``urllist`` entry.

    *url* has the form ``'<href>|<title>|<sequence number>'``. The chapter is
    written to ``<savepath><sequence number>.txt``; an existing file is left
    untouched. ``downcount`` is incremented for every chapter actually saved.
    A chapter that still has no content after ``MAX_FETCH_ATTEMPTS`` tries is
    reported and skipped instead of being retried forever.
    """
    global downcount
    # Split from both ends so a '|' inside the title cannot shift the fields.
    u, rest = url.split('|', 1)
    t, num = rest.rsplit('|', 1)

    target = os.path.join(savepath, num + '.txt')
    # Check first so an already saved chapter costs no network round trip.
    if os.path.exists(target):
        print(num + '.txt 已经存在!')
        return

    content = _fetch_content(u)
    if not content:
        print(t + ' 下载失败,已跳过!')
        return

    # Explicit UTF-8 and newline='' keep the text and its '\r\n' line ends
    # identical on every platform and locale.
    with open(target, 'w', encoding='utf-8', newline='') as f:
        f.write('\r\n' + t + '\r\n' + content)
    print(t + ' 下载完成!')
    with _count_lock:
        downcount += 1


time_start = time.time()
downcount = 0
urllist = []
getlist(url)
# The trailing '' keeps a path separator at the end of savepath.
savepath = os.path.join(os.getcwd(), txtname, '')
os.makedirs(savepath, exist_ok=True)

pool = ThreadPool(multiprocessing.cpu_count())
try:
    pool.map(downtxt, urllist)
finally:
    pool.close()
    pool.join()

print('开始合并txt...')
# 'w' rather than 'a': a re-run must not append a second copy of the book.
with open(os.path.join(savepath, txtname + '.txt'), 'w', encoding='utf-8', newline='') as f:
    f.write(txtname)
    f.write('\r\n')
    f.write(txtzz)
    f.write('\r\n')
    for i in range(len(urllist)):
        part = os.path.join(savepath, str(i) + '.txt')
        try:
            with open(part, 'r', encoding='utf-8', newline='') as fr:
                f.write(fr.read())
        except FileNotFoundError:
            # The chapter could not be downloaded; merge the rest anyway.
            print(str(i) + '.txt 缺失,合并时已跳过!')
            continue
        f.write('===========================')
        os.remove(part)
print('小说合并完成~!')

print('')
print('*' * 15 + ' 任务完成,结果如下:' + '*' * 15)
print('')
print('<' + txtname + '> 下载完成' + ',获取并下载章节页面:' + str(downcount) + ' 个')
print('')
print('耗时:' + str(time.time() - time_start) + ' s')
print('')
print('*' * 51)