看了别人写的抓取晋江小说的爬虫后,自己动手写了一版简单的。
记录下。
【执行脚本时只需输入想下载的文章ID即可】
# -*- coding:utf8 -*-
# Scraper for Jinjiang (晋江) novels.
# Fixed: "import requests" had been fused onto the comment line above,
# so it never executed.
import requests
import lxml.html
from itertools import product
def jj_Download(chapters_url, chapters_title, novel_name):
    """Download every non-VIP chapter and append it to a text file.

    Parameters
    ----------
    chapters_url : list[str]
        URLs of the non-VIP chapter pages.
    chapters_title : list[str]
        Chapter titles; may be longer than chapters_url because VIP
        chapters have a title but no free URL.
    novel_name : str
        Name of the output .txt file (opened in append mode).
    """
    # zip() stops at the shorter sequence, so surplus VIP-only titles are
    # skipped automatically.  The original used itertools.product with a
    # manual counter and an early return, which only worked by accident
    # when len(chapters_title) >= len(chapters_url).
    with open(novel_name, 'a', encoding="utf-8") as f:
        for i, (u, t) in enumerate(zip(chapters_url, chapters_title), start=1):
            print(t + " 下载ing......")
            html = requests.get(u).content
            selector = lxml.html.fromstring(html)
            # Fixed XPath: '///div' (triple slash) -> '//div'.
            content_text = selector.xpath('//div[@class="noveltext"]/text()')
            name = "第" + str(i) + "章 " + t  # chapter heading
            # Fixed: add '\n' after the heading; the original glued the
            # heading to the first line of chapter text.
            content = '\n' + name + '\n' + '\n'.join(content_text)
            f.write(content)
            f.write('\n')
# --- Script entry: fetch the book's index page and download all chapters ---
# The pasted snippet had the input() call, the url assignment, and the four
# xpath assignments fused into comment lines; reconstructed below.
novel_id = input("请输入小说novelid:")
# NOTE(review): the url assignment was lost in the paste; this is Jinjiang's
# standard book-index URL pattern — TODO confirm against the original post.
url = "http://www.jjwxc.net/onebook.php?novelid=" + novel_id
res = requests.get(url).content
tree = lxml.html.fromstring(res)
# Non-VIP chapter links (VIP rows carry no free href).
chapters_url = tree.xpath('//tr[@itemprop="chapter"]//a/@href')
# All chapter titles (including VIP chapters).
chapters_title = tree.xpath('//tr[@itemprop="chapter"]//a/text()')
# Novel title and author from the page metadata.
novel = tree.xpath('//span[@itemprop="articleSection"]/text()')[0]
author = tree.xpath('//span[@itemprop="author"]/text()')[0]
novel_name = novel + " 作者:" + author + ".txt"
jj_Download(chapters_url, chapters_title, novel_name)