XPath的使用[爬取知乎发现]文件存储[txt，json，csv，mongodb]

使用XPath

 # Scrape the Zhihu "explore" feed with XPath; append one JSON object per line to explore.json.
import json
from urllib import parse  # not used below; kept from the original listing

import requests
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}


def first_text(node, path):
    """Return the leading text of the first element matching *path* under *node*.

    ``xpath()`` returns a list and ``.text`` may be ``None``; both cases yield
    ``''`` so that one incomplete feed entry does not abort the whole run.
    """
    matches = node.xpath(path)
    if not matches:
        return ''
    return matches[0].text or ''


html = requests.get(url, headers=headers, timeout=10).text
# The response body is a string; parse it into an HTML element tree.
tree = etree.HTML(html)
# One <div> per feed entry.
node_list = tree.xpath('//div[@class="explore-feed feed-item"]')

# Open the file once and explicitly as UTF-8: ensure_ascii=False writes the
# Chinese text verbatim, which fails under a non-UTF-8 default encoding.
with open('explore.json', 'a', encoding='utf-8') as f:
    for node in node_list:
        item = {
            'question': first_text(node, './/h2/a').replace('\n', ''),
            'author': first_text(node, './/*[@class="author-link-line"]/*'),
            'answer': first_text(node, './/*[@class="content"]'),
        }
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

保存为TXT

 # Scrape the Zhihu "explore" feed with XPath and append each entry to explore.txt.
from urllib import parse  # not used below; kept from the original listing

import requests
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}


def first_text(node, path):
    """Return the leading text of the first element matching *path* under *node*.

    Yields ``''`` when nothing matches or the element has no text, because
    ``'\\n'.join(...)`` below raises ``TypeError`` on a ``None`` field.
    """
    matches = node.xpath(path)
    if not matches:
        return ''
    return matches[0].text or ''


html = requests.get(url, headers=headers, timeout=10).text
# The response body is a string; parse it into an HTML element tree.
tree = etree.HTML(html)
# One <div> per feed entry.
node_list = tree.xpath('//div[@class="explore-feed feed-item"]')

# Open the output once instead of once per entry.
with open('explore.txt', 'a', encoding='utf-8') as file:
    for node in node_list:
        question = first_text(node, './/h2/a').replace('\n', '')
        author = first_text(node, './/*[@class="author-link-line"]/*')
        answer = first_text(node, './/*[@class="content"]')

        file.write('\n'.join([question, author, answer]))
        # Separator line between entries.
        file.write('\n' + '=' * 50 + '\n')

保存为csv

 # Scrape the Zhihu "explore" feed with XPath and append the entries to explore.csv.
import csv
import os
from urllib import parse  # not used below; kept from the original listing

import requests
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
csv_path = 'explore.csv'
fieldnames = ['question', 'author', 'answer']


def first_text(node, path):
    """Return the leading text of the first element matching *path* under *node*.

    Yields ``''`` when nothing matches or the element has no text, so slicing
    the answer below cannot fail on ``None``.
    """
    matches = node.xpath(path)
    if not matches:
        return ''
    return matches[0].text or ''


html = requests.get(url, headers=headers, timeout=10).text
# The response body is a string; parse it into an HTML element tree.
tree = etree.HTML(html)
# One <div> per feed entry.
node_list = tree.xpath('//div[@class="explore-feed feed-item"]')

# The header belongs at the top of the file only: write it when the file is
# new or empty, not before every row and not again on later runs.
need_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

# newline='' is what the csv module expects; without it Windows output gets a
# blank line after every row.
with open(csv_path, 'a', encoding='utf-8', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if need_header:
        writer.writeheader()
    for node in node_list:
        writer.writerow({
            'question': first_text(node, './/h2/a').replace('\n', ''),
            'author': first_text(node, './/*[@class="author-link-line"]/*'),
            # Only the first 10 characters, to keep the demo output short.
            'answer': first_text(node, './/*[@class="content"]')[:10],
        })

读取csv

import csv

# Print every row of explore.csv as a list of strings.
# newline='' lets the csv module do its own line-ending handling, so quoted
# fields that contain line breaks are read back as a single value.
with open('explore.csv', 'r', encoding='utf-8', newline='') as csvfile:
    for row in csv.reader(csvfile):
        print(row)

Excel打开乱码参考：《excel打开csv文件显示乱码的处理方法》（百度经验）。也可以在写入时把 encoding 改为 'utf-8-sig'，文件开头会带上 BOM，Excel 就能正确识别 UTF-8 编码。

保存到MongoDB

 # Scrape the Zhihu "explore" feed with XPath and store each entry in MongoDB (explore.explore).
from urllib import parse  # not used below; kept from the original listing

import requests
from lxml import etree
from pymongo import MongoClient

client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}


def first_text(node, path):
    """Return the leading text of the first element matching *path* under *node*.

    ``xpath()`` returns a list and ``.text`` may be ``None``; both cases yield
    ``''`` so that one incomplete feed entry does not abort the whole run.
    """
    matches = node.xpath(path)
    if not matches:
        return ''
    return matches[0].text or ''


html = requests.get(url, headers=headers, timeout=10).text
# The response body is a string; parse it into an HTML element tree.
tree = etree.HTML(html)
# One <div> per feed entry.
node_list = tree.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # Build a NEW dict for every entry: the insert adds an '_id' key to the
    # dict it is given, so a reused dict would collide on the second insert.
    item = {
        'question': first_text(node, './/h2/a').replace('\n', ''),
        'author': first_text(node, './/*[@class="author-link-line"]/*'),
        'answer': first_text(node, './/*[@class="content"]'),
    }

    # Collection.insert() is deprecated since PyMongo 3.0 and removed in 4.0;
    # insert_one() is its replacement for a single document.
    result = collection.insert_one(item)
    if result.acknowledged:
        print('Saved to Mongo')

稍微改动

 # Variant of the MongoDB example that fills ONE dict created outside the loop.
import requests
from lxml import etree
from urllib import parse
from pymongo import MongoClient

collection = MongoClient()['explore']['explore']

page = requests.get(
    'https://www.zhihu.com/explore',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    },
).text
# The response body is a string; parse it into an HTML element tree.
root = etree.HTML(page)

# NOTE: this single dict is shared by every iteration. The first insert stores
# an '_id' in it, so the second insert sends the same '_id' again and fails
# with the DuplicateKeyError shown after this listing.
explore = {}
# One <div> per feed entry.
for entry in root.xpath('//div[@class="explore-feed feed-item"]'):
    # xpath() returns a list; take its first element.
    title_link = entry.xpath('.//h2/a')[0]
    explore['question'] = title_link.text.replace("\n", "")

    author_el = entry.xpath('.//*[@class="author-link-line"]/*')[0]
    explore['author'] = author_el.text

    content_el = entry.xpath('.//*[@class="content"]')[0]
    explore['answer'] = content_el.text

    saved = collection.insert(explore)
    if saved:
        print('Saved to Mongo')

抛出异常

raise DuplicateKeyError(error.get("errmsg"), 11000, error)

pymongo.errors.DuplicateKeyError: E11000 duplicate key error collection: explore.explore index: _id_ dup key: { : ObjectId('5b3792ae393e0d0c38123bbc') }

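_id 用于唯一标识集合中的一条文档（记录）：每条文档本身都有一个标识唯一性的属性，它就是 ObjectId 主键，一种特殊而且非常重要的类型，每条文档都会默认带有这个属性，属性名为 _id。
MongoDB 默认会为 _id 创建一个唯一索引。pymongo 插入时，如果字典里没有 _id，会自动生成一个并写回传入的字典；上面的代码在循环外只创建了一个 explore 字典，第一次插入后它就带上了 _id，第二次插入时 _id 重复，于是抛出 DuplicateKeyError。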

 # Shared-dict MongoDB variant again, now catching DuplicateKeyError instead of crashing.
import requests
from lxml import etree
from urllib import parse
from pymongo import MongoClient
import pymongo.errors

collection = MongoClient()['explore']['explore']

page = requests.get(
    'https://www.zhihu.com/explore',
    headers={
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    },
).text
# The response body is a string; parse it into an HTML element tree.
root = etree.HTML(page)

# NOTE: still one dict for all iterations. Catching the exception keeps the
# loop running, but the dict keeps the '_id' from the first insert, so every
# later insert is rejected as a duplicate.
explore = {}
# One <div> per feed entry.
for entry in root.xpath('//div[@class="explore-feed feed-item"]'):
    # xpath() returns a list; take its first element.
    title_link = entry.xpath('.//h2/a')[0]
    explore['question'] = title_link.text.replace("\n", "")

    author_el = entry.xpath('.//*[@class="author-link-line"]/*')[0]
    explore['author'] = author_el.text

    content_el = entry.xpath('.//*[@class="content"]')[0]
    explore['answer'] = content_el.text

    try:
        saved = collection.insert(explore)
        if saved:
            print('Saved to Mongo')
    except pymongo.errors.DuplicateKeyError:
        # Raised when a value for a unique field ('_id' here) is inserted
        # twice; the insert is rejected.
        print("重复插入")

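还是不能插入数据（只有第一条写入成功）：捕获异常只是让程序不中断，之后每次插入用的仍是同一个带着旧 _id 的字典。解决办法是在循环内部每次新建一个字典（如前面“保存到MongoDB”一节的写法），或者插入字典的副本：collection.insert(dict(explore))。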