解决 python 爬虫时,CSV保存乱码情况(以豆瓣名著信息为例)
我这是通过学习一位大神的博客来实现的,他的博客为
https://blog.csdn.net/weixin_43901998/article/details/8835771
豆瓣名著网址如下:
https://book.douban.com/tag/名著
不多说,上代码
import requests
import csv
from lxml import etree
import codecs #可以提前以“ab+”的方式打开文件
with open("G:\爬虫\豆瓣名著排名1.csv","ab+") as fp:
fp.write(codecs.BOM_UTF8)#,第一次打开,这为了防止在Windows下打开CSV文件出现乱码
fp = open("G:\爬虫\豆瓣名著排名1.csv",'a+',newline='',encoding='utf-8')#第二次打开,开始写入数据
writer = csv.writer(fp)
writer.writerow(['书名','链接','作者','出版社','日期','价格','评分'])
headers ={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
"referer":"https://book.douban.com/tag/%E5%90%8D%E8%91%97?start=20&type=T"
}
urls =["https://book.douban.com/tag/%E5%90%8D%E8%91%97?start={}".format(i) for i in range(0,20*84,20)]
# 看网页源代码可分析出,总共有84页,start从0开始,每进一页+20,
for url in urls:
r = requests.get(url,headers=headers)
html = etree.HTML(r.text)
infos = html.xpath('//div[@class="info"]')
for info in infos: #开始通过xpath来获取对应信息
name = info.xpath('h2/a/text()')[0]
url_ = info.xpath('h2/a/@href')[0]
authors = info.xpath('//*[@id="subject_list"]/ul/li[1]/div[2]/div[1]/text()')[0]
author = authors.split('/')[0]
publish = authors.split('/')[1]
dath = authors.split('/')[-2]
price = authors.split('/')[-1]
score = html.xpath('//*[@id="subject_list"]/ul/li[1]/div[2]/div[2]/span[2]/text()')
writer.writerow([name,url_,author,publish,dath,price,score])
fp.close()
#最后,得关闭文件