豆瓣里可以搜到评分前250的电影:https://movie.douban.com/top250 。可是这250条被分成了10页(每页25条),我只能一页一页地看。想要一次看完250条,因此有了下面使用xpath的爬虫,把250条电影数据放进csv文件:
import csv

import lxml.html
import requests

# Page URL template; {} receives the zero-based offset of the first entry on the page.
doubanUrl = 'https://movie.douban.com/top250?start={}&filter='

PAGE_SIZE = 25    # entries per result page
PAGE_COUNT = 10   # 10 pages x 25 entries = 250 movies

# Headers sent with every request.
# NOTE(review): Douban is reported to reject the default python-requests
# User-Agent, hence the browser-like value -- confirm against the live site.
REQUEST_HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
    ),
}
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

# Deletes spaces, carriage returns and line feeds in a single pass.
_LAYOUT_WHITESPACE = str.maketrans('', '', ' \r\n')


def getSource(url):
    """Download *url* and return the raw response body.

    :param url: address of the page to fetch
    :return: the undecoded body (bytes), handed to lxml as-is
    :raises requests.RequestException: on network failure, timeout or an
        HTTP error status
    """
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of silently scraping zero movies.
    response.raise_for_status()
    # Response.encoding only influences Response.text; since the bytes are
    # returned, it is deliberately not set here.
    return response.content


def getEveryItem(source):
    """Extract the movies listed on one result page.

    Each movie becomes a dict with the keys ``title``, ``url``,
    ``directorAndActor``, ``star`` and ``quote``.

    :param source: HTML of one result page, as returned by getSource
    :return: [movie1_dict, movie2_dict, movie3_dict, ...]
    :raises IndexError: if an entry has no link or no rating element
    """
    document = lxml.html.document_fromstring(source)
    movieList = []

    # Select the container of every movie first, then query inside each one
    # with relative paths ("coarse first, fine second").
    for item in document.xpath('//div[@class="info"]'):
        title = item.xpath('div[@class="hd"]/a/span[@class="title"]/text()')
        print(title)
        otherTitle = item.xpath('div[@class="hd"]/a/span[@class="other"]/text()')

        # Evaluate the href query once and reuse it for the debug output.
        links = item.xpath('div[@class="hd"]/a/@href')
        link = links[0]
        print('看看多少a标签')
        print(links)

        directorAndActor = item.xpath('div[@class="bd"]/p[@class=""]/text()')
        star = item.xpath(
            'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        )[0]
        # Not every movie has a one-line quote; fall back to an empty string.
        quotes = item.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')
        quote = quotes[0] if quotes else ''

        movieList.append({
            'title': ''.join(title + otherTitle),
            'url': link,
            # The raw text nodes are padded with layout whitespace; strip
            # spaces, CR and LF so the field is a single compact string.
            'directorAndActor': ''.join(directorAndActor).translate(_LAYOUT_WHITESPACE),
            'star': star,
            'quote': quote,
        })
    return movieList


def writeData(movieList, filename='doubanMovie_example2.csv', encoding='UTF-8'):
    """Write the movies to a CSV file, echoing each row to the console.

    The file is created (or overwritten) by this call; it does not need to
    exist beforehand.

    :param movieList: list of movie dicts as produced by getEveryItem
    :param filename: path of the CSV file to write
    :param encoding: text encoding of the file; pass ``'utf-8-sig'`` to
        prepend a BOM, which helps Excel recognise the file as UTF-8
    """
    with open(filename, 'w', encoding=encoding, newline='') as f:
        writer = csv.DictWriter(
            f, fieldnames=['title', 'directorAndActor', 'star', 'quote', 'url']
        )
        writer.writeheader()
        for each in movieList:
            print(each)
            writer.writerow(each)


def _rating(movie):
    """Return the movie's rating as a number, so sorting is numeric.

    Comparing the raw strings would rank '10.0' below '9.7'. A rating that
    cannot be parsed sorts after every real rating.
    """
    try:
        return float(movie['star'])
    except ValueError:
        return float('-inf')


def main():
    """Scrape every result page, sort by rating (highest first), write the CSV."""
    movieList = []
    for page in range(PAGE_COUNT):
        # The start offset advances by one page of entries: 0, 25, 50, ...
        pageLink = doubanUrl.format(page * PAGE_SIZE)
        print(pageLink)
        movieList += getEveryItem(getSource(pageLink))
    # Preview the first ten scraped entries.
    print(movieList[:10])
    # Stable sort: movies with equal ratings keep their scraped order.
    movieList.sort(key=_rating, reverse=True)
    writeData(movieList)


if __name__ == '__main__':
    main()
显示结果:
1、控制台显示
2、csv文件。用Excel直接打开时,中文可能出现乱码:文件是不带BOM的UTF-8编码,Excel无法自动识别。这时用记事本打开该文件,另存为"带有BOM的UTF-8"编码,再用Excel打开就好了。