一个很简单的爬虫。
从这里学习的,解释得挺好:https://xlzd.me/2015/12/16/python-crawler-03
写这段代码时参考的学习链接就是上面这个。
其他的一些东西在代码里面有详细注释。
# encoding = utf-8
"""Scrape the Douban Movies Top 250 chart and save the titles to a file.

Walks every page of https://movie.douban.com/top250 by following the
"next page" link, extracts the primary title of each movie, and writes
them (one title per line, UTF-8) to a file named ``douban_movies``.
"""
import codecs
import requests
from bs4 import BeautifulSoup

# Entry URL of the Top-250 chart; the "next page" hrefs found in the
# pager are relative, so this is also the prefix for absolute URLs.
DOWNLOADURL = 'https://movie.douban.com/top250'


def download_page(url):
    """Fetch *url* and return the raw HTML response body as bytes.

    A browser-like User-Agent header is sent because Douban rejects
    requests that identify themselves as scripts.
    """
    # BUG FIX: original header contained the typo "compatibel".
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    return requests.get(url, headers=headers).content


def parse_html(html):
    """Parse one chart page.

    Returns a tuple ``(name_list, next_url)``: *name_list* holds the
    movie titles found on this page, and *next_url* is the absolute URL
    of the next page, or ``None`` when this is the last page.
    """
    soup = BeautifulSoup(html, 'lxml')
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})  # chart container

    name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        # BUG FIX: the original passed attrs as a *set* {'class', 'hd'},
        # which is not a valid attribute filter; ``attrs`` must be a dict
        # mapping attribute name to value.
        detail = movie_li.find('div', attrs={'class': 'hd'})
        movie_name = detail.find('span', attrs={'class': 'title'}).get_text()
        name_list.append(movie_name)

    # The pager's "next" span contains an <a> on every page except the
    # last one. Guard against the span itself being missing so the crawl
    # ends cleanly instead of raising AttributeError.
    next_span = soup.find('span', attrs={'class': 'next'})
    have_next = next_span.find('a') if next_span else None
    if have_next:
        return name_list, DOWNLOADURL + have_next['href']
    return name_list, None


def main():
    """Crawl every chart page and append all titles to ``douban_movies``."""
    url = DOWNLOADURL
    # codecs.open handles UTF-8 encoding of the Chinese titles; the
    # ``with`` block guarantees the file is closed even on error.
    with codecs.open('douban_movies', 'wb', encoding='utf-8') as fp:
        while url:
            html = download_page(url)
            name_list, url = parse_html(html)
            # Join the page's titles with newlines and write them as one chunk.
            fp.write(u'{movies}\n'.format(movies='\n'.join(name_list)))


if __name__ == '__main__':
    main()