# -*- coding: utf-8 -*-
"""
Created on Wed Oct 12 16:48:33 2016

@author: fuzzier

Download the page at ``URL``, extract the film titles linked from its
listing table and write them, one per line, to ``dytt8_hot.txt`` in the
current working directory.
"""

import requests
from bs4 import BeautifulSoup
import re
import os
import codecs

URL = 'http://www.xxxxx.net'

# Film links have the shape /<word>/<word>/<word>/<digits>/<digits>.html.
# Compiled once at import time; the dot before "html" is escaped so it only
# matches a literal ".".
FILM_HREF = re.compile(r'/\w+?/\w+?/\w+?/\d+?/\d+?\.html')

# Name of the output file created in the current working directory.
OUTPUT_NAME = 'dytt8_hot.txt'


def download_page(url, timeout=10):
    """Fetch *url* and return the raw response body as bytes.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Requests
            applies no timeout by default, so without it a stalled server
            would block the script indefinitely.

    Returns:
        The undecoded response body (``response.content``).

    Raises:
        requests.RequestException: On connection failure, timeout or an
            HTTP error status.
    """
    # The header name must be spelled "User-Agent" (hyphen, not underscore)
    # for the server to recognise it.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1581.2 Safari/537.36',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    return response.content


def parser_html(data):
    """Extract film titles from the downloaded page.

    Looks for ``<div class="bd3rl">`` containing ``<div class="co_content8">``
    and scans every ``<tr>`` inside it for a link whose ``href`` matches
    ``FILM_HREF``.

    Args:
        data: HTML document (bytes or text) as returned by ``download_page``.

    Returns:
        A list of title strings in document order. A matching link that has
        no single text child contributes the placeholder string ``'None'``.
        Rows without a matching link are skipped. An empty list is returned
        when the expected container divs are not present.
    """
    soup = BeautifulSoup(data, 'html.parser')

    # Resolve the two nested containers step by step so that a changed page
    # layout yields an empty result rather than an AttributeError on None.
    outer = soup.find('div', class_='bd3rl')
    if outer is None:
        return []
    inner = outer.find('div', class_='co_content8')
    if inner is None:
        return []

    films = []
    for row in inner.find_all('tr'):
        link = row.find('a', href=FILM_HREF)
        if link is None:
            # Row carries no film link; nothing to record.
            continue
        # ``.string`` is None when the anchor wraps nested tags instead of
        # plain text; keep the original 'None' placeholder in that case.
        films.append(link.string or 'None')
    return films


def main():
    """Download the page, parse the titles and write them to the output file."""
    html = download_page(URL)
    film_list = parser_html(html)
    # os.path.join picks the right separator for the platform; a hard-coded
    # backslash only produces a valid path on Windows.
    out_path = os.path.join(os.getcwd(), OUTPUT_NAME)
    with codecs.open(out_path, 'w', encoding='utf8') as f:
        for title in film_list:
            f.write(title + '\r\n')


if __name__ == '__main__':
    main()
# Related articles (相关文章):