爬取指定主题的论文，并按相关度排序。
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Scrape search.cnki.net result pages for a topic, ordered by relevance.

For each of the first ``PAGE_COUNT`` result pages the script prints, per
listed article, the text of its first link, that link's ``href`` and the text
of its ``count`` span. After all pages it prints the number of articles
printed.
"""
import linecache
import random
import sys

import requests
from bs4 import BeautifulSoup

KEYWORDS = '通信'  # topic to search for
SEARCH_URL = 'http://search.cnki.net/search.aspx'
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/54.0.2840.99 Safari/537.36')
PAGE_COUNT = 10  # number of result pages to request
PAGE_STEP = 15  # increment of the ``p`` query parameter between pages
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled connection from hanging the run


def _fetch_page(offset, headers):
    """Return the HTML of the result page at ``offset``, or None on failure.

    The query is passed through ``params`` so the keyword is URL-encoded and
    the page offset is built fresh for every request.
    """
    params = {
        'q': str(KEYWORDS),
        'rank': 'relevant',
        'cluster': 'all',
        'val': 'CJFDTOTAL',
        'p': offset,
    }
    try:
        response = requests.get(url=SEARCH_URL, params=params,
                                headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('request for offset {} failed: {}'.format(offset, exc),
              file=sys.stderr)
        return None
    return response.text


def _parse_articles(html):
    """Return ``(title, href, count_text)`` tuples found in a result page.

    Pages without the ``articles`` container yield an empty list; entries
    without a link are skipped; a missing ``count`` span gives an empty string.
    """
    # Replace <br> variants and self-closing tag endings before parsing,
    # exactly as the original script did.
    html = html.replace('<br>', ' ').replace('<br/>', ' ').replace('/>', '>')
    soup = BeautifulSoup(html, "html.parser")
    container = soup.find('div', class_='articles')
    if container is None:
        return []

    records = []
    for item in container.find_all('div', class_='wz_content'):
        link = item.find('a')
        if link is None:
            continue
        count = item.find('span', class_='count')
        count_text = count.text if count is not None else ''
        records.append((link.text, link.get('href', ''), count_text))
    return records


def main():
    """Fetch every result page, print its articles, then print the total."""
    headers = {'User-Agent': USER_AGENT}
    n = 0  # number of articles printed so far
    for page in range(PAGE_COUNT):
        html = _fetch_page(page * PAGE_STEP, headers)
        if html is None:
            continue
        for item_name, item_href, item_refer2 in _parse_articles(html):
            print('{} {} {}\n'.format(item_name, item_href, item_refer2))
            n += 1
    print(n)


if __name__ == "__main__":
    main()