任务目标: 根据CVPR论文生成热点词汇云图
任务分解:
- 爬取CVPR2019论文到数据库中;
- 分析、查找出关键词,并排序;
- 生成热点词汇云图;
- 点击热词云中的热词可以找到与之对应的文章题目;
一:Python爬虫
分析网站源代码,利用Python爬取数据,然后存储到MySQL数据库中。爬取时将题目拆解成关键词,也可以直接将摘要拆解。
import requests
import pymysql
from bs4 import BeautifulSoup

# Crawl the CVPR 2019 open-access index page, follow every paper to its
# abstract page, and insert one row per paper (title, link, abstract,
# keywords) into the MySQL table `lunwen`.

# Keyword arguments only: PyMySQL 1.0+ no longer accepts a positional host.
db = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                     password='123456', database='mytest', charset='utf8')
cursor = db.cursor()

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/74.0.3729.169 Safari/537.36"
}
url = "http://openaccess.thecvf.com/CVPR2019.py"
# Seconds to wait for each HTTP response; without a timeout a stalled
# connection would block the crawl forever.
timeout = 30

try:
    # The browser-like User-Agent is actually sent now (it used to be built
    # but never passed to requests).
    html = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly if the index page is unavailable instead of reporting
    # "ok" after inserting nothing.
    html.raise_for_status()
    soup = BeautifulSoup(html.content, 'html.parser')
    # Every paper on the index page has an <a> whose text is exactly "pdf".
    pdfs = soup.find_all(name="a", string="pdf")

    lis = []
    for i, pdf in enumerate(pdfs):
        # "<Name>_CVPR_2019_paper.pdf" -> "<Name>"
        pdf_name = pdf["href"].split('/')[-1]
        name = pdf_name.split('.')[0].replace("_CVPR_2019_paper", "")
        link = ("http://openaccess.thecvf.com/content_CVPR_2019/html/"
                + name + "_CVPR_2019_paper.html")

        html1 = requests.get(link, headers=headers, timeout=timeout)
        soup1 = BeautifulSoup(html1.content, 'html.parser')
        weizhi = soup1.find('div', attrs={'id': 'abstract'})
        # Reset per paper: a page without an abstract div must not inherit
        # the abstract of the previously processed paper.
        jianjie = weizhi.get_text() if weizhi else ""
        print("这是第" + str(i) + "条数据")

        # The underscore-separated words of the title become the keywords.
        keywords = ",".join(name.split('_'))

        lis.append({
            'title': name,
            'link': link,
            'abstract': jianjie,
            'keywords': keywords,
        })

    for info in lis:
        # Column list, e.g. '`title`, `link`, `abstract`, `keywords`'
        cols = ", ".join('`{}`'.format(k) for k in info)
        print(cols)
        # Named placeholders, e.g. '%(title)s, %(link)s, ...'; the values
        # themselves are bound by the driver, never formatted into the SQL.
        val_cols = ', '.join('%({})s'.format(k) for k in info)
        print(val_cols)
        sql = "insert into lunwen(%s) values(%s)"
        res_sql = sql % (cols, val_cols)
        print(res_sql)
        cursor.execute(res_sql, info)
        # Commit per row so already-inserted papers survive a later failure.
        db.commit()
        num = 1
        print(num)
finally:
    # Release the cursor and the connection even if the crawl or an insert
    # raised.
    cursor.close()
    db.close()

print("ok")