功能描述:爬取东方财富网和百度股票的信息并将信息存在文件中
程序设计:
- 爬取东方财富网的股票信息,并将股票代码存在列表中
- 根据股票代码列表,爬取百度股票的详细信息存在字典中
- 将股票信息字典存在文件中
# Stock crawler: collects stock codes from the East Money listing page, then
# fetches each stock's detail page from Baidu Stocks and appends the parsed
# fields to a text file, one dict per line.
import re
import sys

import requests
import bs4
from bs4 import BeautifulSoup

# Exchange prefix ("sh" or "sz") followed by a six-digit stock code.
_STOCK_CODE_RE = re.compile(r'[s][hz]\d{6}')


def getHTMLText(url, timeout=30):
    """Download ``url`` and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            single stalled connection cannot block the whole crawl.

    Returns:
        The decoded page text, or "" if the request fails for any reason
        (connection error, timeout, invalid URL, non-2xx status).
    """
    try:
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the body rather than the
        # one guessed from the HTTP headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def getStockList(stock_list_url, slt):
    """Append every stock code linked from the listing page to ``slt``.

    Args:
        stock_list_url: URL of the page that lists the stocks.
        slt: List that receives the codes (e.g. "sh600000"); mutated in place.

    Returns:
        Always "" (kept for compatibility with existing callers).
    """
    html = getHTMLText(stock_list_url)
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.find_all('a', attrs={'target': '_blank'}):
        if not isinstance(a, bs4.element.Tag):
            continue
        href = a.attrs.get("href")
        if not isinstance(href, str):
            # Anchor without a usable href: nothing to match against.
            continue
        match = _STOCK_CODE_RE.search(href)
        if match:
            slt.append(match.group(0))
    return ""


def _parseStockInfo(html):
    """Extract the stock name and its dt/dd fields from a detail page.

    Returns:
        A dict of the parsed fields, or None when the page is empty or does
        not contain the expected name / content elements.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    name_tag = soup.find('a', 'bets-name')
    info_div = soup.find('div', 'bets-content')
    if name_tag is None or info_div is None:
        return None
    # .text joins all strings under the tag; the first token is the name.
    name_parts = name_tag.text.split()
    if not name_parts:
        return None
    info_dict = {'股票名称': name_parts[0]}
    # <dt> holds the field label, the <dd> at the same position its value.
    for dt, dd in zip(info_div.find_all('dt'), info_div.find_all('dd')):
        info_dict[dt.string] = dd.string
    return info_dict


def getStockInfo(stock_info_url, slt, file_path):
    """Fetch the detail page of every stock in ``slt`` and save the results.

    Each successfully parsed stock is appended to ``file_path`` as one line
    containing the ``str()`` of its field dict. Stocks whose page cannot be
    fetched or parsed are skipped. A progress percentage is printed after
    every stock, whether or not it succeeded.

    Args:
        stock_info_url: Base URL; the page for a stock is
            ``stock_info_url + code + '.html'``.
        slt: List of stock codes to process.
        file_path: Output file, opened in append mode as UTF-8.

    Returns:
        Always "" (kept for compatibility with existing callers).
    """
    total = len(slt)
    for count, stock in enumerate(slt, start=1):
        url = stock_info_url + stock + '.html'
        info_dict = _parseStockInfo(getHTMLText(url))
        if info_dict is not None:
            try:
                # Reopen per stock so results already written survive an
                # interrupted run.
                with open(file_path, "a", encoding="utf-8") as f:
                    f.write(str(info_dict) + '\n')
            except OSError as err:
                # Report instead of silently dropping the record, but keep
                # crawling: the failure may be transient.
                print('\ncannot write to {}: {}'.format(file_path, err),
                      file=sys.stderr)
        # '\r' rewinds to the start of the line so the percentage is
        # redrawn in place.
        print('\r当前进度:{:.2f}%'.format(count * 100 / total), end='')
    return ""


def main():
    """Crawl the stock list, then save every stock's details to disk."""
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'  # East Money
    stock_info_url = 'https://gupiao.baidu.com/stock/'  # Baidu Stocks
    slt = []  # collected stock codes
    file_path = "D://股票爬虫.txt"
    getStockList(stock_list_url, slt)
    getStockInfo(stock_info_url, slt, file_path)


if __name__ == "__main__":
    main()
效果显示: