一、选题背景
现如今,手机APP品类繁多,不同种类的软件可以实现不同的功能。为此,本文对豌豆荚应用市场的排行榜进行分析,通过下载量找出最受欢迎的软件和游戏。
二、爬虫设计方案
爬虫名称:豌豆荚排行榜爬虫
内容与数据特征分析:请求排行榜网页,从页面HTML中提取软件名、下载数量、软件大小、分类和简介等数据。
设计方案:使用requests请求网页,用lxml的etree解析页面,用xpath定位并提取要爬取的数据,最后将结果写入CSV文件保存。
技术难点:xpath格式转换。
三、主题页面结构特征分析
页面的结构特征分析:内容导航型
HTML页面分析:
标题、简介:
下载数量:
软件大小、分类:
节点查找、遍历:
查找:
# Look up each field of the ranking entry at 1-based position `coun`.
# `html` is the parsed page; every query is anchored at the element whose
# id is "j-top-list" and returns a list of matching text nodes.
item = "//*[@id='j-top-list']/li[{}]".format(coun)
sf_name = html.xpath(item + "/div[2]/h2/a/text()")                # software name
sf_download = html.xpath(item + "/div[2]/div[1]/span[1]/text()")  # download count
sf_size = html.xpath(item + "/div[2]/div[1]/span[3]/text()")      # package size
sf_classify = html.xpath(item + "/a[1]/text()")                   # category
sf_synopsis = html.xpath(item + "/div[2]/div[2]/text()")          # synopsis
遍历:通过循环进行提取
四、网络爬虫程序设计
数据爬取与采集:
"""Crawler for the Wandoujia top-software and top-game ranking pages.

Each ranking page is fetched once, the first ``TOP_LIST_SIZE`` entries are
extracted with XPath, printed, and appended to a CSV file.
"""
import csv
import os
import random
import re
import sys
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm import tqdm

# One entry per browser. The commas matter: without them Python concatenates
# adjacent string literals and the list ends up holding a single giant string.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
]

# Request headers shared by both crawlers; the User-Agent is picked once at
# import time.
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}

# Ranking positions 1..24 are crawled.
TOP_LIST_SIZE = 24

# XPath of one ranking entry; formatted with its 1-based position.
_ITEM_XPATH = "//*[@id='j-top-list']/li[{}]"


def _last_text(html, xpath):
    """Return the last text node matched by *xpath*, stripped, or '' if none."""
    nodes = html.xpath(xpath)
    return nodes[-1].strip() if nodes else ""


def _download_in_yi(text):
    """Convert a download-count label to a float in units of 亿 (10^8).

    The label is assumed to look like '6.5亿人下载' or '5088.4万人下载'
    (NOTE(review): confirm against the live page). The unit character decides
    the scale, so small 万 counts are no longer mistaken for 亿.

    Raises:
        ValueError: if *text* contains no number.
    """
    match = re.search(r"\d+(?:\.\d+)?", text)
    if match is None:
        raise ValueError("no download count in {!r}".format(text))
    value = float(match.group())
    if "亿" in text:
        pass
    elif "万" in text:
        value /= 10000
    else:
        # No unit character: treat the number as a plain count of users.
        value /= 100000000
    return round(value, 2)


def _scrape_top_list(url, csv_path, columns):
    """Fetch one ranking page, print its entries and append them to *csv_path*.

    Args:
        url: ranking page to request.
        csv_path: CSV file to append to; the header row is written only when
            the file does not exist yet or is empty.
        columns: the five header names, in the order
            name, download, size, classify, synopsis.

    Raises:
        requests.RequestException: if the request fails, times out, or
            returns an HTTP error status.
    """
    res = requests.get(url, headers=headers, timeout=10)
    res.raise_for_status()
    res.encoding = 'utf-8'
    html = etree.HTML(res.text)
    if html is None:
        print('页面内容为空,无法解析:', url)
        return

    need_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    # newline="" lets the csv module control line endings; csv.writer quotes
    # fields that contain commas, so a synopsis cannot shift the columns.
    with open(csv_path, "a", encoding='utf-8', newline="") as out:
        writer = csv.writer(out)
        if need_header:
            writer.writerow(columns)
        for rank in range(1, TOP_LIST_SIZE + 1):
            item = _ITEM_XPATH.format(rank)
            name = _last_text(html, item + "/div[2]/h2/a/text()")
            if not name:
                print('第{}项不存在,已跳过'.format(rank))
                continue
            try:
                download = _download_in_yi(
                    _last_text(html, item + "/div[2]/div[1]/span[1]/text()"))
            except ValueError as exc:
                # Skip only this entry; the loop still advances to the next rank.
                print('第{}项解析失败,已跳过:{}'.format(rank, exc))
                continue
            size = _last_text(html, item + "/div[2]/div[1]/span[3]/text()")
            classify = _last_text(html, item + "/a[1]/text()")
            synopsis = _last_text(html, item + "/div[2]/div[2]/text()")

            print('软件名:', name, '\n', '下载数量:', download, '亿人下载', '\n',
                  '软件大小:', size, '\n', '软件分类:', classify, '\n',
                  '简介:', synopsis)
            print('————————————————————————————————————————————————')
            writer.writerow([name, download, size, classify, synopsis])


def Wdj_sf():
    """Crawl the top-software ranking and append it to sf_pop.csv."""
    _scrape_top_list(
        'https://www.wandoujia.com/top/app',
        "sf_pop.csv",
        ["sf_name", "sf_download", "sf_size", "sf_classify", "sf_synopsis"],
    )


def Wdj_game():
    """Crawl the top-game ranking and append it to game_pop.csv."""
    _scrape_top_list(
        'https://www.wandoujia.com/top/game',
        "game_pop.csv",
        ["game_name", "game_download", "game_size", "game_classify", "game_synopsis"],
    )


if __name__ == '__main__':
    print('—————————————————————————Start————————————————————————')
    print('软件爬虫:')
    Wdj_sf()
    print('———————————————————————分界线——————————————————————————')
    print('游戏爬虫:')
    Wdj_game()
    print('———————————————————————End————————————————————————————')
数据清洗处理:
导入:
import pandas as pd
import numpy as np

# Load the crawler output. The download columns are in units of 亿 (10^8) users.
# The crawler writes its data rows as UTF-8, so the encoding is stated explicitly.
sf = pd.read_csv(r'C:\Users\10950\Desktop\LHX\sf_pop.csv', encoding='utf-8')
game = pd.read_csv(r'C:\Users\10950\Desktop\LHX\game_pop.csv', encoding='utf-8')
sf.head(20)  # notebook preview of the first 20 software rows
重复值处理:
# Remove duplicate rows: the crawler opens its CSV files in append mode, so
# running it more than once stores the same entries again.
sf = sf.drop_duplicates()
game = game.drop_duplicates()
数据可视化:
import matplotlib.pyplot as plt

# Use a CJK-capable font so the Chinese titles and tick labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Keep each dataset under its own names. Sharing one x/y pair made every
# "software" chart after the first one silently plot the game data.
sf_x, sf_y = sf['sf_name'], sf['sf_download']
game_x, game_y = game['game_name'], game['game_download']


def _show(title, xlabel, ylabel, rotate=True, legend=False):
    """Decorate the current figure with title and axis labels, then display it."""
    if rotate:
        plt.xticks(rotation=90)  # names are long; rotate them so they do not overlap
    if legend:
        plt.legend(loc="best")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def _show_pie(names, values, title):
    """Display a pie chart of each entry's share of the total downloads."""
    plt.pie(values, labels=names, labeldistance=1.1, autopct="%1.1f%%",
            shadow=False, startangle=90, pctdistance=0.6)
    plt.title(title)
    plt.axis("equal")  # keep the pie circular
    plt.show()


# Line charts
plt.plot(sf_x, sf_y, '-', color='c', label="单位/亿")
_show("软件下载数量趋势图", "软件名", "下载数量", legend=True)
plt.plot(game_x, game_y, '-', color='m', label="单位/亿")
_show("游戏下载数量趋势图", "游戏名", "下载数量", legend=True)

# Bar charts
plt.bar(sf_x, sf_y, alpha=0.2, width=0.4, color='yellow', edgecolor='red', lw=3)
_show("软件下载数量柱状图", "软件名", "下载数量")
plt.bar(game_x, game_y, alpha=0.2, width=0.4, color='w', edgecolor='red', lw=3)
_show("游戏下载数量柱状图", "游戏名", "下载数量")

# Horizontal bar charts (names sit on the y axis, so no tick rotation)
plt.barh(sf_x, sf_y, alpha=0.2, height=0.4, color='r', edgecolor='gray',
         label='单位/亿', lw=3)
_show("软件下载数量水平图", "下载数量", "软件名", rotate=False, legend=True)
plt.barh(game_x, game_y, alpha=0.2, height=0.4, color='gray', edgecolor='gray',
         label='单位/亿', lw=3)
_show("游戏下载数量水平图", "下载数量", "游戏名", rotate=False, legend=True)

# Scatter plots
plt.scatter(sf_x, sf_y, color='b', marker='o', s=40, edgecolor='black', alpha=0.5)
_show("软件下载数量散点图", "软件名", "下载数量")
plt.scatter(game_x, game_y, color='w', marker='o', s=40, edgecolor='black', alpha=0.5)
_show("游戏下载数量散点图", "游戏名", "下载数量")

# Pie charts
_show_pie(sf_x, sf_y, "软件下载数量饼状图")
_show_pie(game_x, game_y, "游戏下载数量饼状图")
词云:
import pandas as pd
import numpy as np
import wordcloud as wc
import random
import matplotlib.pyplot as plt

# Both files are read as UTF-8, the encoding the crawler uses for its data rows.
sf = pd.read_csv(r'C:\Users\10950\Desktop\LHX\sf_pop.csv', encoding='utf-8')
game = pd.read_csv(r'C:\Users\10950\Desktop\LHX\game_pop.csv', encoding='utf-8')


def _show_category_cloud(categories):
    """Display a word cloud built from a Series of category names."""
    # msyh.ttc (Microsoft YaHei) is needed to render Chinese glyphs.
    cloud = wc.WordCloud(font_path='msyh.ttc')
    # Missing categories are dropped because str.join rejects NaN floats.
    cloud.generate(" ".join(categories.dropna().astype(str)))
    plt.imshow(cloud)
    plt.show()


# Software word cloud
_show_category_cloud(sf['sf_classify'])

# Game word cloud
_show_category_cloud(game['game_classify'])
总代码
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the crawler output. The download columns are in units of 亿 (10^8) users.
sf = pd.read_csv(r'C:\Users\10950\Desktop\LHX\sf_pop.csv')
game = pd.read_csv(r'C:\Users\10950\Desktop\LHX\game_pop.csv')
sf.head(20)  # notebook preview of the first 20 software rows

# Remove duplicate rows left behind by repeated crawler runs.
sf = sf.drop_duplicates()
game = game.drop_duplicates()

# Use a CJK-capable font so the Chinese titles and tick labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']

# Keep each dataset under its own names. Sharing one x/y pair made every
# "software" chart after the first one silently plot the game data.
sf_x, sf_y = sf['sf_name'], sf['sf_download']
game_x, game_y = game['game_name'], game['game_download']


def _show(title, xlabel, ylabel, rotate=True, legend=False):
    """Decorate the current figure with title and axis labels, then display it."""
    if rotate:
        plt.xticks(rotation=90)  # names are long; rotate them so they do not overlap
    if legend:
        plt.legend(loc="best")
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.show()


def _show_pie(names, values, title):
    """Display a pie chart of each entry's share of the total downloads."""
    plt.pie(values, labels=names, labeldistance=1.1, autopct="%1.1f%%",
            shadow=False, startangle=90, pctdistance=0.6)
    plt.title(title)
    plt.axis("equal")  # keep the pie circular
    plt.show()


# Line charts
plt.plot(sf_x, sf_y, '-', color='c', label="单位/亿")
_show("软件下载数量趋势图", "软件名", "下载数量", legend=True)
plt.plot(game_x, game_y, '-', color='m', label="单位/亿")
_show("游戏下载数量趋势图", "游戏名", "下载数量", legend=True)

# Bar charts
plt.bar(sf_x, sf_y, alpha=0.2, width=0.4, color='yellow', edgecolor='red', lw=3)
_show("软件下载数量柱状图", "软件名", "下载数量")
plt.bar(game_x, game_y, alpha=0.2, width=0.4, color='w', edgecolor='red', lw=3)
_show("游戏下载数量柱状图", "游戏名", "下载数量")

# Horizontal bar charts (names sit on the y axis, so no tick rotation)
plt.barh(sf_x, sf_y, alpha=0.2, height=0.4, color='r', edgecolor='gray',
         label='单位/亿', lw=3)
_show("软件下载数量水平图", "下载数量", "软件名", rotate=False, legend=True)
plt.barh(game_x, game_y, alpha=0.2, height=0.4, color='gray', edgecolor='gray',
         label='单位/亿', lw=3)
_show("游戏下载数量水平图", "下载数量", "游戏名", rotate=False, legend=True)

# Scatter plots
plt.scatter(sf_x, sf_y, color='b', marker='o', s=40, edgecolor='black', alpha=0.5)
_show("软件下载数量散点图", "软件名", "下载数量")
plt.scatter(game_x, game_y, color='w', marker='o', s=40, edgecolor='black', alpha=0.5)
_show("游戏下载数量散点图", "游戏名", "下载数量")

# Pie charts
_show_pie(sf_x, sf_y, "软件下载数量饼状图")
_show_pie(game_x, game_y, "游戏下载数量饼状图")
import pandas as pd
import numpy as np
import wordcloud as wc
import random
import matplotlib.pyplot as plt
# Both files are read as UTF-8, the encoding the crawler uses for its data rows.
sf = pd.read_csv(r'C:\Users\lhx\Desktop\LHX\sf_pop.csv', encoding='utf-8')
game = pd.read_csv(r'C:\Users\lhx\Desktop\LHX\game_pop.csv', encoding='utf-8')


def _show_category_cloud(categories):
    """Display a word cloud built from a Series of category names."""
    # msyh.ttc (Microsoft YaHei) is needed to render Chinese glyphs.
    cloud = wc.WordCloud(font_path='msyh.ttc')
    # Missing categories are dropped because str.join rejects NaN floats.
    cloud.generate(" ".join(categories.dropna().astype(str)))
    plt.imshow(cloud)
    plt.show()


# Software word cloud
_show_category_cloud(sf['sf_classify'])

# Game word cloud
_show_category_cloud(game['game_classify'])
五、总结
从可视化结果来看,软件排行中QQ和微信的下载量最高、最受欢迎,游戏排行中王者荣耀和4399游戏盒最受欢迎,分析结果达到了预期效果。在设计过程中,我收获了数据处理的编程思维方式。不足之处在于绘制散点图时没有达到自己预期的效果。通过此次实验,我对可视化分析有了更深的认知,今后会继续钻研绘图。