豌豆荚排行榜数据可视化分析

一、选题背景

　　现如今，手机APP品类繁多，不同种类的软件可以实现不同的功能。为此，本文根据豌豆荚应用市场排行榜进行分析，通过下载量找出最受欢迎的软件。

二、爬虫设计方案

爬虫名称：豌豆荚排行榜爬虫

内容与数据特征分析：通过获取网页界面获取想要的数据。

设计方案：使用requests发送网页请求，使用lxml的etree解析网页，使用xpath提取想要爬取的数据，最后通过内置的open()将数据保存为CSV文件。

技术难点：xpath格式转换。

三、主题页面结构特征分析

页面的结构特征分析：内容导航型

HTML页面分析：

标题、简介：

下载数量：

软件大小、分类：

节点查找、遍历：

查找：

sf_name = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/h2/a/text()".format( coun ))
sf_download = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[1]/span[1]/text()".format( coun ))
sf_size = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[1]/span[3]/text()".format( coun ))
sf_classify = html.xpath("//*[@id=\'j-top-list\']/li[{}]/a[1]/text()".format( coun ))
sf_synopsis = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[2]/text()".format( coun ))

遍历：通过循环进行提取

四、网络爬虫程序设计

数据爬取与采集：

  1 import  requests
  2 from bs4 import BeautifulSoup
  3 import time
  4 import random
  5 import sys
  6 import re
  7 from tqdm import tqdm
  8 from lxml import etree
  9 
 10 USER_AGENTS = [
 11     \'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36\'
 12     \'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36\'
 13     \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36\'
 14     \'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36\'
 15     \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36\'
 16     \'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36\'
 17     \'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36\'
 18     \'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36\'
 19     \'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36\'
 20     \'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36\'
 21     \'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36\'
 22     \'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3\'
 23     \'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0\'
 24     \'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0\'
 25     \'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0\'
 26     \'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0\'
 27     \'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0\'
 28     \'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0\'
 29     \'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0\'
 30     \'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0\'
 31     \'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0\'
 32     \'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0\'
 33     \'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1\'
 34     \'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1\'
 35     \'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0\'
 36     \'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0\'
 37 ]
 38 
 39 headers = {
 40     \'User-Agent\':random.choice(USER_AGENTS),
 41     # \'User-Agent\':\'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0\',
 42     \'Connection\':\'keep-alive\',
 43     \'Accept-Language\':\'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2\'
 44     }
 45 
 46 
 47 # 软件排行爬虫
 48 def Wdj_sf():
 49     # 创建文件
 50     file = open("sf_pop.csv", "a")
 51     file.write( "sf_name" + "," + "sf_download" + "," + "sf_size" + "," + "sf_classify" + "," + "sf_synopsis" + \'\n\')
 52     file = file.close()
 53     # 请求访问
 54     url = \'https://www.wandoujia.com/top/app\'
 55     res = requests.get(url,headers=headers)
 56     res.encoding = \'utf-8\'
 57     # soup = BeautifulSoup(res.text,\'lxml\')
 58     # print(soup)
 59     html = etree.HTML(res.text)
 60     # print(html)
 61     #获取标签内容：软件名sf_name、下次次数sf_download、软件大小sf_size、软件分类sf_classify、软件简介sf_synopsis
 62     coun = 1
 63     for i in range(1,25):
 64         try:
 65             sf_name = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/h2/a/text()".format( coun ))
 66             for i in sf_name:
 67                 sf_name = i
 68             sf_download = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[1]/span[1]/text()".format( coun ))
 69             for i in sf_download:
 70                 sf_download = i.strip(\'万亿人下载\')
 71                 sf_download = float(sf_download)
 72                 if sf_download > 100:
 73                     sf_download = sf_download/10000
 74                     sf_download = round(sf_download, 2)
 75             sf_size = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[1]/span[3]/text()".format( coun ))
 76             for i in sf_size:
 77                 sf_size = i
 78             sf_classify = html.xpath("//*[@id=\'j-top-list\']/li[{}]/a[1]/text()".format( coun ))
 79             for i in sf_classify:
 80                 sf_classify = i
 81             sf_synopsis = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[2]/text()".format( coun ))
 82             for i in sf_synopsis:
 83                 sf_synopsis = i
 84             coun += 1
 85             # sum = sf_name + str(sf_download) + sf_size + sf_classify + sf_synopsis
 86             # 保存数据
 87             print(\'软件名：\',sf_name,\'\n\',\'下载数量：\',sf_download,\'亿人下载\',\'\n\',\'软件大小：\',sf_size,\'\n\',\'软件分类：\',sf_classify,\'\n\',\'简介：\',sf_synopsis)
 88             print(\'————————————————————————————————————————————————\')
 89             with open(\'sf_pop.csv\', "a", encoding=\'utf-8\') as file1:
 90                 file1.writelines(sf_name + "," + str(sf_download) + "," + sf_size + "," + sf_classify + "," + sf_classify + \'\n\')
 91             # print(sum)
 92         except  Exception:
 93             print(Exception)
 94 
 95 # 游戏排行爬虫
 96 def Wdj_game():
 97     # 创建文件
 98     file = open("game_pop.csv", "a")
 99     file.write( "game_name" + "," + "game_download" + "," + "game_size" + "," + "game_classify" + "," + "game_synopsis" + \'\n\')
100     file = file.close()
101     url = \'https://www.wandoujia.com/top/game\'
102     res = requests.get(url,headers=headers)
103     res.encoding = \'utf-8\'
104     # soup = BeautifulSoup(res.text,\'lxml\')
105     # print(soup)
106     html = etree.HTML(res.text)
107     coun = 1
108     for i in range(1,25):
109         try:
110             game_name = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/h2/a/text()".format( coun ))
111             for i in game_name:
112                 game_name = i
113             game_download = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[1]/span[1]/text()".format( coun ))
114             for i in game_download:
115                 game_download = i.strip(\'万亿人下载\')
116                 game_download = float(game_download)
117                 if game_download > 100:
118                     game_download = game_download/10000
119                     game_download = round(game_download, 2)
120             game_size = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[1]/span[3]/text()".format( coun ))
121             for i in game_size:
122                 game_size = i
123             game_classify = html.xpath("//*[@id=\'j-top-list\']/li[{}]/a[1]/text()".format( coun ))
124             for i in game_classify:
125                 game_classify = i
126             game_synopsis = html.xpath("//*[@id=\'j-top-list\']/li[{}]/div[2]/div[2]/text()".format( coun ))
127             for i in game_synopsis:
128                 game_synopsis = i
129             coun += 1
130             print(\'软件名：\', game_name, \'\n\', \'下载数量：\', game_download, \'亿人下载\', \'\n\', \'软件大小：\', game_size, \'\n\', \'软件分类：\',game_classify, \'\n\', \'简介：\', game_synopsis)
131             print(\'————————————————————————————————————————————————\')
132             with open(\'game_pop.csv\', "a", encoding=\'utf-8\') as file1:
133                 file1.writelines(game_name + "," + str(game_download) + "," + game_size + "," + game_classify + "," + game_synopsis + \'\n\')
134  
135         except  Exception:
136             print(Exception)
137 
if __name__ == "__main__":
    # Run the software crawler, then the game crawler; each stage is
    # introduced by a banner line and a caption, and a final banner closes the run.
    stages = (
        ("—————————————————————————Start————————————————————————", "软件爬虫：", Wdj_sf),
        ("———————————————————————分界线——————————————————————————", "游戏爬虫：", Wdj_game),
    )
    for banner, caption, run_crawler in stages:
        print(banner)
        print(caption)
        run_crawler()
    print("———————————————————————End————————————————————————————")

数据清洗处理：

导入：

import pandas as pd
import numpy as np

# Load the two crawled rankings. Download counts are expressed in units of 亿 (1e8) users.
sf = pd.read_csv(r\'C:\Users\10950\Desktop\LHX\sf_pop.csv\')
game = pd.read_csv(r\'C:\Users\10950\Desktop\LHX\game_pop.csv\')
# Preview the first 20 software rows (the result is only shown in an interactive session).
sf.head(20)

重复值处理：

# Remove duplicated rows from both rankings.
sf = sf.drop_duplicates()
game = game.drop_duplicates()

数据可视化：

import matplotlib.pyplot as plt
# Visual analysis of the download counts (unit: 亿 = 1e8 downloads).
# Every chart type is drawn twice: once for the software ranking and once for
# the game ranking. Each ranking carries its own data, so a "software" chart
# can no longer end up plotting the game columns by accident.
plt.rcParams["font.sans-serif"] = ["SimHei"]  # font able to render the Chinese labels

# (subject used in titles and axis labels, names, downloads, colour per chart type)
rankings = [
    ("软件", sf["sf_name"], sf["sf_download"],
     {"line": "c", "bar": "yellow", "barh": "r", "scatter": "b"}),
    ("游戏", game["game_name"], game["game_download"],
     {"line": "m", "bar": "w", "barh": "gray", "scatter": "w"}),
]

# Line charts
for subject, names, downloads, colors in rankings:
    plt.plot(names, downloads, "-", color=colors["line"], label="单位/亿")
    plt.xticks(rotation=90)
    plt.legend(loc="best")
    plt.title(subject + "下载数量趋势图")
    plt.xlabel(subject + "名")
    plt.ylabel("下载数量")
    plt.show()

# Vertical bar charts
for subject, names, downloads, colors in rankings:
    plt.bar(names, downloads, alpha=0.2, width=0.4, color=colors["bar"], edgecolor="red", lw=3)
    plt.title(subject + "下载数量柱状图")
    plt.xticks(rotation=90)
    plt.xlabel(subject + "名")
    plt.ylabel("下载数量")
    plt.show()

# Horizontal bar charts
for subject, names, downloads, colors in rankings:
    plt.barh(names, downloads, alpha=0.2, height=0.4, color=colors["barh"], edgecolor="gray", label="单位/亿", lw=3)
    plt.title(subject + "下载数量水平图")
    plt.legend(loc="best")
    plt.xlabel("下载数量")
    plt.ylabel(subject + "名")
    plt.show()

# Scatter charts
for subject, names, downloads, colors in rankings:
    plt.scatter(names, downloads, color=colors["scatter"], marker="o", s=40, edgecolor="black", alpha=0.5)
    plt.xticks(rotation=90)
    plt.title(subject + "下载数量散点图")
    plt.xlabel(subject + "名")
    plt.ylabel("下载数量")
    plt.show()

# Pie charts
for subject, names, downloads, colors in rankings:
    plt.pie(downloads, labels=names, labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
    plt.title(subject + "下载数量饼状图")
    plt.axis("equal")  # keep the pie circular
    plt.show()

词云：

import pandas as pd
import numpy as np
import wordcloud as wc
import random 
import matplotlib.pyplot as plt


# NOTE(review): only the software CSV is read as gb2312 while the game CSV uses
# the default encoding; confirm which encoding the files are really stored in.
sf = pd.read_csv(r"C:\Users\10950\Desktop\LHX\sf_pop.csv", encoding="gb2312")
game = pd.read_csv(r"C:\Users\10950\Desktop\LHX\game_pop.csv")

# Software word cloud, built from the category of every ranked application.
# Missing categories are dropped so that joining the values cannot fail, and
# the DataFrames keep their names instead of being overwritten by lists.
word_cloud = wc.WordCloud(font_path="msyh.ttc")
text = " ".join(sf["sf_classify"].dropna().astype(str))
word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()

# Game word cloud, built the same way from the game categories.
word_cloud = wc.WordCloud(font_path="msyh.ttc")
text1 = " ".join(game["game_classify"].dropna().astype(str))
word_cloud.generate(text1)
plt.imshow(word_cloud)
plt.show()

总代码

  1 import pandas as pd
  2 import numpy as np
  3 import matplotlib.pyplot as plt
  4 
  5 # 下载数量的单位是亿人
  6 sf = pd.read_csv(r\'C:\Users\10950\Desktop\LHX\sf_pop.csv\')
  7 game = pd.read_csv(r\'C:\Users\10950\Desktop\LHX\game_pop.csv\')
  8 sf.head(20)
  9 # 重复值处理
 10 sf = sf.drop_duplicates()
 11 game = game.drop_duplicates()
 12 
 13 # sf数据可视化分析
 14 x = sf[\'sf_name\']
 15 y = sf[\'sf_download\']
 16 plt.rcParams[\'font.sans-serif\']=[\'SimHei\'] #用来正常显示中文标签
 17 plt.plot(x,y,\'-\',color = \'c\',label="单位/亿")
 18 plt.xticks(rotation=90)
 19 plt.legend(loc = "best")#图例
 20 plt.title("软件下载数量趋势图")
 21 plt.xlabel("软件名",)#横坐标名字
 22 plt.ylabel("下载数量")#纵坐标名字
 23 plt.show()
 24 
 25 # game数据可视化分析
 26 x = game[\'game_name\']
 27 y = game[\'game_download\']
 28 plt.rcParams[\'font.sans-serif\']=[\'SimHei\'] #用来正常显示中文标签
 29 plt.plot(x,y,\'-\',color = \'m\',label="单位/亿")
 30 plt.xticks(rotation=90)
 31 plt.legend(loc = "best")#图例
 32 plt.title("游戏下载数量趋势图")
 33 plt.xlabel("游戏名",)#横坐标名字
 34 plt.ylabel("下载数量")#纵坐标名字
 35 plt.show()
 36 
 37 # 柱状图
 38 plt.bar(x,y,alpha=0.2, width=0.4, color=\'yellow\', edgecolor=\'red\', lw=3)
 39 plt.rcParams[\'font.sans-serif\']=[\'SimHei\'] #用来正常显示中文标签
 40 plt.title("游戏软件数量柱状图")
 41 plt.xticks(rotation=90)
 42 plt.xlabel("软件名",)#横坐标名字
 43 plt.ylabel("下载数量")#纵坐标名字
 44 plt.show()
 45 
 46 # 柱状图
 47 plt.bar(x,y,alpha=0.2, width=0.4, color=\'w\', edgecolor=\'red\', lw=3)
 48 plt.rcParams[\'font.sans-serif\']=[\'SimHei\'] #用来正常显示中文标签
 49 plt.title("游戏下载数量柱状图")
 50 plt.xticks(rotation=90)
 51 plt.xlabel("游戏名",)#横坐标名字
 52 plt.ylabel("下载数量")#纵坐标名字
 53 plt.show()
 54 
 55 # 水平图
 56 plt.barh(x,y, alpha=0.2, height=0.4, color=\'r\', edgecolor=\'gray\',label=\'单位/亿\', lw=3)
 57 plt.title("软件下载数量水平图")
 58 plt.legend(loc = "best")#图例
 59 plt.xlabel("下载数量",)#横坐标名字
 60 plt.ylabel("软件名")#纵坐标名字
 61 plt.show()
 62 
 63 # 水平图
 64 plt.barh(x,y, alpha=0.2, height=0.4, color=\'gray\', edgecolor=\'gray\',label=\'单位/亿\', lw=3)
 65 plt.title("游戏下载数量水平图")
 66 plt.legend(loc = "best")#图例
 67 plt.xlabel("下载数量",)#横坐标名字
 68 plt.ylabel("游戏名")#纵坐标名字
 69 plt.show()
 70 
 71 # 散点图
 72 plt.scatter(x,y,color=\'b\',marker=\'o\',s=40,edgecolor=\'black\',alpha=0.5)
 73 plt.xticks(rotation=90)
 74 plt.title("软件下载数量散点图")
 75 plt.xlabel("软件名",)#横坐标名字
 76 plt.ylabel("下载数量")#纵坐标名字
 77 plt.show()
 78 
 79 # 散点图
 80 plt.scatter(x,y,color=\'w\',marker=\'o\',s=40,edgecolor=\'black\',alpha=0.5)
 81 plt.xticks(rotation=90)
 82 plt.title("软件下载数量散点图")
 83 plt.xlabel("游戏名",)#横坐标名字
 84 plt.ylabel("下载数量")#纵坐标名字
 85 plt.show()
 86 
 87 # 饼状图
 88 label_list = x
 89 explode = (0,0,0,0.1,0,0)
 90 plt.rcParams[\'font.sans-serif\']=[\'SimHei\']
 91 plt.xticks(rotation=0)
 92 plt.pie(y,labels=label_list,labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
 93 plt.title("软件下载数量饼状图")
 94 plt.axis("equal")
 95 plt.show()
 96 
 97 # 饼状图
 98 label_list = x
 99 explode = (0,0,0,0.1,0,0)
100 plt.rcParams[\'font.sans-serif\']=[\'SimHei\']
101 plt.xticks(rotation=0)
102 plt.pie(y,labels=label_list,labeldistance=1.1, autopct="%1.1f%%", shadow=False, startangle=90, pctdistance=0.6)
103 plt.title("游戏下载数量饼状图")
104 plt.axis("equal")
105 plt.show()

import pandas as pd
import numpy as np
import wordcloud as wc
import random
import matplotlib.pyplot as plt

# NOTE(review): only the software CSV is read as gb2312 while the game CSV uses
# the default encoding; confirm which encoding the files are really stored in.
sf = pd.read_csv(r"C:\Users\lhx\Desktop\LHX\sf_pop.csv", encoding="gb2312")
game = pd.read_csv(r"C:\Users\lhx\Desktop\LHX\game_pop.csv")

# Software word cloud, built from the category of every ranked application.
# Missing categories are dropped so that joining the values cannot fail, and
# the DataFrames keep their names instead of being overwritten by lists.
word_cloud = wc.WordCloud(font_path="msyh.ttc")
text = " ".join(sf["sf_classify"].dropna().astype(str))

word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()

# Game word cloud, built the same way from the game categories.
word_cloud = wc.WordCloud(font_path="msyh.ttc")
text1 = " ".join(game["game_classify"].dropna().astype(str))

word_cloud.generate(text1)
plt.imshow(word_cloud)
plt.show()

五、总结

　　从可视化结果分析来看，软件排行榜中QQ和微信的下载量最高、最受欢迎，游戏排行榜中王者荣耀和4399游戏盒最受欢迎。分析结果达到了预期效果。在设计过程中，我收获了数据处理的编程思维方式。不足之处在于绘制散点图时没有达到自己预期的效果。通过此次实验，我对可视化分析有了更深的认知，今后会继续钻研绘图方法。