lpyyds

一、选题背景

  小说这类文化产物可以说从小陪伴我们长大,还记得高中时晚上在宿舍看小说,第二天上课直打哈欠。如何了解一本新出小说的热度呢?可以从一些小说平台的排行数据看出小说的热度如何。为此我选择此题,对小说的周热度进行分析,并根据目前更新的总字数判断小说更新了多少。

二、网络爬虫设计方案

名称:飞卢小说周阅读热度数据爬取

内容:通过爬虫的三段式(请求、解析、提取)爬取数据,最后将数据写入 CSV 文件保存。

思路:首先用 requests 请求网页,用 etree 进行网页解析,然后使用 etree.xpath 进行数据筛选,用 for 循环进行网页的翻页,最后将筛选出的数据写入 CSV 文件保存。

难点:网站的翻页设置,以及数据的筛出。

三、结构特征分析

结构:内容导航型

 

HTML 页面解析:

小说名:

 

分类:

 

 

 周点击、字数量:

 

 

 简介:

 

 

 

节点查找、遍历:

查找:通过xpath找到标签位置。

遍历:使用for循环+计数方案进行遍历出数据。

四、网络爬虫程序设计

数据爬取与采集:

 1 import  requests
 2 from bs4 import BeautifulSoup
 3 import time
 4 import random
 5 import sys
 6 import re
 7 from tqdm import tqdm
 8 from lxml import etree
 9 
10 
# Pool of desktop-browser User-Agent strings; one is chosen at random per run
# to make the scraper's requests look less uniform.
# FIX: the original list had no commas between the string literals, so
# Python's implicit string concatenation merged all 26 entries into a single
# giant string and random.choice() always "picked" that one malformed UA.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0',
    'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1',
    'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0',
]
39 
# Default HTTP request headers for every page fetch.
# NOTE: the User-Agent is picked once at import time, not per request.
headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}
46 
def Fl(page):
    """Scrape Faloo's weekly-popularity ranking into ``Fl_pop.csv``.

    page -- number of listing pages to fetch; each page exposes up to 15
            book entries in the grid this scraper walks.

    Appends one CSV row per book: name, category, weekly hits, word count,
    synopsis. The file is opened in append mode, so the header row is
    re-written on every run (matches the original behaviour).
    """
    # FIX: open the file once, with a single encoding. The original wrote the
    # header with the platform default encoding and the data rows with utf-8,
    # and also did `file = file.close()` (rebinding the handle to None).
    with open("Fl_pop.csv", "a", encoding="utf-8") as out:
        out.write("book_name,book_class,book_hits,book_word,book_info\n")
        for page_no in range(1, page + 1):
            # FIX: the original interpolated the *mutated* `page` counter here
            # (and did `page += 1` each iteration), so with page=20 it fetched
            # pages 20..39 instead of 1..20.
            url = 'https://b.faloo.com/l_0_0_0_0_0_1_' + str(page_no) + '.html'
            res = requests.get(url, headers=headers)
            res.encoding = 'gb2312'
            html = etree.HTML(res.text)
            # The listing is a grid: div[coun1] selects the row block,
            # div[coun2] alternates between the two columns.
            for idx in range(15):
                coun1 = idx + 1
                coun2 = idx % 2 + 1
                base = "//*[@id='BookContent']/div[{}]/div[{}]".format(coun1, coun2)
                try:
                    book_name = html.xpath(base + "/div[2]/div[1]/div[1]/h1/a/text()")[0]
                    book_class = html.xpath(base + "/div[2]/div[2]/span/a/text()")[0]
                    # "周点击:12345" -> keep only the number after the colon.
                    # FIX: str.strip('周点击:') removes *characters* from both
                    # ends, not a prefix; split on the fullwidth colon instead.
                    book_hits = html.xpath(base + "/div[2]/div[2]/span/span[2]/text()")[0].split(':', 1)[-1].strip()
                    book_word = html.xpath(base + "/div[2]/div[2]/span/span[4]/text()")[0].split(':', 1)[-1].strip()
                    book_info = html.xpath(base + "/div[2]/div[3]/a/text()")[0]
                except IndexError:
                    # This grid slot is empty on the page — skip it instead of
                    # the original bare `except: pass`, which also hid real
                    # errors and froze the grid counters.
                    continue
                out.write(",".join([book_name, book_class, book_hits, book_word, book_info]) + "\n")
                print(book_name, '\n', book_class, '\n', '周点击数:', book_hits, '', '\n',
                      '字数:', book_word, '', '\n', '简介:\n', book_info, '\n')
94 
if __name__ == '__main__':
    # Each ranking page carries 30 data samples.
    pages_to_fetch = 20
    Fl(pages_to_fetch)

运行截图:

 

 

 

 

数据清洗和处理:

import pandas as pd
import numpy as np

# Load the scraped dataset, skipping malformed CSV rows.
# FIX: error_bad_lines=False was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines="skip" is the supported equivalent.
Fl = pd.read_csv(r'C:\Users\LP\Desktop\LP\Fl_pop.csv', on_bad_lines='skip')
Fl.head(20)

 

 

 

# Drop exact duplicate rows scraped across pages.
Fl = Fl.drop_duplicates()
# Drop rows that contain NaN values.
Fl = Fl.dropna(axis=0)
# Remove the synopsis *column* — it is not used in the analysis.
# (The original comment said "删除无效行"/"delete rows", but this deletes a column.)
del Fl['book_info']

import matplotlib.pyplot as plt

# Visual analysis of the top 20 books; hit counts are in units of 10k (万).
x = Fl['book_name'].head(20)
y = Fl['book_hits'].head(20)
z = Fl['book_word'].head(20)
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False    # keep minus signs readable with SimHei
plt.plot(x, y, '-.', color='c', label="点击量 单位/万")
plt.xticks(rotation=90)
plt.legend(loc="best")
plt.title("飞卢小说周点击量趋势图")
plt.xlabel("书名")  # FIX: label typo, was "书名名"
plt.ylabel("点击数")
plt.show()

 

# Word-count trend for the same top 20 books (units of 10k characters).
plt.plot(x, z, '-.', color='r', label="字数 单位/万")
plt.xticks(rotation=90)
plt.legend(loc="best")
plt.xlabel("书名")  # FIX: label typo, was "书名名"
plt.ylabel("字数")  # FIX: this axis shows word count, but was labelled "点击数"
plt.title("飞卢小说字数趋势图")
plt.show()

 

 

 

# 柱状图
# Bar chart of weekly hits for the top 20 books.
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
plt.bar(x, y, width=0.4, alpha=0.2, color='w', edgecolor='red', lw=3)
plt.xticks(rotation=90)
plt.title("飞卢小说周点击量柱状图")
plt.xlabel("小说名")
plt.ylabel("点击量")
plt.show()

 

# 柱状图
# Bar chart of word counts for the top 20 books.
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
plt.bar(x, z, width=0.4, alpha=0.2, color='g', edgecolor='red', lw=3)
plt.xticks(rotation=90)
plt.title("飞卢小说字数柱状图")
plt.xlabel("小说名")
plt.ylabel("字数")
plt.show()

 

# 水平图
# Horizontal bar chart of weekly hits.
plt.barh(x, y, alpha=0.2, height=0.4, color='b', edgecolor='gray',
         label="点击量 单位/万", lw=3)  # FIX: legend said "字数" on a hits plot
plt.title("飞卢小说周点击量水平图")
plt.legend(loc="best")
plt.xlabel("点击量")
plt.ylabel("小说名")
plt.show()

 

 

 

 

 

# 水平图
# Horizontal bar chart of word counts.
plt.barh(x, z, alpha=0.2, height=0.4, color='g', edgecolor='gray',
         label="字数 单位/万", lw=3)
plt.title("飞卢小说字数水平图")
plt.legend(loc="best")
plt.xlabel("字数")  # FIX: this axis shows word count, but was labelled "点击量"
plt.ylabel("小说名")
plt.show()

 

 

 

 

 

 

# 散点图
# Scatter plot of weekly hits per book.
plt.scatter(x, y, s=40, marker='o', color='w', edgecolor='black', alpha=0.5)
plt.title("飞卢小说周点击量散点图")
plt.xticks(rotation=90)
plt.xlabel("小说名")
plt.ylabel("点击量")
plt.show()

 

# 盒图
# Box plot summarising the weekly-hits distribution.
plt.boxplot(y,
            vert=True,       # vertical orientation
            showmeans=True)  # mark the mean as well as the median
plt.title("飞卢小说周点击量盒图")
plt.show()

云词:

 

 1 import pandas as pd
 2 import numpy as np
 3 import wordcloud as wc
 4 import random 
 5 import matplotlib.pyplot as plt
 6 
 7 Fl =  pd.read_csv(r\'C:\Users\10950\Desktop\LP\Fl_pop.csv\',error_bad_lines=False)
 8 word_cloud = wc.WordCloud(width=500,  # 词云图宽
 9                        height=500,  # 词云图高
10                        background_color=\'white\',  # 词云图背景颜色,默认为白色
11                        font_path=\'msyhbd.ttc\',  # 词云图 字体(中文需要设定为本机有的中文字体)
12                        max_font_size=400,  # 最大字体,默认为200
13                        random_state=50,  # 为每个单词返回一个PIL颜色
14                        )
15 text = Fl[\'book_class\']
16 Fl = []
17 for i in text:
18     Fl.append(i)
19 text = " ".join(Fl)
20 
21 word_cloud.generate(text)
22 plt.imshow(word_cloud)
23 plt.show()

 

 

 

 

 

总代码:

 1 import pandas as pd
 2 import numpy as np
 3 
 4 Fl =  pd.read_csv(r\'C:\Users\LP\Desktop\LP\Fl_pop.csv\',error_bad_lines=False)
 5 Fl.head(20)
 6 
 7 # 重复值处理
 8 Fl = Fl.drop_duplicates()
 9 # Nan处理
10 Fl = Fl.dropna(axis = 0)
11 # 删除无效行
12 del Fl[\'book_info\']
13 
14 import matplotlib.pyplot as plt
15 # 可视化分析
16 # y的点击数单位为万
17 x = Fl[\'book_name\'].head(20)
18 y = Fl[\'book_hits\'].head(20)
19 z = Fl[\'book_word\'].head(20)
20 plt.rcParams[\'font.sans-serif\']=[\'SimHei\'] #用来正常显示中文标签
21 plt.rcParams[\'axes.unicode_minus\']=False
22 plt.plot(x,y,\'-.\',color = \'c\',label="点击量 单位/万")
23 plt.xticks(rotation=90)
24 plt.legend(loc = "best")#图例
25 plt.title("飞卢小说周点击量趋势图")
26 plt.xlabel("书名名",)#横坐标名字
27 plt.ylabel("点击数")#纵坐标名字
28 plt.show()
29 
30 plt.plot(x,z,\'-.\',color = \'r\',label="字数 单位/万")
31 plt.xticks(rotation=90)
32 plt.legend(loc = "best")#图例
33 plt.xlabel("书名名",)#横坐标名字
34 plt.ylabel("点击数")#纵坐标名字
35 plt.title("飞卢小说字数趋势图")
36 plt.show()
37 
38 # 柱状图
39 plt.bar(x,y,alpha=0.2, width=0.4, color=\'w\', edgecolor=\'red\', lw=3)
40 plt.rcParams[\'font.sans-serif\']=[\'SimHei\'] #用来正常显示中文标签
41 plt.title("飞卢小说周点击量柱状图")
42 plt.xticks(rotation=90)
43 plt.xlabel("小说名",)#横坐标名字
44 plt.ylabel("点击量")#纵坐标名字
45 plt.show()
46 
47 # 柱状图
48 plt.bar(x,z,alpha=0.2, width=0.4, color=\'g\', edgecolor=\'red\', lw=3)
49 plt.rcParams[\'font.sans-serif\']=[\'SimHei\'] #用来正常显示中文标签
50 plt.title("飞卢小说字数柱状图")
51 plt.xticks(rotation=90)
52 plt.xlabel("小说名",)#横坐标名字
53 plt.ylabel("字数")#纵坐标名字
54 plt.show()
55 
56 # 水平图
57 plt.barh(x,y, alpha=0.2, height=0.4, color=\'b\', edgecolor=\'gray\',label=\'单位/亿\', lw=3)
58 plt.title("飞卢小说周点击量水平图")
59 plt.legend(loc = "best")#图例
60 plt.xlabel("点击量",)#横坐标名字
61 plt.ylabel("小说名")#纵坐标名字
62 plt.show()
63 
64 # 水平图
65 plt.barh(x,z, alpha=0.2, height=0.4, color=\'g\', edgecolor=\'gray\',label=\'单位/亿\', lw=3)
66 plt.title("飞卢小说字数水平图")
67 plt.legend(loc = "best")#图例
68 plt.xlabel("点击量",)#横坐标名字
69 plt.ylabel("小说名")#纵坐标名字
70 plt.show()
71 
72 # 散点图
73 plt.scatter(x,y,color=\'w\',marker=\'o\',s=40,edgecolor=\'black\',alpha=0.5)
74 plt.xticks(rotation=90)
75 plt.title("飞卢小说周点击量散点图")
76 plt.xlabel("小说名",)#横坐标名字
77 plt.ylabel("点击量")#纵坐标名字
78 plt.show()
79 
80 # 盒图
81 plt.boxplot(y,  #
82             vert=True,  # true:纵向,false:横向
83             showmeans=True)  # 显示均值
84 plt.title("飞卢小说周点击量盒图")
85 plt.show()
import pandas as pd
import numpy as np
import wordcloud as wc
import random
import matplotlib.pyplot as plt

Fl =  pd.read_csv(r\'C:\Users\10950\Desktop\LP\Fl_pop.csv\',error_bad_lines=False)
word_cloud = wc.WordCloud(width=500,  # 词云图宽
                       height=500,  # 词云图高
                       background_color=\'white\',  # 词云图背景颜色,默认为白色
                       font_path=\'msyhbd.ttc\',  # 词云图 字体(中文需要设定为本机有的中文字体)
                       max_font_size=400,  # 最大字体,默认为200
                       random_state=50,  # 为每个单词返回一个PIL颜色
                       )
text = Fl[\'book_class\']
Fl = []
for i in text:
    Fl.append(i)
text = " ".join(Fl)

word_cloud.generate(text)
plt.imshow(word_cloud)
plt.show()

五、总结

  经过此次可视化分析,可以得出各小说周点击数的情况,并根据字数判断小说已经更新到什么程度,例如《武侠神奇皮肤系统》字数>500万,达到了预期目标!在此次设计过程中我学会了如何编写爬虫程序,虽然写得很吃力,有些要实现的功能不懂,还需要去查找解决方案,但写完项目的时候有满满的成就感。不足之处在于代码经验太少,打算在暑假时加强代码练习。

 

分类:

技术点:

相关文章: