过年了,爬爬看python需要什么技能才能有备无患。
大体思路:
- 爬所需信息
- 爬一爬详情页做个可视化词云,看看所需技能
- 做一做数据可视化
所需库:
- csv,保存数据用的
- selenium,模拟真人访问网站,因为requests很容易被反爬
- parsel和requests,可以尝试爬详情页
- random,随机休眠用的
- time,时间模块儿
翻页爬取python岗位信息代码:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import csv
import time
import random
import requests
import parsel

# Append mode so reruns keep earlier rows; utf-8-sig lets Excel open the
# Chinese text correctly.
f = open('boss.csv', mode='a', encoding='utf-8-sig', newline='')
csvWriter = csv.DictWriter(f, fieldnames=[
    '标题',
    '地区',
    '薪资',
    '经验',
    '公司名',
    '公司领域',
    '福利',
    '公司规模',
    '详情页',
    # '所需技能',
])
csvWriter.writeheader()  # write the header row once

url = 'https://www.zhipin.com/c100010000/?query=python&ka=sel-city-100010000'
driver = webdriver.Chrome()
driver.get(url=url)
driver.implicitly_wait(10)  # implicit wait applied to every element lookup
driver.maximize_window()    # maximize the browser window


def get_job_details():
    """Scrape every job card on the current listing page and append one CSV row per job."""
    lis = driver.find_elements(By.CSS_SELECTOR, '.job-list ul li')
    for li in lis:
        title = li.find_element(By.CSS_SELECTOR, '.job-name a').text                        # job title
        area = li.find_element(By.CSS_SELECTOR, '.job-area').text                           # location
        salary = li.find_element(By.CSS_SELECTOR, '.job-limit .red').text                   # salary
        experience = li.find_element(By.CSS_SELECTOR, '.job-limit p').text                  # work experience
        companyName = li.find_element(By.CSS_SELECTOR, '.company-text h3 a').text           # company name
        companyStyle = li.find_element(By.CSS_SELECTOR, '.company-text p a').text           # company sector
        welfare = li.find_element(By.CSS_SELECTOR, '.info-desc').text                       # benefits
        peopleInclude = li.find_element(By.CSS_SELECTOR, '.company-text p em').text         # company size
        detailPage = li.find_element(By.CSS_SELECTOR, '.job-name a').get_attribute('href')  # detail page URL
        # skillsNeed = get_skills_desc(desc_url=detailPage)
        # BUG FIX: the original print also passed `skillsNeed`, but the line
        # assigning it is commented out -> NameError on the very first job.
        print(title, area, salary, experience, companyName, companyStyle, peopleInclude, detailPage, sep=' | ')
        dit = {
            '标题': title,
            '地区': area,
            '薪资': salary,
            '经验': experience,
            '公司名': companyName,
            '公司领域': companyStyle,
            '福利': welfare,
            '公司规模': peopleInclude,
            '详情页': detailPage,
            # '所需技能': skillsNeed,
        }
        csvWriter.writerow(dit)  # write one row per job


# def get_skills_desc(desc_url):
#     # Fetch the "required skills" text from a job's detail page in a new tab.
#     driver.execute_script('window.open()')                       # open a new tab
#     driver.switch_to.window(window_name=driver.window_handles[-1])
#     driver.get(url=desc_url)
#     driver.implicitly_wait(10)
#     skills_need = driver.find_element(By.CSS_SELECTOR, '.text')
#     return skills_need

for page in range(1, 100 + 1):
    print(f'------------------------正在爬取第{page}页内容----------------------------')
    time.sleep(random.uniform(2, 5))  # random sleep to look less like a bot
    get_job_details()  # scrape the current page
    # BUG FIX: find_element raises NoSuchElementException when the button is
    # missing, so the original `if next_button:` could never be False.
    # find_elements returns a (possibly empty) list instead.
    next_buttons = driver.find_elements(By.CSS_SELECTOR, '.page .next')
    if next_buttons:  # a "next" button exists -> there is another page
        next_buttons[0].click()
    else:
        print(f'已经没有了! 第{page}已经是最后一页!')
        break  # BUG FIX: stop paging once the last page is reached

f.close()      # BUG FIX: flush buffered CSV rows to disk
driver.quit()  # quit the browser
爬虫代码本来想把详情页都爬了,一口气吃成胖子,发现有错误,因为详情页的内容只需要做词云(展示python岗位所需的技能)。所以分开爬,不注重过程,只注重结果。
优化爬虫代码,把详情页的所需技能也爬出来:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import csv
import time
import random
import requests
import parsel

# Append mode so reruns keep earlier rows; utf-8-sig lets Excel open the
# Chinese text correctly.
f = open('boss.csv', mode='a', encoding='utf-8-sig', newline='')
csvWriter = csv.DictWriter(f, fieldnames=[
    '标题',
    '地区',
    '薪资',
    '经验',
    '公司名',
    '公司领域',
    '福利',
    '公司规模',
    '详情页',
    '所需技能',
])
csvWriter.writeheader()  # write the header row once

url = 'https://www.zhipin.com/c100010000/?query=python&ka=sel-city-100010000'
driver = webdriver.Chrome()
driver.get(url=url)
driver.implicitly_wait(10)  # implicit wait applied to every element lookup
driver.maximize_window()    # maximize the browser window


def get_job_details():
    """Scrape each job card, open its detail page in a second tab for the
    "required skills" text, and append one CSV row per job."""
    lis = driver.find_elements(By.CSS_SELECTOR, '.job-list ul li')
    for li in lis:
        title = li.find_element(By.CSS_SELECTOR, '.job-name a').text                        # job title
        area = li.find_element(By.CSS_SELECTOR, '.job-area').text                           # location
        salary = li.find_element(By.CSS_SELECTOR, '.job-limit .red').text                   # salary
        experience = li.find_element(By.CSS_SELECTOR, '.job-limit p').text                  # work experience
        companyName = li.find_element(By.CSS_SELECTOR, '.company-text h3 a').text           # company name
        companyStyle = li.find_element(By.CSS_SELECTOR, '.company-text p a').text           # company sector
        welfare = li.find_element(By.CSS_SELECTOR, '.info-desc').text                       # benefits
        peopleInclude = li.find_element(By.CSS_SELECTOR, '.company-text p em').text         # company size
        detailPage = li.find_element(By.CSS_SELECTOR, '.job-name a').get_attribute('href')  # detail page URL
        # BUG FIX: give skillsNeed a default; in the original it was only
        # assigned inside the `if`, so a job card without a detail link
        # raised NameError at the print/writerow below.
        skillsNeed = ''
        if detailPage:  # only visit the detail page when a link exists
            driver.execute_script('window.open()')  # open a new tab
            driver.switch_to.window(window_name=driver.window_handles[-1])  # switch to the new tab
            driver.maximize_window()
            driver.get(detailPage)      # load the detail page
            driver.implicitly_wait(10)  # wait for it to render
            skillsNeed = driver.find_element(By.CSS_SELECTOR, '.text').text
            driver.close()  # close the detail tab
            time.sleep(random.uniform(1, 5))  # random sleep between detail visits
            driver.switch_to.window(driver.window_handles[0])  # back to the listing tab
        print(title, area, salary, experience, companyName, companyStyle, peopleInclude, detailPage, skillsNeed, sep=' | ')
        dit = {
            '标题': title,
            '地区': area,
            '薪资': salary,
            '经验': experience,
            '公司名': companyName,
            '公司领域': companyStyle,
            '福利': welfare,
            '公司规模': peopleInclude,
            '详情页': detailPage,
            '所需技能': skillsNeed,
        }
        csvWriter.writerow(dit)  # write one row per job


for page in range(1, 10 + 1):
    print(f'------------------------正在爬取第{page}页内容----------------------------')
    time.sleep(random.uniform(2, 5))  # random sleep to look less like a bot
    get_job_details()  # scrape the current page
    # BUG FIX: find_element raises NoSuchElementException when the button is
    # missing, so the original `if next_button:` could never be False.
    # find_elements returns a (possibly empty) list instead.
    next_buttons = driver.find_elements(By.CSS_SELECTOR, '.page .next')
    if next_buttons:  # a "next" button exists -> there is another page
        next_buttons[0].click()
    else:
        print(f'已经没有了! 第{page}已经是最后一页!')
        break  # BUG FIX: stop paging once the last page is reached

f.close()      # BUG FIX: flush buffered CSV rows to disk
driver.quit()  # quit the browser
做个可视化大屏:
import os

import pandas as pd
from pyecharts.charts import *
import pyecharts.options as opts
from pyecharts.globals import ThemeType

df = pd.read_csv('boss.csv')  # load the scraped data

# Group regions by their first two characters (e.g. 北京, 上海) and count
# postings per region; index -> chart labels, values -> chart data.
diqu = df['地区'].str[0:2].value_counts()
label = diqu.index.tolist()
data = diqu.values.tolist()


def bossBar():
    """Vertical bar chart: number of postings per region."""
    b = (
        Bar(init_opts=opts.InitOpts(width='1080px', height='960px', theme=ThemeType.LIGHT))
        .add_xaxis(label)
        .add_yaxis(series_name='招聘数量', y_axis=data, bar_max_width='50px')
        .set_global_opts(
            title_opts=opts.TitleOpts(title='各地区招聘数量柱状图'),
            toolbox_opts=opts.ToolboxOpts(),
            tooltip_opts=opts.TooltipOpts(
                trigger='axis',
                axis_pointer_type='cross',
            ),
        )
    )
    return b


def bossReverselBar():
    """Horizontal (axis-reversed) bar chart of the same data."""
    b = (
        Bar(init_opts=opts.InitOpts(width='1080px', height='960px', theme=ThemeType.LIGHT))
        .add_xaxis(label)
        .add_yaxis(series_name='招聘数量', y_axis=data, label_opts=opts.LabelOpts(is_show=False), bar_max_width='100px')
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title='各地区招聘数量反转柱状图',
                title_textstyle_opts=opts.TextStyleOpts(font_size=27),
            ),
            legend_opts=opts.LegendOpts(
                pos_left='10%',
                pos_top='10%',
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,
                trigger='axis',
                axis_pointer_type='cross',
            ),
            toolbox_opts=opts.ToolboxOpts(),
        )
        .reversal_axis()  # swap x/y axes
    )
    return b


def bossPie():
    """Three pie variants side by side: plain pie, donut, Nightingale rose."""
    p = (
        Pie(init_opts=opts.InitOpts(width='1580px', height='660px', theme=ThemeType.LIGHT))
        .add('', center=['280', '330'], data_pair=[list(z) for z in zip(label, data)])                          # pie
        .add('', center=['780', '330'], data_pair=[list(z) for z in zip(label, data)], radius=['40%', '70%'])   # donut
        .add('', center=['1280', '330'], data_pair=[list(z) for z in zip(label, data)], rosetype='radius')      # rose
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title='各地区招聘数量饼图',
                title_textstyle_opts=opts.TextStyleOpts(font_size=27),
            ),
            legend_opts=opts.LegendOpts(
                pos_left='10%',
                pos_top='10%',
            ),
        )
        .set_series_opts(label_opts=opts.LabelOpts(
            is_show=True,
            formatter='{b}:{c}:{d}'),  # name : value : percentage
        )
    )
    return p


# DraggablePageLayout lets the charts be rearranged in the browser.
page = Page(layout=Page.DraggablePageLayout, page_title='基于boss直聘的可视化大屏')

page.add(
    bossBar(),
    bossReverselBar(),
    bossPie(),
).render('基于boss直聘的可视化大屏.html')

# After arranging the layout in the rendered page, click "save config" (top
# left) to download chart_config.json, then rerun this script to bake the
# layout into the final HTML.
# BUG FIX: the original called save_resize_html unconditionally, which
# crashes with FileNotFoundError on the first run, before the config exists.
if os.path.exists('chart_config.json'):
    # NOTE(review): the source html passed here is '可视化大屏设置.html' as in the
    # original — confirm this matches the rendered file you arranged.
    page.save_resize_html('可视化大屏设置.html', cfg_file='chart_config.json', dest='可视化大屏.html')
再做个可视化词云:
from stylecloud import gen_stylecloud
import pandas as pd
import re
import jieba


df = pd.read_csv('boss.csv')  # load the scraped data

# BUG FIX: rows whose detail page failed to scrape leave NaN (a float) in the
# column; ''.join() would then raise TypeError. Drop them and force str.
contentList = df['所需技能'].dropna().astype(str).to_list()
contentStr = ''.join(contentList)  # one big string for cleaning

# Strip digits, latin letters, markup leftovers and punctuation.
# BUG FIX: the original pattern contained a stray `]\[` that split it into
# "class, literal '[', class" — a three-character sequence that essentially
# never matched, so nothing was actually removed. Use one raw-string class.
wordStr = re.sub(r'[0-9a-zA-Z<=":/;._\->+\[\];,?#,。!“”、@.\'?%…& ]', '', contentStr)

jiebaWords = jieba.cut(wordStr)   # generator of segmented tokens
needWords = ' '.join(jiebaWords)  # stylecloud expects a space-separated string

# Filler words that say nothing about the skills being asked for.
stopWords = ['熟练', '使用', '等', '和', '家长', '招', '乐高', '老师', '及', '招', '职责',
             '及其', '了解', '可', '选', '好', '以下', '有', '会', '语言', '有过', '的', '时',
             '者', '优先', '小', '以上', '出差', '年', '或', '并', '与', '进行', '至少', '内容',
             '能', '或者', '对', '要求', '一定', '做', '工资', '其他', '职位', '云']

gen_stylecloud(
    text=needWords,
    size=1280,                                    # width and height of the cloud, in px
    font_path='C:\\Windows\\Fonts\\simhei.ttf',   # SimHei so Chinese glyphs render
    max_words=150,                                # cap on words shown
    max_font_size=200,                            # largest font size
    # invert_mask=,                               # optional mask image
    custom_stopwords=stopWords,                   # words to exclude
    output_name='1.png',                          # output file name
)
效果类似如下: