一、爬取QQ空间好友动态数据
# Scrape friends' status updates (post text and friend nickname) from Qzone,
# filtering out advertisements.
from time import sleep

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

# Drive a real Chrome browser through the login flow.
bro = webdriver.Chrome(executable_path=r'D:\爬虫+数据分析\tools\chromedriver.exe')
try:
    bro.get('https://qzone.qq.com/')
    sleep(3)

    # NOTE: to locate elements that live under an iframe with the find_*
    # functions, the driver must first switch into that iframe. The argument
    # is the iframe's id attribute.
    bro.switch_to.frame('login_frame')
    # Click the element with id "switcher_plogin", i.e. choose account/password login.
    bro.find_element(By.ID, 'switcher_plogin').click()
    sleep(3)  # wait for the page to load

    # Type the account and password, then log in to Qzone.
    bro.find_element(By.ID, 'u').send_keys('1156145880')
    # 'qq密码' is a placeholder for the real password; avoid committing real
    # credentials to source control.
    bro.find_element(By.ID, 'p').send_keys('qq密码')
    sleep(3)
    bro.find_element(By.ID, 'login_button').click()
    sleep(3)

    # Click "个人中心" (personal centre) to reach the friends' feed.
    bro.find_element(By.ID, 'aIcenter').click()
    sleep(3)

    # Scroll to the bottom of the page three times, pausing after each scroll,
    # so that more of the dynamically loaded feed is fetched.
    for _ in range(3):
        bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        sleep(3)

    # Grab the HTML of the page as currently rendered by the browser.
    page_text = bro.page_source
finally:
    # Always shut the browser down, even when a step above raised; quit()
    # ends the whole driver session rather than only closing the window.
    bro.quit()

# Parse the captured HTML.
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="feed_friend_list"]/li')
for li in li_list:
    user_name_list = li.xpath(".//div[@class='user-info']/div[@class='f-nick']/a/text()")
    # Posts that need to be expanded use a different class name, hence the union.
    text_list = li.xpath('.//div[@class="f-info"]/text()|.//div[@class="f-info qz_info_cut"]//text()')
    # Pair each nickname with a text node and print them on separate lines.
    for tu in zip(user_name_list, text_list):
        text = '\n'.join(tu)
        print(text + '\n\n')
二、爬取雪球网新闻的标题、作者、来源等
# Fetch the public timeline from xueqiu.com and print each entry's title,
# source column and author.
import json

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36',
}
url_index = 'https://xueqiu.com'
url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'

# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10  # seconds

# The context manager closes the session's pooled connections on exit.
with requests.Session() as session:
    # First hit the home page with the session so that the cookies it sets
    # are stored on the session and sent with the API request below.
    session.get(url_index, headers=headers, timeout=REQUEST_TIMEOUT)

    # Fetch the JSON response data.
    response = session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail with a clear HTTP error instead of an obscure JSON/KeyError later.
    response.raise_for_status()
    json_dic = response.json()

for dic in json_dic["list"]:
    # Each entry's "data" field is itself a JSON-encoded string.
    data = dic["data"]
    data_dic = json.loads(data)
    title = data_dic["title"]
    # NOTE(review): the original also had a commented-out read of
    # data_dic["description"]; it is not used in the output.
    column = dic["column"]
    author = data_dic["user"]["screen_name"]
    print(f"标题:{title}\n来源:{column}\n作者:{author}\n")