一、介绍
本例子用Selenium +phantomjs爬取中文网站总排名(http://top.chinaz.com/all/index.html,http://top.chinaz.com/hangye/index.html)的信息
二、网站信息
三、数据抓取
针对上面的网站信息,来进行抓取
1、首先抓取信息列表
抓取代码:Elements = doc('li[class^="clearfix"]') 类似信息
2、网站名称,域名,网址
netElement = element.find('div[class="CentTxt"]').find('h3').find('a')
netName = netElement.attr('title').encode('utf8')
netUrl = 'http://top.chinaz.com' + netElement.attr('href').encode('utf8')
domainName = element.find('div[class="CentTxt"]').find('h3').find('span').text().encode('utf8')
3、Alexa周排名,反链数,网站描述
netinfo = element.find('div[class="RtCPart clearfix"]').text().encode('utf8')
4、排名,得分
netOrder = element.find('div[class="RtCRateCent"]').find('strong').text().encode('utf8')
netScore = element.find('div[class="RtCRateCent"]').find('span').text().encode('utf8')
四、具体思路
对于抓取分类网站排名信息思路如:
1、首先抓取一级分类名称(包括拼音和中文名称),分类对应的url, 该分类数据的总页数,保存到数据表category
2、根据分类数据的总页数,计算出该分类的所有网页的url, 并把 category, cncategory, url, index,iscrawl(是否抓取,默认是0:未抓取,1:已抓取) 保存至数据表categoryall中
3、通过获取categoryall表中每种分类的url,开始抓取每页数据,每页抓取完成后修改iscrawl =1,避免重复抓取。抓取每个数据后把相关数据保存到categorydata表中,
在保存之前通过 netname,domainname,category三个字段来判断是否有重复数据,避免保存重复。
4、二级分类网站排名信息同上。
五、实现代码
# coding=utf-8 import os import re from selenium import webdriver from datetime import datetime,timedelta import selenium.webdriver.support.ui as ui from selenium.common.exceptions import TimeoutException import time from pyquery import PyQuery as pq import re import mongoDB import hyData class hyTotal: def __init__(self): #通过配置文件获取IEDriverServer.exe路径 # IEDriverServer ='C:\Program Files\Internet Explorer\IEDriverServer.exe' # self.driver = webdriver.Ie(IEDriverServer) # self.driver.maximize_window() self.driver = webdriver.PhantomJS(service_args=['--load-images=false']) self.driver.set_page_load_timeout(10) self.driver.maximize_window() self.db = mongoDB.mongoDbBase() def saveurls(self): urls = [] urls.append('http://top.chinaz.com/all/index.html') for i in range(2, 1888): urls.append('http://top.chinaz.com/all/index_' + str(i) + '.html') self.db.SaveTotalUrls(urls) def WriteLog(self, message,date,dir): fileName = os.path.join(os.getcwd(), 'hangye/total/' + date + '-' + dir + '.txt') with open(fileName, 'a') as f: f.write(message) def CatchData(self): urlIndex = 0 urls = self.db.GetTotalUrls() # {'_id': 0, 'url': 1,'category':1,'index':1} lst = [] for u in urls: url = u['url'] urlIndex = u['index'] lst.append('{0},{1}'.format(url,urlIndex)) urls.close() for u in lst: url = u.split(',')[0] urlIndex = u.split(',')[1] try: self.driver.get(url) time.sleep(2) selenium_html = self.driver.execute_script("return document.documentElement.outerHTML") doc = pq(selenium_html) # Elements = doc("div[@id='text_box_0']/dl/dd") Elements = doc('li[class^="clearfix"]') message = '' # for element in Elements: data_list = [] for element in Elements.items(): hyobj = hyData.hyData() # column1Element = element.find('div[@class="text"]/h3/a') # 网站名称,域名,网址 # netElement = element.find_element_by_xpath("//div[@class='CentTxt']/h3/a") netElement = element.find('div[class="CentTxt"]').find('h3').find('a') # netName = nextElement.get_attribute('title').encode('utf8') netName = 
netElement.attr('title').encode('utf8').replace('\n','').replace(',',',') netUrl = 'http://top.chinaz.com' + netElement.attr('href').encode('utf8') # domainName = element.find_element_by_xpath('//div[class="CentTxt"]/h3/span').text.encode('utf8') domainName = element.find('div[class="CentTxt"]').find('h3').find('span').text().encode('utf8') # Alexa周排名,反链数,网站描述 netinfo = element.find('div[class="RtCPart clearfix"]').text().encode('utf8') pattern = re.compile("\d+") ts = pattern.findall(netinfo) alexaOrder = '' antichainCount = '' if ts and len(ts) == 2: alexaOrder = ts[0] antichainCount = ts[1] netDescription = element.find('p[class="RtCInfo"]').text().encode('utf8') netDescription = netDescription.replace('网站简介:', '').replace(',', ',').replace('\n', '') # 排名,得分 netOrder = element.find('div[class="RtCRateCent"]').find('strong').text().encode('utf8') netScore = element.find('div[class="RtCRateCent"]').find('span').text().encode('utf8') netScore = netScore.replace('得分:', '') hyobj.setNetName(netName) hyobj.setDomainName(domainName) hyobj.setNetRank(netOrder) hyobj.setNetScore(netScore) # print alexaOrder hyobj.setAlexaRank(alexaOrder) hyobj.setAntichainCount(antichainCount) hyobj.setNetUrl(netUrl) hyobj.setNetDescription(netDescription) hyobj.setPageIndex(urlIndex) data_list.append(hyobj) self.db.SaveTotalData(data_list) self.db.SetTotalState(url) print url except TimeoutException,e: print 'timeout url: '+url self.driver.close() self.driver.quit() def exportHyTotal(self): dataRows = self.db.GetTotalData() for dataRow in dataRows: pageIndex = int(dataRow['pageIndex'].encode('utf8')) netname = dataRow['netname'].encode('utf8') domainname = dataRow['domainname'].encode('utf8') netrank = dataRow['netrank'] netscore = dataRow['netscore'] alexarank = dataRow['alexarank'] antichaincount = dataRow['antichaincount'] neturl = dataRow['neturl'].encode('utf8') netdescription = dataRow['netdescription'].encode('utf8') # 网站名称,网站域名,排名,得分,alexa排名,反链数,网站简介,网站类型 message = 
'\n{0},{1},{2},{3},{4},{5},{6},{7}'.format(netname, domainname, netrank, netscore, alexarank, antichaincount, neturl, netdescription) date = time.strftime('%Y-%m-%d') dir = str((pageIndex/50+1)*50) self.WriteLog(message,date,dir) obj = hyTotal() # obj.saveurls() # obj.CatchData() obj.exportHyTotal()