# 嗯,。。。因为经常需要使用代理去抓一点东西,就有了下面一段代码,第一版不是很好,后面用到了再来优化
# Author:1043453579
import re
import time

import pymysql
import redis  # unused in this version; kept from the original import line
from urllib.request import Request, urlopen

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

db = pymysql.connect(host='127.0.0.1', user='root', password="liu",
                     database='test', port=3306, charset='utf8')
cur = db.cursor()

# Compiled once, as a raw string so \s/\d/\w are real regex escapes.
# The latency group is now (\d\.\d+) -- the original's unescaped dot
# matched any character, not just the decimal point.
PATTERN = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+<a href="/.*?">'
    r'[\u4e00-\u9fa5]+</a>\s+</td>\s+<td class="country">高匿</td>\s+'
    r'<td>(\w+)</td>\s+<td class="country">\s+<div title="(\d\.\d+)秒"'
)


def url_response(url, cur):
    """Fetch one xicidaili listing page and insert fast proxies into MySQL.

    url: full page URL to scrape.
    cur: shared pymysql cursor (the caller commits).
    Only anonymous proxies with a latency under 1 second are inserted.
    """
    response = urlopen(Request(url, headers=headers)).read()
    response = response.decode()
    ip_list = PATTERN.findall(response)
    for i in ip_list:
        out_time = float(i[3])  # response latency in seconds
        ip_ = i[0] + ':' + i[1]
        # Parameterized queries -- the original interpolated scraped page
        # text straight into the SQL string (injection-prone).
        cur.execute('select ip_ from ip where ip_ = %s', (ip_,))
        if cur.fetchone():
            print('重复数据跳过')
            continue
        if out_time < 1:
            cur.execute(
                'insert into ip(ip_,time_,xy_) values(%s,%s,%s)',
                (ip_, out_time, i[2]),
            )
            print('插入成功,', i)


for i in range(1, 3):
    url_response(url + str(i), cur)
    db.commit()
    time.sleep(2)
db.close()  # the original never closed the connection
# V1's MySQL store wasn't really used, so this version moves to redis,
# and a thread pool makes the scraping faster.
import re
import time

import redis
from concurrent.futures import ThreadPoolExecutor
from urllib.request import Request, urlopen

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

# Raw string; (\d\.\d+) fixes the original's unescaped decimal point.
PATTERN = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+<a href="/.*?">'
    r'[\u4e00-\u9fa5]+</a>\s+</td>\s+<td class="country">高匿</td>\s+'
    r'<td>(\w+)</td>\s+<td class="country">\s+<div title="(\d\.\d+)秒"'
)


class R(object):
    """Minimal redis wrapper sharing one connection pool."""

    def __init__(self):
        r_pool = redis.ConnectionPool(host='127.0.0.1', db=0, password=None, port=6379)
        self.redis_obj = redis.Redis(connection_pool=r_pool)

    def setex(self, name, value, time):
        # redis-py's signature is setex(name, time, value); the original
        # forwarded (name, value, time), swapping the value and the TTL.
        # (The 2018-12-17 note in this file records the same correction.)
        return self.redis_obj.setex(name, time, value)

    def get(self, name):
        return self.redis_obj.get(name)


def url_response(url, redis_obj):
    """Scrape one xicidaili page; cache proxies faster than 1s for 30 min."""
    response = urlopen(Request(url, headers=headers)).read()
    response = response.decode()
    for i in PATTERN.findall(response):
        out_time = float(i[3])  # latency in seconds
        ip_ = i[0] + ':' + i[1]
        if redis_obj.get(ip_):
            print('重复数据跳过')
            continue
        if out_time < 1:
            redis_obj.setex(ip_, 1, 60 * 30)
            print('插入成功,', ip_)


r = R()
T = ThreadPoolExecutor(4)
for i in range(1, 5):
    T.submit(url_response, url + str(i), r)
T.shutdown()  # blocks until every worker finishes
print('执行完成 ')
2018-12-17:
第二版出错更新: 15行 :return self.redis_obj.setex(name,time,value) #此处已改正
# python 3.7
# V3: scrape kuaidaili free proxies (requests + lxml), cache in redis.
from lxml import etree
import requests, time, redis


class Kuai_IP(object):
    """Scrapes https://www.kuaidaili.com/free/inha/ pages into redis."""

    def __init__(self):
        self.headers = {
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Referer': 'https://www.kuaidaili.com/free/inha/1/'
        }
        self.static = 'https://www.kuaidaili.com/free/inha/%s/'
        Con_pool = redis.ConnectionPool(host='127.0.0.1')
        self.r = redis.Redis(connection_pool=Con_pool)

    def getPage(self, page_index):
        """Fetch one listing page, faking the previous page as Referer."""
        if page_index == 1:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/'
        else:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/' + str(page_index - 1) + '/'
        res = requests.get(url=self.static % page_index, headers=self.headers)
        self.parse(res.text)

    def parse(self, res):
        """Each proxy row contributes 7 <td> texts; row[0]=ip, row[1]=port,
        row[3]=protocol type (presumably -- verify against the live page)."""
        html = etree.HTML(res)
        r_list = html.xpath('//tbody/tr/td/text()')
        if r_list:
            # Was range(1, int(len/7)): it skipped the first proxy row.
            for i in range(len(r_list) // 7):
                row = r_list[i * 7:(i + 1) * 7]
                # setex(name, time, value): key "TYPE://ip:port", 30s TTL.
                self.r.setex(row[3] + '://' + row[0] + ':' + row[1], 30, row[3])
                print(row)
        else:
            print(r_list)

    def work_on(self):
        page_index = 2  # number of pages to crawl
        for i in range(1, page_index + 1):
            self.getPage(i)
            print(i, '---------')
            time.sleep(2)


if __name__ == '__main__':
    ip = Kuai_IP()
    ip.work_on()
2018-12-20
嗯,这次由于用到的代理比较多,就把西刺和快代理的代码合到了一起,没做什么大的改进,
1 代理ip格式全部成为 requests代理的形式 {'http': 'xxx://xx.xx.xx.xx:xxx'},方便requests的调用
# -*- coding:utf-8 -*-
# @time:2018-12-20 22:23
# V4: xicidaili + kuaidaili merged into one script.  Keys are stored in
# requests' proxy form ("HTTP://ip:port") so they can be fed straight
# into a requests proxies dict.
import re
import time

import redis
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from urllib.request import Request, urlopen

page = 10  # xicidaili pages to crawl; kuaidaili crawls page + 10 (small pages)

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

# Raw string; (\d\.\d+) fixes the original's unescaped decimal point.
PATTERN = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+<a href="/.*?">'
    r'[\u4e00-\u9fa5]+</a>\s+</td>\s+<td class="country">高匿</td>\s+'
    r'<td>(\w+)</td>\s+<td class="country">\s+<div title="(\d\.\d+)秒"'
)


class R(object):
    """Minimal redis wrapper sharing one connection pool."""

    def __init__(self):
        r_pool = redis.ConnectionPool(host='127.0.0.1', db=0, password=None, port=6379)
        self.redis_obj = redis.Redis(connection_pool=r_pool)

    def setex(self, name, value, time):
        # redis-py's signature is setex(name, time, value); the original
        # forwarded (name, value, time), swapping the value and the TTL
        # (corrected by the author in the later 2018-12-24 revision).
        return self.redis_obj.setex(name, time, value)

    def get(self, name):
        return self.redis_obj.get(name)


def url_response(url, redis_obj):
    """Scrape one xicidaili page; cache fast proxies for 10 hours."""
    response = urlopen(Request(url, headers=headers)).read()
    response = response.decode()
    for i in PATTERN.findall(response):
        out_time = float(i[3])  # latency in seconds
        # Build the stored key up front: the original checked the bare
        # "ip:port" but stored "HTTP://ip:port", so the duplicate check
        # could never hit.
        ip_ = 'HTTP://' + i[0] + ':' + i[1]
        if redis_obj.get(ip_):
            print('重复数据跳过')
            continue
        if out_time < 1:
            redis_obj.setex(ip_, 1, 60 * 30 * 20)
            print('插入成功,', ip_)


class Kuai_IP(object):
    """Scrapes kuaidaili free-proxy pages into the same redis."""

    def __init__(self):
        self.headers = {
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Referer': 'https://www.kuaidaili.com/free/inha/1/'
        }
        self.static = 'https://www.kuaidaili.com/free/inha/%s/'
        Con_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        self.r = redis.Redis(connection_pool=Con_pool)

    def getPage(self, page_index):
        """Fetch one listing page, faking the previous page as Referer."""
        if page_index == 1:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/'
        else:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/' + str(page_index - 1) + '/'
        res = requests.get(url=self.static % page_index, headers=self.headers)
        self.parse(res.text)

    def parse(self, res):
        """Each proxy row contributes 7 <td> texts (ip, port, ..., type)."""
        html = etree.HTML(res)
        r_list = html.xpath('//tbody/tr/td/text()')
        if r_list:
            # Was range(1, int(len/7)): it skipped the first proxy row.
            for i in range(len(r_list) // 7):
                row = r_list[i * 7:(i + 1) * 7]
                # setex(name, time, value): the TTL must be an integer.
                # The original passed row[3] (e.g. "0.5秒") as the TTL,
                # which redis rejects with "value is not an integer or
                # out of range" -- exactly the error noted on 2018-12-24.
                self.r.setex(row[3] + '://' + row[0] + ':' + row[1], 30 * 60 * 24, row[3])
                print(row)
        else:
            print(r_list)

    def work_on(self):
        page_index = page + 10  # pages to crawl
        for i in range(1, page_index + 1):
            self.getPage(i)
            print(i, '---------')
            time.sleep(2)


if __name__ == '__main__':
    r = R()
    T = ThreadPoolExecutor(4)
    for i in range(1, page):
        T.submit(url_response, url + str(i), r)
    T.shutdown()  # blocks until every worker finishes
    print('执行完成 ')
    ip = Kuai_IP()
    ip.work_on()
2018-12-24
嗯,上面代理在80行出现错误,因为快代理的这个超时时间有时出现的是数字+文字,然后在设置缓存的时候就
出现了【 value is not an integer or out of range】
# -*- coding:utf-8 -*-
# @time:2018-12-18 22:23
# @Auther:1043453579@qq.com
# V5: same as V4, but kuaidaili's "0.5秒"-style latency text is no longer
# used as the redis TTL (it caused "value is not an integer or out of range").
import re
import time

import redis
import requests
from concurrent.futures import ThreadPoolExecutor
from lxml import etree
from urllib.request import Request, urlopen

page = 10  # xicidaili pages to crawl; kuaidaili crawls page + 10

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
}
url = 'http://www.xicidaili.com/wt/'

# Raw string; (\d\.\d+) fixes the original's unescaped decimal point.
PATTERN = re.compile(
    r'<td>(.*?)</td>\s+<td>(\d+)</td>\s+<td>\s+<a href="/.*?">'
    r'[\u4e00-\u9fa5]+</a>\s+</td>\s+<td class="country">高匿</td>\s+'
    r'<td>(\w+)</td>\s+<td class="country">\s+<div title="(\d\.\d+)秒"'
)


class R(object):
    """Minimal redis wrapper sharing one connection pool."""

    def __init__(self):
        r_pool = redis.ConnectionPool(host='127.0.0.1', db=0, password=None, port=6379)
        self.redis_obj = redis.Redis(connection_pool=r_pool)

    def setex(self, name, value, time):
        # Keeps the historic (name, value, time) wrapper interface but
        # forwards in redis-py's (name, time, value) order.
        return self.redis_obj.setex(name, time, value)

    def get(self, name):
        return self.redis_obj.get(name)


def url_response(url, redis_obj):
    """Scrape one xicidaili page; cache fast proxies for 10 hours."""
    response = urlopen(Request(url, headers=headers)).read()
    response = response.decode()
    for i in PATTERN.findall(response):
        out_time = float(i[3])  # latency in seconds
        # Check the exact key that gets stored -- the original checked
        # the bare "ip:port" but stored "HTTP://ip:port", so the
        # duplicate check could never hit.
        ip_ = 'HTTP://' + i[0] + ':' + i[1]
        if redis_obj.get(ip_):
            print('重复数据跳过')
            continue
        if out_time < 1:
            redis_obj.setex(ip_, 1, 60 * 30 * 20)
            print('插入成功,', ip_)


class Kuai_IP(object):
    """Scrapes kuaidaili free-proxy pages into the same redis."""

    def __init__(self):
        self.headers = {
            'Host': 'www.kuaidaili.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
            'Referer': 'https://www.kuaidaili.com/free/inha/1/'
        }
        self.static = 'https://www.kuaidaili.com/free/inha/%s/'
        Con_pool = redis.ConnectionPool(host='127.0.0.1', port=6379, db=0)
        self.r = redis.Redis(connection_pool=Con_pool)

    def getPage(self, page_index):
        """Fetch one listing page, faking the previous page as Referer."""
        if page_index == 1:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/'
        else:
            self.headers['Referer'] = 'https://www.kuaidaili.com/free/inha/' + str(page_index - 1) + '/'
        res = requests.get(url=self.static % page_index, headers=self.headers)
        self.parse(res.text)

    def parse(self, res):
        """Each proxy row contributes 7 <td> texts (ip, port, ..., type)."""
        html = etree.HTML(res)
        r_list = html.xpath('//tbody/tr/td/text()')
        if r_list:
            # Was range(1, int(len/7)): it skipped the first proxy row.
            for i in range(len(r_list) // 7):
                row = r_list[i * 7:(i + 1) * 7]
                print('1', row)
                # Integer TTL (24h worth of minutes, as before); store the
                # protocol as the value -- the original's hot-fix stored
                # the TTL number itself as the value.
                self.r.setex(row[3] + '://' + row[0] + ':' + row[1], 30 * 60 * 24, row[3])
        else:
            print(r_list)

    def work_on(self):
        page_index = page + 10  # pages to crawl
        for i in range(1, page_index + 1):
            self.getPage(i)
            print(i, '---------')
            time.sleep(2)


if __name__ == '__main__':
    r = R()
    T = ThreadPoolExecutor(4)
    for i in range(1, page):
        T.submit(url_response, url + str(i), r)
    T.shutdown()  # blocks until every worker finishes
    print('执行完成 ')
    ip = Kuai_IP()
    ip.work_on()