I. The urllib Library
urllib ships with Python's standard library and is widely used for crawling: it lets code simulate a browser sending requests. The submodules used most often are urllib.request and urllib.parse in Python 3; in Python 2 the equivalents are urllib and urllib2.
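For orientation, here is a minimal sketch of the Python 3 call chain that all of the examples below build on (the URL is only an illustration):

import urllib.request
#urlopen returns a response object; read() yields raw bytes, decode() turns them into text
data=urllib.request.urlopen("http://www.baidu.com").read().decode("utf-8")
print(len(data))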
'''
#Crawl the publisher list from Douban Read
import urllib.request
import re
data=urllib.request.urlopen("https://read.douban.com/provider/all").read().decode("utf-8")
pat='<div class="name">(.*?)</div>'
rst=re.compile(pat).findall(data)
fh=open("D:\\chubanshe.txt","w")
for i in range(0,len(rst)):
    print(rst[i])
    fh.write(rst[i]+"\n")
fh.close()
'''
'''
#urllib basics
import urllib.request
#urlretrieve(url, local save path) downloads a page straight to disk
urllib.request.urlretrieve("http://www.baidu.com","D:\\dld.html")
urllib.request.urlcleanup()
#info() shows summary information about the response
file=urllib.request.urlopen("https://read.douban.com/provider/all")
print(file.info())
#getcode() returns the HTTP status code of the crawl
print(file.getcode())
#geturl() returns the URL that was actually visited
print(file.geturl())
'''
'''
#Timeout settings
import urllib.request
for i in range(0,100):
    try:
        file=urllib.request.urlopen("http://www.baidu.com",timeout=1)
        print(len(file.read().decode("utf-8")))
    except Exception as err:
        print("Exception occurred: "+str(err))
'''
'''
#GET request in practice: automating Baidu searches
import urllib.request,re
keywd="韦玮"
keywd=urllib.request.quote(keywd)
#page offset: (page number-1)*10
for i in range(1,11):
    url="http://www.baidu.com/s?wd="+keywd+"&pn="+str((i-1)*10)
    data=urllib.request.urlopen(url).read().decode("utf-8")
    pat="title:'(.*?)',"
    pat2='"title":"(.*?)",'
    rst1=re.compile(pat).findall(data)
    rst2=re.compile(pat2).findall(data)
    for j in range(0,len(rst1)):
        print(rst1[j])
    for z in range(0,len(rst2)):
        print(rst2[z])
'''
'''
#POST request in practice
import urllib.request
import urllib.parse
posturl="http://www.iqianyue.com/mypost/"
postdata=urllib.parse.urlencode({
    "name":"ceo@txk7.com",
    "pass":"kjsahgjkashg",
    }).encode("utf-8")
#A POST needs urllib.request.Request(actual post URL, post data)
req=urllib.request.Request(posturl,postdata)
rst=urllib.request.urlopen(req).read().decode("utf-8")
fh=open("D:\\post.html","w")
fh.write(rst)
fh.close()
'''
#Exception handling
'''
Causes of URLError:
1) Cannot connect to the server
2) The remote URL does not exist
3) No network connection
4) An HTTPError was triggered
'''
'''
import urllib.request
import urllib.error
try:
    urllib.request.urlopen("http://blog.csdn.net")
except urllib.error.URLError as e:
    #HTTPError carries a status code; a plain URLError only has a reason
    if hasattr(e,"code"):
        print(e.code)
    if hasattr(e,"reason"):
        print(e.reason)
'''
'''
#Browser impersonation
import urllib.request
url="http://blog.csdn.net"
#Header format: header=("User-Agent", actual user-agent string)
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
data=opener.open(url).read()
fh=open("D:\\ua.html","wb")
fh.write(data)
fh.close()
'''
#Crawl every news item on the Tencent News front page
'''
1. Crawl the news front page
2. Extract the link of each news item
3. Crawl each news link
4. Check whether the page contains a frame
5. If it does, crawl the page the frame points to
6. If not, crawl the current page directly
'''
import urllib.request
import re
url="http://news.qq.com/"
data=urllib.request.urlopen(url).read().decode("UTF-8","ignore")
pat1='<a target="_blank" class="linkto" href="(.*?)"'
alllink=re.compile(pat1).findall(data)
for i in range(0,len(alllink)):
    thislink=alllink[i]
    thispage=urllib.request.urlopen(thislink).read().decode("gb2312","ignore")
    pat2="<frame src=(.*?)>"
    isframe=re.compile(pat2).findall(thispage)
    if(len(isframe)==0):
        #No frame: crawl this page directly
        print(i)
        urllib.request.urlretrieve(thislink,"D:\\data\\"+str(i)+".html")
    else:
        #Get the frame's URL and crawl that instead
        flink=isframe[0]
        urllib.request.urlretrieve(flink,"D:\\data\\"+str(i)+".html")
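#Side note: urllib.request.Request also accepts a headers dict, so the impersonation above can be done per request instead of through a global opener; a minimal sketch reusing the same URL and user-agent string:
'''
import urllib.request
url="http://blog.csdn.net"
#Attach the User-Agent to this single request rather than installing an opener
req=urllib.request.Request(url,headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0"})
data=urllib.request.urlopen(req).read()
'''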
'''
#CSDN blog-post crawler
import urllib.request
import re
url="http://blog.csdn.net/"
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
#Install the opener globally
urllib.request.install_opener(opener)
data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
pat='<h3 class="tracking-ad" data-mod="popu_254"><a href="(.*?)"'
alllink=re.compile(pat).findall(data)
#print(alllink)
for i in range(0,len(alllink)):
    localpath="D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\"+str(i)+".html"
    thislink=alllink[i]
    urllib.request.urlretrieve(thislink,filename=localpath)
    print("Article "+str(i)+" crawled successfully!")
'''
'''
#Qiushibaike joke crawler
import urllib.request
import re
headers=("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener=urllib.request.build_opener()
opener.addheaders=[headers]
#Install the opener globally
urllib.request.install_opener(opener)
for i in range(0,35):
    thisurl="http://www.qiushibaike.com/8hr/page/"+str(i+1)+"/?s=4948859"
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst=re.compile(pat,re.S).findall(data)
    for j in range(0,len(rst)):
        print(rst[j])
        print("-------")
'''
'''
#Building a user-agent pool
import urllib.request
import re
import random
uapools=[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]
def ua(uapools):
    #Pick a random user-agent and install it as the global opener
    thisua=random.choice(uapools)
    print(thisua)
    headers=("User-Agent",thisua)
    opener=urllib.request.build_opener()
    opener.addheaders=[headers]
    #Install the opener globally
    urllib.request.install_opener(opener)
for i in range(0,35):
    ua(uapools)
    thisurl="http://www.qiushibaike.com/8hr/page/"+str(i+1)+"/?s=4948859"
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
    rst=re.compile(pat,re.S).findall(data)
    for j in range(0,len(rst)):
        print(rst[j])
        print("-------")
'''
'''
#Building an IP proxy in practice
import urllib.request
ip="68.13.196.233:8080"
proxy=urllib.request.ProxyHandler({"http":ip})
opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url="http://www.baidu.com"
data1=urllib.request.urlopen(url).read()
data=data1.decode("utf-8","ignore")
print(len(data))
fh=open("D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\ip_baidu.html","wb")
fh.write(data1)
fh.close()
'''
'''
#IP proxy pool, approach one (suitable when the proxy IPs are stable)
import random
import urllib.request
ippools=[
    "68.13.196.233:8080",
    "112.247.100.200:9999",
    "112.247.5.22:9999",
    ]
def ip(ippools):
    #Pick a random proxy IP and install it as the global opener
    thisip=random.choice(ippools)
    print(thisip)
    proxy=urllib.request.ProxyHandler({"http":thisip})
    opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
for i in range(0,5):
    try:
        ip(ippools)
        url="http://www.baidu.com"
        data1=urllib.request.urlopen(url).read()
        data=data1.decode("utf-8","ignore")
        print(len(data))
        fh=open("D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\ip_baidu_"+str(i)+".html","wb")
        fh.write(data1)
        fh.close()
    except Exception as err:
        print(err)
'''
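#Side note: the ProxyHandler dict maps URL scheme to proxy, so the {"http":...} entries above only cover http:// targets; an https:// URL needs an "https" key as well. A minimal sketch, reusing the placeholder IP from above:
'''
import urllib.request
thisip="68.13.196.233:8080"
#Route both http and https traffic through the same proxy
proxy=urllib.request.ProxyHandler({"http":"http://"+thisip,"https":"http://"+thisip})
opener=urllib.request.build_opener(proxy)
urllib.request.install_opener(opener)
data=urllib.request.urlopen("https://www.baidu.com").read().decode("utf-8","ignore")
print(len(data))
'''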
'''
#IP proxy pool, approach two (API-based; better suited to unstable proxy IPs)
import urllib.request
def api():
    print("Calling the proxy API")
    thisall=urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only")
    ippools=[]
    for item in thisall:
        ippools.append(item.decode("utf-8","ignore"))
    return ippools
def ip(ippools,time):
    thisip=ippools[time]
    print("Current IP in use: "+ippools[time])
    proxy=urllib.request.ProxyHandler({"http":thisip})
    opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
x=0
for i in range(0,35):
    try:
        #Fetch a fresh batch of 10 IPs every 10 requests, then cycle through it
        if(x%10==0):
            time=x%10
            ippools=api()
            ip(ippools,time)
        else:
            time=x%10
            ip(ippools,time)
        url="http://www.baidu.com"
        data1=urllib.request.urlopen(url).read()
        data=data1.decode("utf-8","ignore")
        print(len(data))
        fh=open("D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\ip_baidu_"+str(i)+".html","wb")
        fh.write(data1)
        fh.close()
        x+=1
    except Exception as err:
        print(err)
        x+=1
'''
#Taobao product-image crawler
import urllib.request
import re
import random
keyname="维维豆奶"
key=urllib.request.quote(keyname)
uapools=[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    ]
def ua(uapools):
    thisua=random.choice(uapools)
    print(thisua)
    headers=("User-Agent",thisua)
    opener=urllib.request.build_opener()
    opener.addheaders=[headers]
    #Install the opener globally
    urllib.request.install_opener(opener)
for i in range(1,101):
    #Taobao paginates search results 44 items at a time via the s parameter
    url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44)
    ua(uapools)
    data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
    pat='"pic_url":"//(.*?)"'
    imglist=re.compile(pat).findall(data)
    for j in range(0,len(imglist)):
        thisimg=imglist[j]
        thisimgurl="http://"+thisimg
        localfile="D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\taobao\\dounai\\"+str(i)+str(j)+".jpg"
        urllib.request.urlretrieve(thisimgurl,filename=localfile)
#How to use a user-agent pool and an IP proxy pool at the same time
def ua_ip(myurl):
    import random
    uapools=[
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        ]
    import urllib.request
    def api():
        print("Calling the proxy API")
        thisall=urllib.request.urlopen("http://tvp.daxiangdaili.com/ip/?tid=559126871522487&num=10&foreign=only&filter=on")
        ippools=[]
        for item in thisall:
            ippools.append(item.decode("utf-8","ignore"))
        return ippools
    def ip(ippools,time,uapools):
        #Combine a random user-agent with the current proxy IP in one opener
        thisua=random.choice(uapools)
        print(thisua)
        headers=("User-Agent",thisua)
        thisip=ippools[time]
        print("Current IP in use: "+ippools[time])
        proxy=urllib.request.ProxyHandler({"http":thisip})
        opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
        opener.addheaders=[headers]
        urllib.request.install_opener(opener)
    x=0
    for i in range(0,35):
        try:
            if(x%10==0):
                time=x%10
                ippools=api()
                ip(ippools,time,uapools)
            else:
                time=x%10
                ip(ippools,time,uapools)
            url=myurl
            data1=urllib.request.urlopen(url).read()
            data=data1.decode("utf-8","ignore")
            print(len(data))
            #fh=open("D:\\我的教学\\Python\\韬云教育-腾讯-Python爬虫\\rst\\ip_baidu_"+str(i)+".html","wb")
            #fh.write(data1)
            #fh.close()
            x+=1
            #Stop retrying after the first successful fetch
            break
        except Exception as err:
            print(err)
            x+=1
    return data
#data=ua_ip("http://www.baidu.com")
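#A quick usage sketch for ua_ip: it returns the decoded HTML after the first successful fetch, but if every attempt fails, data is never assigned and the return raises NameError, so the call is guarded here
'''
try:
    html=ua_ip("http://www.baidu.com")
    print(len(html))
except Exception as err:
    print("ua_ip failed: "+str(err))
'''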