Python

import urllib
urlItem = urllib.urlopen("http://www.baidu.com")
htmSource = urlItem.read()
urlItem.close()
print htmSource
pycurl
http://pycurl.sourceforge.net/download/
http://pycurl.sourceforge.net/doc/curlobject.html
Python

import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.HTTPHEADER, ["Accept:"])  # a header with an empty value removes the default Accept header
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
print b.getvalue()
print c.getinfo(pycurl.INFO_FILETIME)  # -1 unless OPT_FILETIME was set; see the sketch below
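The INFO_FILETIME call on the last line prints -1 unless libcurl was told to fetch the document date before perform(). A minimal sketch, assuming pycurl exposes CURLOPT_FILETIME as pycurl.OPT_FILETIME (renamed to avoid the clash with INFO_FILETIME):

Python

import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.OPT_FILETIME, 1)  # ask libcurl to retrieve the remote document date
c.perform()
print c.getinfo(pycurl.INFO_FILETIME)  # unix timestamp, or -1 if the server did not report one
c.close()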
curl_easy_setopt
Tells libcurl how to do its job.
CURLOPT_WRITEFUNCTION: write (download) callback; libcurl passes a data pointer for you to handle. The size of one callback chunk is capped by CURL_MAX_WRITE_SIZE (defined in the curl.h header).
CURLOPT_WRITEDATA: write the response straight to a file; pass an open file object, e.g. c.setopt(pycurl.WRITEDATA, fp) (a bare file-name string such as 'E:\WebSite\py\1.txt' is noted not to work on Windows).
CURLOPT_READFUNCTION: read (upload) callback.
CURLOPT_SEEKFUNCTION: seek callback for moving the data pointer: int function(void *instream, curl_off_t offset, int origin); origin is SEEK_SET, SEEK_CUR, or SEEK_END; return CURL_SEEKFUNC_OK, CURL_SEEKFUNC_FAIL, or CURL_SEEKFUNC_CANTSEEK (0, 1, 2).
CURLOPT_OPENSOCKETFUNCTION:
CURLOPT_HEADERFUNCTION: receives header data only: size_t function(void *ptr, size_t size, size_t nmemb, void *userdata); (see the sketch after this list).
CURLOPT_DEBUGFUNCTION: int curl_debug_callback(CURL *, curl_infotype, char *, size_t, void *);
CURLOPT_VERBOSE: set to 1 to print verbose details.
CURLOPT_HEADER: set to 1 to include the response headers in the returned body.
CURLOPT_NOSIGNAL: keeps libcurl from using signals; needed for timeouts to work safely in multi-threaded programs.
CURLOPT_FOLLOWLOCATION: set to 1 to tell libcurl to follow Location: redirects.
CURLOPT_MAXREDIRS: limit on the number of redirects to follow; -1 means unlimited (the default).
CURLOPT_PUT: for uploading data with HTTP PUT.
CURLOPT_POST:
CURLOPT_POSTREDIR:
CURLOPT_POSTFIELDS: (see the POST sketch after this list)
CURLOPT_POSTFIELDSIZE:
CURLOPT_POSTFIELDSIZE_LARGE:
CURLOPT_COPYPOSTFIELDS:
CURLOPT_HTTPPOST:
CURLOPT_UPLOAD:
CURLOPT_AUTOREFERER: have libcurl set the Referer header automatically.
CURLOPT_REFERER: spoof the Referer (source URL).
CURLOPT_USERAGENT: set a custom User-Agent.
CURLOPT_HTTPHEADER: set custom headers.
CURLOPT_COOKIE: "name1=content1; name2=content2;"
CURLOPT_COOKIEFILE:
CURLOPT_COOKIEJAR:
CURLOPT_COOKIESESSION: by default libcurl loads and stores all cookies; set to 1 to start a new cookie session and ignore session cookies from before.
CURLOPT_COOKIELIST
CURLOPT_HTTPGET
CURLOPT_HTTP_VERSION: CURL_HTTP_VERSION_NONE, CURL_HTTP_VERSION_1_0, CURL_HTTP_VERSION_1_1
CURLOPT_IGNORE_CONTENT_LENGTH: ignore the Content-Length header; for servers like Apache 1.x.
CURLOPT_HTTP_TRANSFER_DECODING: tells libcurl whether to decode the transfer encoding (0 or 1; default 1).
CURLOPT_HTTP200ALIASES: custom aliases for the HTTP 200 response; some servers return a nonstandard 200 line.
CURLOPT_ENCODING: the content encodings to accept, i.e. Accept-Encoding ('', 'gzip', ...).
CURLOPT_UNRESTRICTED_AUTH: set to 1 to keep sending authentication (user + password) even when redirects lead to other hosts.
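A sketch of how a few of these options combine in pycurl; the endpoint and form fields below are made-up placeholders. CURLOPT_POSTFIELDS expects an already-encoded string, so a dict goes through urllib.urlencode first (setting POSTFIELDS also switches the request to POST), and HEADERFUNCTION receives the raw response headers separately from the body:

Python

import pycurl
import urllib
import StringIO

body = StringIO.StringIO()
headers = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://localhost/login.php")  # hypothetical endpoint
c.setopt(pycurl.POSTFIELDS, urllib.urlencode({'username': 'hzq', 'password': 'blog'}))
c.setopt(pycurl.WRITEFUNCTION, body.write)      # response body goes here
c.setopt(pycurl.HEADERFUNCTION, headers.write)  # raw header lines go here
c.setopt(pycurl.ENCODING, 'gzip')               # send Accept-Encoding: gzip and decompress automatically
c.perform()
print headers.getvalue()
print body.getvalue()
c.close()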
NETWORK OPTIONS
CURLOPT_URL: http://xxxx, ftp://xxxx
CURLOPT_PROXY: HTTP proxy, as a host name or IP address (see the sketch after this list).
CURLOPT_PROXYPORT: proxy port; it can also be appended to the PROXY address as ":port", e.g. :8080.
CURLOPT_PROXYTYPE: proxy type: CURLPROXY_HTTP (default), CURLPROXY_HTTP_1_0, CURLPROXY_SOCKS4, CURLPROXY_SOCKS5, CURLPROXY_SOCKS4A, CURLPROXY_SOCKS5_HOSTNAME
CURLOPT_NOPROXY: domains that bypass the proxy.
CURLOPT_HTTPPROXYTUNNEL:
CURLOPT_BUFFERSIZE: libcurl's receive buffer size, in bytes.
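A minimal proxy sketch; the SOCKS5 proxy at 127.0.0.1:1080 is a placeholder:

Python

import pycurl
import StringIO

b = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.PROXY, "127.0.0.1")  # host name or IP; ":port" may be appended here instead
c.setopt(pycurl.PROXYPORT, 1080)
c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)  # default is PROXYTYPE_HTTP
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)
c.close()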
(Authentication)
CURLOPT_NETRC: controls password lookup in your ~/.netrc file: CURL_NETRC_IGNORED (the default) ignores the file; CURL_NETRC_OPTIONAL uses the file if present; CURL_NETRC_REQUIRED requires the file and ignores credentials given in the URL.
CURLOPT_NETRC_FILE: point to a specific netrc-style file instead of ~/.netrc.
CURLOPT_USERNAME:
CURLOPT_USERPWD:
CURLOPT_PASSWORD:
CURLOPT_PROXYUSERNAME:
CURLOPT_PROXYUSERPWD:
CURLOPT_HTTPAUTH: (see the sketch after this list)
CURLOPT_PROXYAUTH:
- CURLAUTH_BASIC: HTTP Basic authentication
- CURLAUTH_DIGEST: HTTP Digest authentication
- CURLAUTH_DIGEST_IE:
- CURLAUTH_GSSNEGOTIATE: Kerberos 5 authentication; requires a GSS-API build
- CURLAUTH_NTLM: NTLM authentication
- CURLAUTH_ANY: enables all methods; libcurl picks the one it considers most suitable and secure
- CURLAUTH_ANYSAFE: enables all methods except Basic
- CURLAUTH_ONLY: a modifier bit that forces exactly the single method it is OR'ed with
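For instance, HTTP Basic credentials might be sent like this (the URL and user:password pair are placeholders; HTTPAUTH_ANY would let libcurl negotiate the method instead):

Python

import pycurl
import StringIO

b = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://localhost/protected/")  # hypothetical protected page
c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)
c.setopt(pycurl.USERPWD, "hzq:blog")  # "user:password"
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)
c.close()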
getinfo
CURLINFO_RESPONSE_CODE: the last received HTTP or FTP status code, e.g. 200, 404, 403, 505; for a proxy's CONNECT response see CURLINFO_HTTP_CONNECTCODE
CURLINFO_EFFECTIVE_URL: the last effective URL used
CURLINFO_HTTP_CONNECTCODE: the last received proxy CONNECT response code, as a long
CURLINFO_FILETIME:
CURLINFO_TOTAL_TIME:
CURLINFO_CONNECT_TIME:
CURLINFO_NUM_CONNECTS: how many connections were used
CURLINFO_CONTENT_TYPE: e.g. text/html
CURLINFO_REQUEST_SIZE:
CURLINFO_HEADER_SIZE:
CURLINFO_SIZE_DOWNLOAD: total bytes downloaded
CURLINFO_SIZE_UPLOAD:
CURLINFO_HTTPAUTH_AVAIL: bitmask of the authentication methods the server offers
CURLINFO_PROXYAUTH_AVAIL: bitmask of the proxy authentication methods offered
CURLINFO_COOKIELIST: in pycurl some of these drop the CURLINFO_ prefix down to INFO_, e.g. INFO_COOKIELIST
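A sketch reading several of these after perform(), following the same StringIO pattern as above:

Python

import pycurl
import StringIO

b = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)  # e.g. 200
print c.getinfo(pycurl.EFFECTIVE_URL)  # final URL after any redirects
print c.getinfo(pycurl.CONTENT_TYPE)   # e.g. text/html
print c.getinfo(pycurl.TOTAL_TIME)     # total transfer time in seconds
print c.getinfo(pycurl.SIZE_DOWNLOAD)  # bytes downloaded
c.close()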
A rough crawler example sharing one Curl object
Python

import pycurl
import StringIO
import random

class spider:
    def __init__(self, addHeader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            #,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + addHeader
        # one Curl handle is created once and reused for every request
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + self.rand_str())
        #self.curl.setopt(pycurl.AUTOREFERER, 1)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)

    def __del__(self):
        pass

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, fp.write)
        self.curl.perform()
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, sio.write)
        self.curl.perform()
        reval = sio.getvalue()
        sio.close()
        return reval

if __name__ == "__main__":
    get = spider(['USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'])
    print get.html("http://localhost/spider_for_test.php")
    print get.tofile("http://localhost/spider_for_test.php", r'E:\WebSite\wwwroot\test.txt')
A multi-threaded crawler example
Python

import pycurl
import threading
import StringIO
import random

class spider:
    def __init__(self, referer='', httpheader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + httpheader
        self.referer = referer

    def __del__(self):
        pass

    def fetch(self, url, stream):
        # a fresh Curl handle per call, so concurrent threads never share one
        curl = pycurl.Curl()
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.perform()
        curl.close()

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.fetch(url, fp)
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.fetch(url, sio)
        reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    import datetime
    dstart = datetime.datetime.now()
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    thread_pool = []
    for i in range(10):
        url = "http://localhost/test.php?n=" + str(i)
        th = threading.Thread(target=gethtml, args=(url, get))
        thread_pool.append(th)
    for i in range(10):
        thread_pool[i].start()
    for i in range(10):
        thread_pool[i].join()
    dend = datetime.datetime.now()
    print "Time span:", dend - dstart
The WDPYSPIDER class (supports multithreading, proxies, login authentication, and POST)
Python

#coding:utf-8
import pycurl
import urllib
import threading
import StringIO
import random

class spider:
    '''WDPYSPIDER (Whiledo Python Spider Class) crawler class

    @author HzqGhost admin@whiledo.com QQ:313143468
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q='+get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)'''
    def __init__(self):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] # HTTP headers
        self.referer = '' # spoofed Referer; empty means AUTOREFERER is used
        self.connecttimeout = 60 # connect timeout (seconds)
        self.timeout = 300 # read timeout (seconds)
        self.backheader = 0 # include server HTTP headers in the output (mainly for testing)
        self.cookiesfile = "./cookies.dat" # cookie file, read and written automatically
        self.proxyuse = False # whether to use a proxy server
        self.proxyip = [] # list of proxy [IP:PORT]; one is picked at random per request
        self.proxynodomain = ['localhost','127.0.0.1'] # domains fetched without the proxy
        self.http200alias = [] # aliases for nonstandard HTTP 200 response lines
        self.error = 'WDPYERROR' # error marker returned for non-200 statuses

    def __del__(self):
        pass

    def fetch(self, url, stream, post={}):
        '''
        --url
        --stream [stream] StringIO or fp
        --post [dict] {'username':'hzq','password':'blog'}'''
        curl = pycurl.Curl()
        curl.setopt(pycurl.CONNECTTIMEOUT, self.connecttimeout)
        curl.setopt(pycurl.TIMEOUT, self.timeout)
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        curl.setopt(pycurl.HTTP200ALIASES, self.http200alias)
        curl.setopt(pycurl.HEADER, self.backheader)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.COOKIEJAR, self.cookiesfile)
        curl.setopt(pycurl.COOKIEFILE, self.cookiesfile)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.setopt(pycurl.URL, url)
        if self.proxyuse:
            proxyip = self.proxyip[random.randint(0, len(self.proxyip) - 1)]
            curl.setopt(pycurl.PROXY, proxyip)
            #curl.setopt(pycurl.NOPROXY, ','.join(self.proxynodomain)) # needs libcurl/pycurl >= 7.19.4
        if len(post) > 0:
            # POSTFIELDS wants an encoded string, not a dict
            curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post))
        status = ''
        try:
            curl.perform()
            status = curl.getinfo(pycurl.RESPONSE_CODE)
        except:
            status = curl.errstr()
        finally:
            curl.close()
        status = str(status)
        if status != '200':
            status = self.error
        return status

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename, post={}):
        fp = open(filename, 'wb')
        self.fetch(url, fp, post)
        fp.close()
        return True

    def html(self, url, post={}):
        sio = StringIO.StringIO()
        reval = self.fetch(url, sio, post)
        if reval == '200':
            reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)
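With the urlencode handling in fetch(), a POST through the class might look like this; the endpoint and form fields are placeholders:

Python

get = spider()
# fetch() urlencodes the dict before handing it to POSTFIELDS
print get.html("http://localhost/login.php", {'username': 'hzq', 'password': 'blog'})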
http://www.cnblogs.com/huangcong/
This article is copyrighted jointly by the author and cnblogs (博客园). Reposting is welcome, but unless the author consents otherwise you must keep this statement and give a clearly visible link to the original on the article page; otherwise the right to pursue legal liability is reserved.