Python

import urllib
urlItem = urllib.urlopen("http://www.baidu.com")
htmSource = urlItem.read()
urlItem.close()
print htmSource
pycurl
http://pycurl.sourceforge.net/download/
http://pycurl.sourceforge.net/doc/curlobject.html
Python

import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.HTTPHEADER, ["Accept:"])  # a header with an empty value removes the default Accept header
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.FOLLOWLOCATION, 1)
c.setopt(pycurl.MAXREDIRS, 5)
c.perform()
print b.getvalue()
print c.getinfo(pycurl.INFO_FILETIME)  # -1 unless OPT_FILETIME was set; see the sketch below
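The INFO_FILETIME call on the last line prints -1 unless libcurl was told to fetch the document date before perform(). A minimal sketch, assuming pycurl exposes CURLOPT_FILETIME as pycurl.OPT_FILETIME (renamed to avoid the clash with INFO_FILETIME):

Python

import pycurl
import StringIO

c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
b = StringIO.StringIO()
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.setopt(pycurl.OPT_FILETIME, 1)  # ask libcurl to retrieve the remote document date
c.perform()
print c.getinfo(pycurl.INFO_FILETIME)  # unix timestamp, or -1 if the server did not report one
c.close()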
curl_easy_setopt
Tells libcurl how to do its job.
CURLOPT_WRITEFUNCTION: write (download) callback; libcurl passes a data pointer for you to handle. The size of one callback chunk is capped by CURL_MAX_WRITE_SIZE (defined in the curl.h header).
CURLOPT_WRITEDATA: write the response straight to a file; pass an open file object, e.g. c.setopt(pycurl.WRITEDATA, fp) (a bare file-name string such as 'E:\WebSite\py\1.txt' is noted not to work on Windows).
CURLOPT_READFUNCTION: read (upload) callback.
CURLOPT_SEEKFUNCTION: seek callback for moving the data pointer: int function(void *instream, curl_off_t offset, int origin); origin is SEEK_SET, SEEK_CUR, or SEEK_END; return CURL_SEEKFUNC_OK, CURL_SEEKFUNC_FAIL, or CURL_SEEKFUNC_CANTSEEK (0, 1, 2).
CURLOPT_OPENSOCKETFUNCTION:
CURLOPT_HEADERFUNCTION: receives header data only: size_t function(void *ptr, size_t size, size_t nmemb, void *userdata); (see the sketch after this list).
CURLOPT_DEBUGFUNCTION: int curl_debug_callback(CURL *, curl_infotype, char *, size_t, void *);
CURLOPT_VERBOSE: set to 1 to print verbose details.
CURLOPT_HEADER: set to 1 to include the response headers in the returned body.
CURLOPT_NOSIGNAL: keeps libcurl from using signals; needed for timeouts to work safely in multi-threaded programs.
CURLOPT_FOLLOWLOCATION: set to 1 to tell libcurl to follow Location: redirects.
CURLOPT_MAXREDIRS: limit on the number of redirects to follow; -1 means unlimited (the default).
CURLOPT_PUT: for uploading data with HTTP PUT.
CURLOPT_POST:
CURLOPT_POSTREDIR:
CURLOPT_POSTFIELDS: (see the POST sketch after this list)
CURLOPT_POSTFIELDSIZE:
CURLOPT_POSTFIELDSIZE_LARGE:
CURLOPT_COPYPOSTFIELDS:
CURLOPT_HTTPPOST:
CURLOPT_UPLOAD:
CURLOPT_AUTOREFERER: have libcurl set the Referer header automatically.
CURLOPT_REFERER: spoof the Referer (source URL).
CURLOPT_USERAGENT: set a custom User-Agent.
CURLOPT_HTTPHEADER: set custom headers.
CURLOPT_COOKIE: "name1=content1; name2=content2;"
CURLOPT_COOKIEFILE:
CURLOPT_COOKIEJAR:
CURLOPT_COOKIESESSION: by default libcurl loads and stores all cookies; set to 1 to start a new cookie session and ignore session cookies from before.
CURLOPT_COOKIELIST
CURLOPT_HTTPGET
CURLOPT_HTTP_VERSION: CURL_HTTP_VERSION_NONE, CURL_HTTP_VERSION_1_0, CURL_HTTP_VERSION_1_1
CURLOPT_IGNORE_CONTENT_LENGTH: ignore the Content-Length header; for servers like Apache 1.x.
CURLOPT_HTTP_TRANSFER_DECODING: tells libcurl whether to decode the transfer encoding (0 or 1; default 1).
CURLOPT_HTTP200ALIASES: custom aliases for the HTTP 200 response; some servers return a nonstandard 200 line.
CURLOPT_ENCODING: the content encodings to accept, i.e. Accept-Encoding ('', 'gzip', ...).
CURLOPT_UNRESTRICTED_AUTH: set to 1 to keep sending authentication (user + password) even when redirects lead to other hosts.
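A sketch of how a few of these options combine in pycurl; the endpoint and form fields below are made-up placeholders. CURLOPT_POSTFIELDS expects an already-encoded string, so a dict goes through urllib.urlencode first (setting POSTFIELDS also switches the request to POST), and HEADERFUNCTION receives the raw response headers separately from the body:

Python

import pycurl
import urllib
import StringIO

body = StringIO.StringIO()
headers = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://localhost/login.php")  # hypothetical endpoint
c.setopt(pycurl.POSTFIELDS, urllib.urlencode({'username': 'hzq', 'password': 'blog'}))
c.setopt(pycurl.WRITEFUNCTION, body.write)      # response body goes here
c.setopt(pycurl.HEADERFUNCTION, headers.write)  # raw header lines go here
c.setopt(pycurl.ENCODING, 'gzip')               # send Accept-Encoding: gzip and decompress automatically
c.perform()
print headers.getvalue()
print body.getvalue()
c.close()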
NETWORK OPTIONS
CURLOPT_URL: http://xxxx, ftp://xxxx
CURLOPT_PROXY: HTTP proxy, as a host name or IP address (see the sketch after this list).
CURLOPT_PROXYPORT: proxy port; it can also be appended to the PROXY address as ":port", e.g. :8080.
CURLOPT_PROXYTYPE: proxy type: CURLPROXY_HTTP (default), CURLPROXY_HTTP_1_0, CURLPROXY_SOCKS4, CURLPROXY_SOCKS5, CURLPROXY_SOCKS4A, CURLPROXY_SOCKS5_HOSTNAME
CURLOPT_NOPROXY: domains that bypass the proxy.
CURLOPT_HTTPPROXYTUNNEL:
CURLOPT_BUFFERSIZE: libcurl's receive buffer size, in bytes.
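A minimal proxy sketch; the SOCKS5 proxy at 127.0.0.1:1080 is a placeholder:

Python

import pycurl
import StringIO

b = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.PROXY, "127.0.0.1")  # host name or IP; ":port" may be appended here instead
c.setopt(pycurl.PROXYPORT, 1080)
c.setopt(pycurl.PROXYTYPE, pycurl.PROXYTYPE_SOCKS5)  # default is PROXYTYPE_HTTP
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)
c.close()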
(Authentication)
CURLOPT_NETRC: controls password lookup in your ~/.netrc file: CURL_NETRC_IGNORED (the default) ignores the file; CURL_NETRC_OPTIONAL uses the file if present; CURL_NETRC_REQUIRED requires the file and ignores credentials given in the URL.
CURLOPT_NETRC_FILE: point to a specific netrc-style file instead of ~/.netrc.
CURLOPT_USERNAME:
CURLOPT_USERPWD:
CURLOPT_PASSWORD:
CURLOPT_PROXYUSERNAME:
CURLOPT_PROXYUSERPWD:
CURLOPT_HTTPAUTH: (see the sketch after this list)
CURLOPT_PROXYAUTH:
- CURLAUTH_BASIC: HTTP Basic authentication
- CURLAUTH_DIGEST: HTTP Digest authentication
- CURLAUTH_DIGEST_IE:
- CURLAUTH_GSSNEGOTIATE: Kerberos 5 authentication; requires a GSS-API build
- CURLAUTH_NTLM: NTLM authentication
- CURLAUTH_ANY: enables all methods; libcurl picks the one it considers most suitable and secure
- CURLAUTH_ANYSAFE: enables all methods except Basic
- CURLAUTH_ONLY: a modifier bit that forces exactly the single method it is OR'ed with
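For instance, HTTP Basic credentials might be sent like this (the URL and user:password pair are placeholders; HTTPAUTH_ANY would let libcurl negotiate the method instead):

Python

import pycurl
import StringIO

b = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://localhost/protected/")  # hypothetical protected page
c.setopt(pycurl.HTTPAUTH, pycurl.HTTPAUTH_BASIC)
c.setopt(pycurl.USERPWD, "hzq:blog")  # "user:password"
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)
c.close()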
getinfo
CURLINFO_RESPONSE_CODE: the last received HTTP or FTP status code, e.g. 200, 404, 403, 505; for a proxy's CONNECT response see CURLINFO_HTTP_CONNECTCODE
CURLINFO_EFFECTIVE_URL: the last effective URL used
CURLINFO_HTTP_CONNECTCODE: the last received proxy CONNECT response code, as a long
CURLINFO_FILETIME:
CURLINFO_TOTAL_TIME:
CURLINFO_CONNECT_TIME:
CURLINFO_NUM_CONNECTS: how many connections were used
CURLINFO_CONTENT_TYPE: e.g. text/html
CURLINFO_REQUEST_SIZE:
CURLINFO_HEADER_SIZE:
CURLINFO_SIZE_DOWNLOAD: total bytes downloaded
CURLINFO_SIZE_UPLOAD:
CURLINFO_HTTPAUTH_AVAIL: bitmask of the authentication methods the server offers
CURLINFO_PROXYAUTH_AVAIL: bitmask of the proxy authentication methods offered
CURLINFO_COOKIELIST: in pycurl some of these drop the CURLINFO_ prefix down to INFO_, e.g. INFO_COOKIELIST
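A sketch reading several of these after perform(), following the same StringIO pattern as above:

Python

import pycurl
import StringIO

b = StringIO.StringIO()
c = pycurl.Curl()
c.setopt(pycurl.URL, "http://www.whiledo.com/")
c.setopt(pycurl.WRITEFUNCTION, b.write)
c.perform()
print c.getinfo(pycurl.RESPONSE_CODE)  # e.g. 200
print c.getinfo(pycurl.EFFECTIVE_URL)  # final URL after any redirects
print c.getinfo(pycurl.CONTENT_TYPE)   # e.g. text/html
print c.getinfo(pycurl.TOTAL_TIME)     # total transfer time in seconds
print c.getinfo(pycurl.SIZE_DOWNLOAD)  # bytes downloaded
c.close()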
A rough crawler example sharing one Curl object
Python

import pycurl
import StringIO
import random

class spider:
    def __init__(self, addHeader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            #,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + addHeader
        # one Curl handle is created once and reused for every request
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        self.curl.setopt(pycurl.REFERER, 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + self.rand_str())
        #self.curl.setopt(pycurl.AUTOREFERER, 1)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.MAXREDIRS, 5)

    def __del__(self):
        pass

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, fp.write)
        self.curl.perform()
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.WRITEFUNCTION, sio.write)
        self.curl.perform()
        reval = sio.getvalue()
        sio.close()
        return reval

if __name__ == "__main__":
    get = spider(['USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'])
    print get.html("http://localhost/spider_for_test.php")
    print get.tofile("http://localhost/spider_for_test.php", r'E:\WebSite\wwwroot\test.txt')
A multi-threaded crawler example
Python

import pycurl
import threading
import StringIO
import random

class spider:
    def __init__(self, referer='', httpheader=[]):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] + httpheader
        self.referer = referer

    def __del__(self):
        pass

    def fetch(self, url, stream):
        # a fresh Curl handle per call, so concurrent threads never share one
        curl = pycurl.Curl()
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.perform()
        curl.close()

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename):
        fp = open(filename, 'w')
        self.fetch(url, fp)
        fp.close()
        return True

    def html(self, url):
        sio = StringIO.StringIO()
        self.fetch(url, sio)
        reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    import datetime
    dstart = datetime.datetime.now()
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    thread_pool = []
    for i in range(10):
        url = "http://localhost/test.php?n=" + str(i)
        th = threading.Thread(target=gethtml, args=(url, get))
        thread_pool.append(th)
    for i in range(10):
        thread_pool[i].start()
    for i in range(10):
        thread_pool[i].join()
    dend = datetime.datetime.now()
    print "Time span:", dend - dstart
The WDPYSPIDER class (supports multithreading, proxies, login authentication, and POST)
Python

#coding:utf-8
import pycurl
import urllib
import threading
import StringIO
import random

class spider:
    '''WDPYSPIDER (Whiledo Python Spider Class) crawler class

    @author HzqGhost admin@whiledo.com QQ:313143468
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q='+get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)'''
    def __init__(self):
        self.httpheader = [
            'Accept:application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5'
            ,'USER_AGENT:Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)'
        ] # HTTP headers
        self.referer = '' # spoofed Referer; empty means AUTOREFERER is used
        self.connecttimeout = 60 # connect timeout (seconds)
        self.timeout = 300 # read timeout (seconds)
        self.backheader = 0 # include server HTTP headers in the output (mainly for testing)
        self.cookiesfile = "./cookies.dat" # cookie file, read and written automatically
        self.proxyuse = False # whether to use a proxy server
        self.proxyip = [] # list of proxy [IP:PORT]; one is picked at random per request
        self.proxynodomain = ['localhost','127.0.0.1'] # domains fetched without the proxy
        self.http200alias = [] # aliases for nonstandard HTTP 200 response lines
        self.error = 'WDPYERROR' # error marker returned for non-200 statuses

    def __del__(self):
        pass

    def fetch(self, url, stream, post={}):
        '''
        --url
        --stream [stream] StringIO or fp
        --post [dict] {'username':'hzq','password':'blog'}'''
        curl = pycurl.Curl()
        curl.setopt(pycurl.CONNECTTIMEOUT, self.connecttimeout)
        curl.setopt(pycurl.TIMEOUT, self.timeout)
        curl.setopt(pycurl.HTTPHEADER, self.httpheader)
        curl.setopt(pycurl.HTTP200ALIASES, self.http200alias)
        curl.setopt(pycurl.HEADER, self.backheader)
        curl.setopt(pycurl.FOLLOWLOCATION, 1)
        curl.setopt(pycurl.MAXREDIRS, 5)
        if self.referer == '':
            curl.setopt(pycurl.AUTOREFERER, 1)
        else:
            curl.setopt(pycurl.REFERER, self.referer)
        curl.setopt(pycurl.COOKIEJAR, self.cookiesfile)
        curl.setopt(pycurl.COOKIEFILE, self.cookiesfile)
        curl.setopt(pycurl.WRITEFUNCTION, stream.write)
        curl.setopt(pycurl.URL, url)
        if self.proxyuse:
            proxyip = self.proxyip[random.randint(0, len(self.proxyip) - 1)]
            curl.setopt(pycurl.PROXY, proxyip)
            #curl.setopt(pycurl.NOPROXY, ','.join(self.proxynodomain)) # needs libcurl/pycurl >= 7.19.4
        if len(post) > 0:
            # POSTFIELDS wants an encoded string, not a dict
            curl.setopt(pycurl.POSTFIELDS, urllib.urlencode(post))
        status = ''
        try:
            curl.perform()
            status = curl.getinfo(pycurl.RESPONSE_CODE)
        except:
            status = curl.errstr()
        finally:
            curl.close()
        status = str(status)
        if status != '200':
            status = self.error
        return status

    def rand_str(self):
        return ''.join(random.sample(['a','b','c','d','e','f','g','h','i','j','k','l','m','n'], 6))

    def tofile(self, url, filename, post={}):
        fp = open(filename, 'wb')
        self.fetch(url, fp, post)
        fp.close()
        return True

    def html(self, url, post={}):
        sio = StringIO.StringIO()
        reval = self.fetch(url, sio, post)
        if reval == '200':
            reval = sio.getvalue()
        sio.close()
        return reval

def gethtml(url, get):
    print get.html(url)

if __name__ == "__main__":
    get = spider()
    get.referer = 'http://www.google.com/search?sourceid=chrome&ie=UTF-8&q=' + get.rand_str()
    get.proxyuse = True
    get.proxyip = ['059148233056.ctinets.com:80']
    url = "http://www.whiledo.com"
    print get.html(url=url)
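With the urlencode handling in fetch(), a POST through the class might look like this; the endpoint and form fields are placeholders:

Python

get = spider()
# fetch() urlencodes the dict before handing it to POSTFIELDS
print get.html("http://localhost/login.php", {'username': 'hzq', 'password': 'blog'})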
http://www.cnblogs.com/huangcong/
This article is copyrighted jointly by the author and cnblogs (博客园). Reposting is welcome, but unless the author consents otherwise you must keep this statement and give a clearly visible link to the original on the article page; otherwise the right to pursue legal liability is reserved.