思路:

  先用正则表达式把需要获取的内容匹配出来,然后用"永真"循环(即 while True:)不断递增 URL 中的 pn 参数(在百度点击"下一页"时,URL 的 pn 参数就会增加 10);pn 每增加 10,就请求一次对应的 URL 并提取一次数据。

 

#-*-coding:UTF-8-*-

import sys,re,requests,graphics,Tkinter
import easygui as gui

# Interactive Baidu scraper (Python 2): prompts for a keyword, fetches one
# result page at a time, prints "title" and "url" for every result found on
# it, then asks whether to go on to the next page ("q" quits).
import json  # decodes the data-tools payload; replaces eval() on remote data

string = raw_input("string is :")
pn = 0
url = "http://www.baidu.com/s"
# Captures the raw value of the data-tools attribute of each result <div>.
res = "<div .*? data-tools=(.*?)>.*?</div>"
while True:
    # Pass the keyword through `params` so requests URL-encodes it; pasting it
    # into the URL by hand breaks on spaces, '&', '#', etc.
    html = requests.get(url, params={"wd": string, "pn": pn}).text
    # The next page is reached by increasing pn by 10.
    pn += 10
    con = re.findall(res, html)
    for i in con:
        try:
            # The captured attribute value is JSON text wrapped in single
            # quotes. SECURITY: the original ran eval() on this string, which
            # executes whatever the remote page sends; json.loads only parses.
            d = json.loads(i.strip("'"))
            line = "title:" + d[u'title'] + "  " + d['url']
        except (ValueError, KeyError, TypeError) as err:
            # One malformed result must not abort the rest of the page.
            sys.stderr.write("skipping unparsable result: %r\n" % (err,))
            continue
        print(line)

    num = raw_input(u"e or q:")
    if num == "q":
        # sys.exit() instead of exit(): the latter is an interactive helper
        # injected by the site module and is not guaranteed to exist.
        sys.exit()

 后来又对脚本做了一些修改,修改后的版本如下:

 #!/usr/bin/env python
#encoding:utf-8
#by i3ekr
"""Walk Baidu result pages for a keyword and print where each result leads.

Python 2 script. For every result found on a page, the link stored in its
data-tools attribute is requested and the URL of the response is printed,
prefixed with the page counter. Paging stops at the first page that yields
no results.
"""

import sys,re,requests,time,json
print("""
                        
                       #G                    
                       #K                    
                      .Et                    
                      :#                     
                    : ##                     
                    ##Dj K                   
                   .####G###                 
                   E;#####f;                 
                    ########                 
                    #######.                 
                    .i#L#,t                  
                    DEDECMS               
                        
""")
string = raw_input("string is :")
pn = 0   # Baidu paging offset, advanced by 10 per page
nn = 0   # number of pages fetched so far; shown in brackets in the output
timeout = 10  # seconds; without it a stalled connection blocks forever
r = requests.session()
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
url = "http://www.baidu.com/s"
# Captures the raw (still quoted) value of each result's data-tools attribute.
res = "<div .*? data-tools=(.*?)>.*?</div>"
while True:
    # `params` lets requests URL-encode the keyword (spaces, '&', '#', ...).
    html = r.get(url, params={"wd": string, "pn": pn}, headers=head,
                 timeout=timeout).text
    con = re.findall(res, html)
    pn += 10
    nn += 1
    if not con:
        # Nothing matched on this page: stop instead of requesting further
        # pages forever.
        sys.stderr.write("no results on page %d, stopping\n" % nn)
        break
    for i in con:
        try:
            # SECURITY: the original used eval(eval(i)) on text taken from the
            # remote page, which can execute arbitrary code. Strip the
            # surrounding attribute quotes and parse the payload as JSON.
            a = json.loads(i.strip("'\""))
            target = a["url"]
        except (ValueError, KeyError, TypeError) as err:
            # Handle failures per result so that one bad entry does not
            # silently drop the remaining results of the page.
            sys.stderr.write("skipping unparsable result: %r\n" % (err,))
            continue
        try:
            b = r.get(target, headers=head, timeout=timeout)
        except requests.RequestException as err:
            sys.stderr.write("request failed for %r: %r\n" % (target, err))
            continue
        # b.url is the URL of the response actually received for the link.
        print("[%s] %s" % (nn, b.url))

 

相关文章:

  • 2021-08-13
  • 2022-12-23
  • 2021-08-10
  • 2022-12-23
  • 2022-12-23
  • 2022-12-23
  • 2021-08-25
  • 2022-12-23
猜你喜欢
  • 2022-12-23
  • 2022-12-23
  • 2022-12-23
  • 2021-12-23
  • 2021-08-20
  • 2022-12-23
  • 2022-12-23
相关资源
相似解决方案