单线程版:

 1 import  urllib.request
 2 import urllib.parse
 3 import urllib.error
 4 import re,time
 5 headers = ("User-Agent",
 6            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
 7 operner = urllib.request.build_opener()
 8 operner.addheaders = [headers]
 9 urllib.request.install_opener(operner)
10  
11 list_url = []
12  
13  
14 ###使用代理获取网页url内容
###Fetch the content of a URL (proxy support kept commented out).
def use_proxy(url):
    """Download *url* and return its body decoded as UTF-8.

    Returns None when the request fails; errors are printed rather than
    raised, so callers must check for a None result.
    """
    try:
        # proxy = urllib.request.ProxyHandler({'http':proxy_addr})    ## proxy variant
        # operner = urllib.request.build_opener(proxy)
        # urllib.request.install_opener(operner)
        #
        # FIX: send the browser-like User-Agent on this request only.  The
        # original rebuilt AND globally installed a fresh opener on every
        # single call — a needless module-wide side effect per download.
        req = urllib.request.Request(url, headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36",
        })
        data = urllib.request.urlopen(req).read().decode('utf-8')
        return data
    except urllib.error.URLError as e:
        # HTTPError instances carry a status code; other URLErrors only a reason.
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)

    except Exception as e:
        print("exception" + str(e))
        # Brief pause so repeated unexpected failures don't hammer the server.
        time.sleep(1)
37  
38 ##获取要爬取的url
##Build the list of article URLs to crawl from Sogou WeChat search results.
def get_url(key, pagestart, pageend):
    """Search Sogou WeChat for *key* and collect article links.

    Fetches result pages pagestart..pageend (inclusive), appends every
    extracted URL to the module-level ``list_url``, and returns it.
    Returns None only if an unexpected error escapes the loop.
    """
    try:
        keycode = urllib.parse.quote(key)

        for page in range(pagestart, pageend + 1):
            # BUGFIX: the original substituted the page number into "type=%d"
            # and hard-coded "page=1", so every iteration requested the same
            # first page with a bogus search type.  type=2 is Sogou's article
            # search; the loop variable belongs in page=%d.
            url = "http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input&_sug_=n&type=2&page=%d&ie=utf8" % (
                keycode, page)
            data1 = use_proxy(url)
            if data1 is None:
                # use_proxy already reported the failure; skip this page
                # instead of crashing findall() on None.
                continue
            # Each result link sits inside an <h3> element.
            listurl_pattern = '<h3>.*?("http://.*?)</h3>'
            result = re.compile(listurl_pattern, re.S).findall(data1)
            for match in result:
                # Strip HTML-entity noise and the surrounding quotes.
                res = match.replace("amp;", "").split(" ")[0].replace("\"", "")
                list_url.append(res)
        return list_url
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception:", e)
63  
64 ##通过获取的url爬行内容数据并处理
##Crawl every collected URL and write title + content into one HTML file.
def get_url_content(list_url):
    """Download each article in *list_url* and append it to D:\\python-script\\1.html.

    Writes an HTML skeleton first, then for every URL the extracted <h2>
    title followed by the matched content fragments, and finally the
    closing tags. Returns None.
    """
    html1 = '''<!DOCTYPE html>\n<html xmlns="http://www.w3.org/1999/xhmtl">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n<title>微信文章</title></head>\n<body>'''
    with open("D:\\python-script\\1.html", 'wb') as fh1:
        fh1.write(html1.encode("utf-8"))
    # Re-open in append mode; the 'with' guarantees the handle is closed —
    # the original opened `fh` and never called close(), leaking it.
    with open("D:\\python-script\\1.html", 'ab') as fh:
        for url in list_url:
            data_content = use_proxy(url)
            if data_content is None:
                # Download failed (already reported); skip this article
                # instead of crashing findall() on None.
                continue
            title_pattern = '<h2.*>.*?</h2>'
            result_title = re.compile(title_pattern, re.S).findall(data_content)
            if not result_title:
                # No <h2> found (layout changed?); nothing to extract here.
                continue
            ##Title (str)
            res_title = result_title[0].replace("<h2 class=\"rich_media_title\" id=\"activity-name\">", "").replace("</h2>",
                                                                                              "").strip()

            content_pattern = '>'
            content = re.compile(content_pattern, re.S).findall(data_content)

            try:
                fh.write(res_title.encode("utf-8"))
                for i in content:
                    fh.write(i.strip().encode("utf-8"))
            except UnicodeEncodeError:
                continue

        fh.write("</body></html>".encode("utf-8"))
92  
if __name__ == '__main__':
    # Crawl result pages 1-2 of a Sogou WeChat search for the keyword,
    # then dump every found article into one local HTML file.
    key = "人工智能"
    pagestart, pageend = 1, 2
    get_url(key, pagestart, pageend)
    get_url_content(list_url)
View Code

相关文章:

  • 2022-12-23
  • 2021-07-07
  • 2021-11-02
  • 2022-01-03
  • 2021-05-19
  • 2021-06-26
  • 2021-09-15
  • 2022-02-09
猜你喜欢
  • 2022-12-23
  • 2021-11-23
  • 2022-12-23
  • 2022-12-23
  • 2021-07-02
  • 2022-12-23
  • 2021-11-20
相关资源
相似解决方案