1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Time : 2018/7/10 22:34
4 # @Author : chenxiaowei
5 # @Email : chen1020xiaowei@163.com
6 # @File : parse_meinv.py
7 ###利用正则表达式匹配字符串爬取***的美女图片,保存相关数据到MongoDB并且把相关图片保存在本地###
8 from parse_config import *
9 import requests
10 import re
11 import json
12 import time
13 import pymongo
14 from requests.exceptions import RequestException
15 import hashlib
16
# MongoDB client created at import time from `mongo_url` (star-imported from
# parse_config).
db_client = pymongo.MongoClient(mongo_url)
# Handle to the database named by `mongo_database`; save_to_mongo() writes
# through it.
db = db_client[mongo_database]
19
20
def get_responses(url, timeout=10):
    """Fetch a page and return its body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection and freeze the whole crawl.

    Returns:
        The response text when the status code is 200; ``None`` for any
        other status code or when the request raises ``RequestException``.
    """
    try:
        responses = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # RequestException is the base class of the requests errors,
        # so connection failures and timeouts both end up here.
        print('error1')
        return None
    if responses.status_code == 200:
        return responses.text
    return None
31
32
def get_image_content(url, timeout=10):
    """Download a resource and return its raw bytes.

    Args:
        url: Address of the image to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as ``bytes`` when the status code is 200; ``None``
        for any other status code or when the request raises
        ``RequestException``.
    """
    try:
        responses = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        # Covers connection errors and timeouts alike.
        print('error2')
        return None
    if responses.status_code == 200:
        return responses.content
    return None
43
44
def download_image(content):
    """Write image bytes to disk under a name derived from their MD5 digest.

    The target path is ``path_image`` with its ``{}`` placeholder replaced by
    ``<md5 hex digest>.jpg``, so identical bytes always map to the same file.

    Args:
        content: Raw image bytes, or ``None``/empty when the download failed.

    Returns:
        None. Nothing is written when ``content`` is ``None`` or empty.
    """
    if not content:
        # get_image_content() returns None on failure; hashing None would
        # raise TypeError, and an empty body is not a usable image.
        print('download skipped: no image content')
        return
    filename = '{0}.{1}'.format(hashlib.md5(content).hexdigest(), 'jpg')
    # The with-block closes the file; no explicit close() is needed.
    with open(path_image.format(filename), 'wb') as f:
        f.write(content)
    print(filename, '下载成功!')
51
52
# Compiled once at import time. The five groups capture, in order: the link
# URL, the image URL, the title, the view-count text and the publish date of
# one <li> entry (re.S lets '.' span line breaks inside an entry).
_ITEM_PATTERN = re.compile(
    '<li>.*?<a.*?href="(.*?)".*?class="TypeBigPics".*?src="(.*?)".*?<span>(.*?)</span>'
    + '.*?class="IcoList">(.*?)</em>.*?class="IcoTime">(.*?)</em>', re.S)


def get_url_items(html):
    """Parse a listing page, yielding one record per entry and saving its image.

    Args:
        html: Page source as text. ``None`` or an empty string yields nothing,
            because get_responses() returns ``None`` when a page cannot be
            fetched.

    Yields:
        dict: One record per matched entry with the keys '名称' (title),
        '壁纸' (image URL), '网址' (link URL), '发布日期' (publish date) and
        '查看次数' (view-count text with its first three characters removed).

    Note:
        The image of an entry is downloaded only after the consumer resumes
        the generator, i.e. after the yielded record has been handled.
    """
    if not html:
        return
    for link, image_url, title, views, published in _ITEM_PATTERN.findall(html):
        yield {
            '名称': title,
            '壁纸': image_url,
            '网址': link,
            '发布日期': published,
            # Drops the first three characters - presumably a label in front
            # of the number; confirm against the page markup.
            '查看次数': views[3:]
        }
        content = get_image_content(image_url)
        if content is not None:
            # A failed download returns None; skip it rather than crash.
            download_image(content)
67
68
def save_to_file(filename, file_type, text):
    """Append one record as a JSON line to a local UTF-8 text file.

    Args:
        filename: Base name of the output file.
        file_type: Suffix appended directly after ``filename``.
        text: JSON-serialisable object to store.

    The target path is ``path_txt + filename + file_type``. Non-ASCII
    characters are written as-is (``ensure_ascii=False``).
    """
    target = '{}{}{}'.format(path_txt, filename, file_type)
    # The with-block closes the file; no explicit close() is needed.
    with open(target, 'a', encoding='utf-8') as wf:
        wf.write(json.dumps(text, ensure_ascii=False) + '\n')
    print(text, '写入到本地成功!')
74
75
def save_to_mongo(text):
    """Insert one record into the MongoDB collection ``db[mongo_table]``.

    Uses ``insert_one``: the legacy ``Collection.insert`` was deprecated in
    PyMongo 3.0 and removed in 4.0.

    Args:
        text: Document (dict) to insert. The driver adds an ``_id`` key to it
            in place when the document has none.

    Returns:
        True when the insert reports an id for the new document, else False.
    """
    result = db[mongo_table].insert_one(text)
    if result.inserted_id is not None:
        print(text, '写入Mongo成功!')
        return True
    return False
81
82
def main(filename, page):
    """Scrape one listing page and persist every record found on it.

    Args:
        filename: Base name of the text file the records are appended to.
        page: Page number substituted into the listing URL.

    Each record is written to the local text file and then inserted into
    MongoDB. A page that cannot be fetched is skipped.
    """
    url = 'http://www.***/bizhitupian/meinvbizhi/{}.htm'.format(page)
    html = get_responses(url)
    if html is None:
        # get_responses() returns None on a non-200 status or a request
        # error; the regex parser cannot accept None.
        print('page {} skipped: no HTML returned'.format(page))
        return
    for item in get_url_items(html):
        save_to_file(filename, file_type, item)
        save_to_mongo(item)
90
91
if __name__ == '__main__':
    # Pass the extension-less `filename`: save_to_file() appends `file_type`
    # itself, so passing `file` (which already ends in '.txt') produced a
    # doubled '.txt.txt' extension.
    for page in range(start_page, end_page + 1):
        main(filename, page)
        if page != end_page:
            # Wait 15 seconds between pages so the crawl is less likely to be
            # detected; no wait is needed after the final page.
            time.sleep(15)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # @Time : 2018/7/10 22:35
4 # @Author : chenxiaowei
5 # @Email : chen1020xiaowei@163.com
6 # @File : parse_config.py
# Connection target handed to pymongo.MongoClient.
mongo_url = 'localhost'
# Name of the database selected on the client.
mongo_database = 'youmeiwang'
# Name of the collection that records are inserted into.
mongo_table = 'meinv'
# HTTP headers sent with every requests.get call.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36'
}
# Text output file name including its extension.
file = '***美女图片.txt'
# Path template for downloaded images; '{}' receives the image file name.
path_image = 'H:/Python_download/20180710/image/{}'
# Directory prefix for the text output file (ends with a slash because the
# name is concatenated directly onto it).
path_txt = 'H:/Python_download/20180710/file/'
# Text output file name without extension.
filename = '***美女图片'
# Extension appended to the text output file name.
file_type = '.txt'
# First and last listing page to crawl (both inclusive).
start_page = 1
end_page = 44