Python开发之爬虫实战
Requests+正则表达式爬取电影
1、目标站点分析
- 抓取单页内容:利用requests请求目标站点,得到单个网页HTML代码,返回结果
- 正则表达式分析:根据HTML代码分析得到电影的名称、主演、上映时间、评分、图片链接等信息
- 保存至文件:通过文件的形式将结果保存,每一部电影一个结果一行json字符串
- 开启循环及多线程:对多页内容遍历,开启多线程提高抓取速度。
2、实战
import requests
from requests.exceptions import RequestException
from multiprocessing import Pool
import re,json
def get_one_page(url, timeout=10):
    """Fetch one page of the Maoyan movie board and return its HTML.

    Args:
        url: The page URL to request.
        timeout: Seconds to wait for the server before aborting
            (new parameter; the default keeps old callers working).

    Returns:
        The response body as text on HTTP 200, otherwise None.
    """
    try:
        # A browser User-Agent is essential here: without it the site
        # is very likely to block the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        }
        # Bug fix: the original call had no timeout and could hang forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def parse_one_page(html):
    """Extract movie entries from one Maoyan board page.

    Yields one dict per <dd> entry with keys:
    index, image, title, actor, time, score.
    """
    pattern = re.compile(
        '<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        '.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        '.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S)
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            'actor': star.strip()[3:],    # drop the 3-char "主演:" prefix
            'time': release.strip()[5:],  # drop the 5-char "上映时间:" prefix
            'score': integer + fraction,  # integer part + fractional part
        }
def write_to_file(content):
    """Append one movie record to result.txt as a single JSON line.

    content is a dict; ensure_ascii=False keeps Chinese text readable
    in the output file.
    """
    line = json.dumps(content, ensure_ascii=False)
    with open('result.txt', 'a', encoding='utf8') as out:
        out.write(line + '\n')
def main(offset):
    """Crawl one board page at the given offset and persist every movie."""
    page_url = 'http://maoyan.com/board/4?offset=' + str(offset)
    page_html = get_one_page(page_url)
    for movie in parse_one_page(page_html):
        write_to_file(movie)  # one JSON line per movie
if __name__ == '__main__':
    # Crawl the 10 board pages in parallel worker processes.
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
    # Bug fix: shut the pool down cleanly instead of abandoning the
    # workers at interpreter exit.
    pool.close()
    pool.join()
抓取街拍美图实例
这个实例需要的库以及数据库有:请求库requests,解析库BeautifulSoup和正则,存储的数据库是MongoDB,需要pymongo库
1、流程框架:
- 抓取索引页内容:利用requests请求目标站点内容,得到索引网页HTML代码,返回结果
- 抓取详情页内容:解析返回结果,得到详情页的链接,并进一步抓取详情页的信息
- 下载图片与保存数据:将图片下载到本地,并把页面信息及图片URL保存到MongoDB
- 开启循环及多线程:对多页内容进行遍历,开启多线程提高抓取速度
2、实战
spider.py
import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode
import json,re
from bs4 import BeautifulSoup
import pymongo
from config import *
import os,hashlib
from multiprocessing import Pool
# Shared MongoDB handle used by every worker process. connect=False
# defers the actual connection until first use, which is required when
# the client is created before multiprocessing forks the workers.
client = pymongo.MongoClient(MONGO_URL,connect=False)
db = client[MONGO_DB]
def get_page_index(offset, keyword, timeout=10):
    """Request one page of Toutiao search results.

    Args:
        offset: Paging offset passed to the search API.
        keyword: The search keyword.
        timeout: Seconds before the request is aborted (new parameter;
            the default keeps existing callers working).

    Returns:
        The JSON response body as text on HTTP 200, otherwise None.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    }
    try:
        # Bug fix: the original call had no timeout and could block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页失败')
        return None
def parse_page_index(html):
    """Yield article detail-page URLs from an index-page JSON payload.

    Robustness fixes over the original:
    - a None/empty payload (failed fetch) now yields nothing instead of
      raising TypeError inside json.loads;
    - entries without an 'article_url' key are skipped instead of being
      yielded as None.
    """
    if not html:
        return
    data = json.loads(html)
    if data and 'data' in data.keys():
        for item in data.get('data'):
            url = item.get('article_url')
            if url:
                yield url
def get_page_detail(url, timeout=10):
    """Fetch an article detail page.

    Args:
        url: Detail-page URL; a falsy value short-circuits to None.
        timeout: Seconds before the request is aborted (new parameter;
            the default keeps existing callers working).

    Returns:
        The page HTML on HTTP 200, otherwise None.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
    }
    try:
        if url:
            # Bug fix: the original call had no timeout and could hang.
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response.text
        # Explicit None for both the non-200 and the falsy-url paths
        # (the original fell through to an implicit None for the latter).
        return None
    except RequestException:
        print('详情页出错', url)
        return None
def parse_page_deatil(html,url):
    """Parse a detail page: extract its title and gallery image URLs.

    Downloads each image as a side effect (via download_image) and
    returns {'title', 'url', 'images'} when a gallery is found;
    otherwise returns None implicitly.

    NOTE(review): the function name keeps its original misspelling
    ('deatil') so existing callers are unaffected.
    """
    soup = BeautifulSoup(html,'lxml')
    # The page <title> doubles as the article title.
    title = soup.select('title')[0].get_text()
    # The gallery JSON is embedded in a JSON.parse("...") call inside a
    # <script> tag, so it is pulled out with a regex rather than the DOM.
    images_pattern = re.compile('JSON.parse\("(.*?)"\)',re.S)
    result = re.search(images_pattern, html)
    if result:
        data = result.group(1)
        # The embedded JSON carries escaped quotes; unescape before parsing.
        data = data.replace('\\"', '"')
        data = json.loads(data)
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            # Side effect: fetch every gallery image to local disk.
            for image in images:download_image(image)
            return {
                'title':title,
                'url':url,
                'images':images
            }
def save_to_mongo(result):
    """Insert one parsed article record into the MongoDB collection.

    Returns:
        True when the write is acknowledged by the server, else False.
    """
    # Fix: Collection.insert() was deprecated in pymongo 3 and removed
    # in pymongo 4; insert_one() is the supported equivalent.
    if db[MONGO_TABLE].insert_one(result).acknowledged:
        print('成功存储到MongoDB')
        return True
    return False
def download_image(url):
    """Download one image and hand its bytes to save_image.

    Always returns None; request failures are reported on stdout.
    """
    try:
        resp = requests.get(url)
    except RequestException:
        print('请求图片失败')
        return None
    if resp.status_code == 200:
        save_image(resp.content)
    return None
def save_image(content):
    """Write image bytes to <cwd>/<md5-of-content>.jpg.

    Using the MD5 digest as the file name deduplicates identical
    images; an already-existing file is left untouched.
    """
    # Bug fix: the original format string '{0}/{1}.{2}' was given
    # '.jpg' (with a leading dot) as the third argument, producing
    # file names that ended in '..jpg'.
    file_name = hashlib.md5(content).hexdigest() + '.jpg'
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def main(offset):
    """Crawl one index page and persist every article found on it."""
    index_html = get_page_index(offset, KEYWORD)
    for detail_url in parse_page_index(index_html):
        detail_html = get_page_detail(detail_url)
        if detail_html:
            record = parse_page_deatil(detail_html, detail_url)
            if record:
                save_to_mongo(record)
if __name__ == '__main__':
    # One offset per result page: GROUP_START..GROUP_END, 20 per page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, groups)
    # Bug fix: shut the pool down cleanly instead of abandoning the
    # workers at interpreter exit.
    pool.close()
    pool.join()
config.py:配置文件
# config.py — crawler settings.
# Bug fix: the original jammed all six assignments onto one physical
# line, which is a syntax error in Python; one assignment per line.
MONGO_URL = 'localhost'   # MongoDB host
MONGO_DB = 'toutiao'      # database name
MONGO_TABLE = 'toutiao'   # collection name
GROUP_START = 1           # first page group to crawl
GROUP_END = 20            # last page group (inclusive)
KEYWORD = '街拍'          # search keyword