一、爬虫入门
网络爬虫(又被称为网页蜘蛛、网络机器人),是一种按照一定的规则,自动地抓取万维网信息的程序或者脚本。
运用python3.6中的urllib.request
1.快速爬取一个网页
(1)get请求方式
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
"""Fetch a Baidu search-results page with an HTTP GET request and save it to disk."""
import urllib.request

# keywd = "python"
keywd = "百度"
# Percent-encode the keyword so non-ASCII (Chinese) text is legal inside a URL.
keywd = urllib.request.quote(keywd)
url = "http://www.baidu.com/s?wd=" + keywd
req = urllib.request.Request(url)
# urlopen reads the whole response body into memory as bytes; the context
# manager guarantees the HTTP connection is closed even if read() fails.
with urllib.request.urlopen(req) as resp:
    data = resp.read()
# "wb" because the response is raw bytes; the with-block closes the file
# even if write() raises (the original leaked the handle in that case).
with open("F:/python/data/douban/2.html", "wb") as fh:
    fh.write(data)
(2)post请求方式
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Author:Du Fei
# POST request: simulate a login form submission and save the server response.
import urllib.request
import urllib.parse

url = "http://www.iqianyue.com/mypost/"
# urlencode the form fields, then encode to bytes: Request requires a
# bytes object (not str) for the POST body.
mydata = urllib.parse.urlencode({
    "name": "ceo@iqiaa.com",
    "pass": "123ssd",
}).encode("utf-8")
# Supplying a data argument makes urllib send a POST instead of a GET.
req = urllib.request.Request(url, mydata)
# Context manager closes the HTTP connection even if read() fails.
with urllib.request.urlopen(req) as resp:
    data = resp.read()
# with-block closes the file even if write() raises (the original
# leaked the handle on that error path).
with open("F:/python/data/douban/2_1.html", "wb") as fh:
    fh.write(data)