实验1:爬取京东网。
import requests

# Experiment 1: fetch a JD.com product page and print the first 1000 chars.
url = "http://item.jd.com/10460106645"
try:
    # timeout keeps the script from hanging forever on a stalled connection
    r = requests.get(url, timeout=10)
    r.raise_for_status()  # raise HTTPError on 4xx/5xx responses
    # use the encoding detected from the body, which is more reliable
    # than the (often missing/wrong) charset in the response headers
    r.encoding = r.apparent_encoding
    print(r.text[:1000])
except requests.RequestException:
    # catch only requests-related failures; a bare except would also
    # swallow KeyboardInterrupt and programming errors
    print("爬取失败")
实验二: 爬取amazon网
import requests

# Experiment 2: fetch an Amazon.cn product page, spoofing a browser User-Agent.
url = "https://www.amazon.cn/dp/B00RT6LB9W/ref=cngwdyfloorv2_recs_0?pf_rd_p=05f2b7d6-37ec-49bf-8fcf-5d2fec23a061&pf_rd_s=desktop-2&pf_rd_t=36701&pf_rd_i=desktop&pf_rd_m=A1AJ19PSB66TGU&pf_rd_r=TYTEFRZ086W1AQREBTFK&pf_rd_r=TYTEFRZ086W1AQREBTFK&pf_rd_p=05f2b7d6-37ec-49bf-8fcf-5d2fec23a061"
try:
    # Override the user-agent with a standard-browser value, because some
    # sites block crawlers. Without this, r.request.headers shows:
    #   {'User-Agent': 'python-requests/2.18.4', 'Accept-Encoding': 'gzip, deflate',
    #    'Accept': '*/*', 'Connection': 'keep-alive'}
    # and the site may reject the python-requests user-agent.
    # After the change, r.request.headers shows:
    #   {'user-agent': 'Mozilla/5.0', 'Accept-Encoding': 'gzip, deflate',
    #    'Accept': '*/*', 'Connection': 'keep-alive'}
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv, timeout=10)  # timeout avoids hanging
    r.raise_for_status()  # raise HTTPError on 4xx/5xx responses
    r.encoding = r.apparent_encoding  # decode using body-detected charset
    print(r.text)
except requests.RequestException:
    # only network/HTTP failures should fall through to this message
    print("爬取失败")
实验三 百度360搜索关键词提交
搜索引擎关键词提交接口:
百度:http://www.baidu.com/s?wd=keyword
360: http://www.so.com/s?q=keyword
Baidu爬虫全码
import requests

# Experiment 3a: submit a keyword to Baidu via its search interface
# (http://www.baidu.com/s?wd=keyword) and report the response size.
keyword = "Python"
try:
    # pass the keyword through params so requests URL-encodes it safely
    kv = {'wd': keyword}
    r = requests.get("http://www.baidu.com/s", params=kv, timeout=10)
    print(r.request.url)  # show the fully assembled request URL
    r.raise_for_status()  # raise HTTPError on 4xx/5xx responses
    print(len(r.text))
except requests.RequestException:
    # narrow exception: don't mask programming errors with a bare except
    print("Failed")
360搜索全码
import requests

# Experiment 3b: submit a keyword to 360 Search
# (http://www.so.com/s?q=keyword) and report the response size.
keyword = "Python"
try:
    # params handles URL-encoding of the query value
    kv = {'q': keyword}
    r = requests.get("http://www.so.com/s", params=kv, timeout=10)
    print(r.request.url)  # show the fully assembled request URL
    r.raise_for_status()  # raise HTTPError on 4xx/5xx responses
    print(len(r.text))
except requests.RequestException:
    # narrow exception: only network/HTTP failures reach here
    print("Failed")
实验4 网络图片的爬取
网络图片的链接格式: http://www.example.com/picture.jpg
import requests

# Experiment 4: download an image over HTTP and save it to a local file.
path = "C:/zhj/abc.jpg"
url = "https://www.nationalgeographic.com/content/dam/travel/2018-digital/wild-wonders-of-europe/wild-wonders-of-europe-23.ngsversion.1525723673468.adapt.676.1.jpg"
try:
    r = requests.get(url, timeout=10)
    # original merely evaluated r.status_code and discarded it; actually
    # fail fast on HTTP errors instead of writing an error page to disk
    r.raise_for_status()
    # binary mode: r.content is raw bytes, never decoded as text
    with open(path, 'wb') as f:
        f.write(r.content)
except requests.RequestException:
    print("Failed")
实验五 IP地址归属地自动查询
http://m.ip138.com/ip.asp?ip=ipaddress
import requests

# Experiment 5: query ip138's web form to look up an IP address's location
# and print the tail of the returned HTML page.
url = "http://m.ip138.com/ip.asp?ip="
try:
    # the IP to look up is appended directly to the query string
    r = requests.get(url + '202.116.65.13', timeout=10)
    r.raise_for_status()  # raise HTTPError on 4xx/5xx responses
    r.encoding = r.apparent_encoding  # decode using body-detected charset
    # the lookup result appears near the end of the page, so print the tail
    print(r.text[-500:])
except requests.RequestException:
    # narrow exception: don't hide programming errors behind a bare except
    print("Failed")