Python-爬虫初学

   # Scrape the PNG images referenced by a web page (here: the UESTC homepage) and save them locally
import os      # filesystem path handling
import re      # regular-expression library
import urllib  # URL library (Python 2 home of urlopen / urlretrieve)

try:  # Python 3 locations
    from urllib.parse import urljoin
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2 fallback
    from urllib import urlopen, urlretrieve
    from urlparse import urljoin

 4 def getHtml(url):
 5     page = urllib.urlopen(url) #打开链接
 6     html = page.read()         #像读文本一样读取网页内容
 7     return html
 8 
 9 def getImg(html):
10     reg = r'<img src="(.+?\.png)" alt'   #匹配表达式
11     imgre = re.compile(reg)              #编译成正则表达式对象
12     imglist =re.findall(imgre, html)     #查找全部满足匹配的
13     x = 0
14     for imgurl in imglist:
15         print "imgurl:", imgurl
16         urllib.urlretrieve("http://www.uestc.edu.cn/" + imgurl, '%d.png' % x)  #依次遍历下载，源链接用的是相对地址，所以添加前缀
17         x += 1
18     
if __name__ == "__main__":
    # Download the UESTC homepage and save every PNG it references.
    # getImg() returns nothing, so its result is not printed.
    html = getHtml("http://www.uestc.edu.cn/")
    getImg(html)
参考学习链接：
http://www.cnblogs.com/fnng/p/3576154.html