# -*- coding: utf-8 -*-
"""Fetch one page of a Baidu Tieba thread and print the text of its posts."""
import re
import time
import urllib

try:  # Python 2 module names
    import thread
    import urllib2
except ImportError:  # Python 3: same APIs under relocated module names
    import _thread as thread
    import urllib.request as urllib2


class BDTB:
    """Reader for a single Tieba thread.

    Args:
        baseurl: Thread address without a query string.
        seeLz: Value sent as the ``see_lz`` query parameter
            (presumably 1 limits output to the thread starter -- confirm).
    """

    # Thread title: text of the <h3> whose class list contains these names.
    _titlePattern = re.compile(
        'core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
    # Body of every post container on a page.
    _contentPattern = re.compile(
        '<div id="post_content_.*?>(.*?)</div>', re.S)

    def __init__(self, baseurl, seeLz):
        self.baseUrl = baseurl
        self.seeLz = '?see_lz=' + str(seeLz)
        self.Tool = Tool()

    def getPage(self, pageNum):
        """Download page ``pageNum`` of the thread.

        Returns:
            The page body as a native ``str``, or ``None`` when the request
            fails with ``URLError`` (a message is printed in that case).
        """
        url = self.baseUrl + self.seeLz + '&pn=' + str(pageNum)
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                body = response.read()
                if not isinstance(body, str):
                    # Python 3 hands back bytes; the str regexes below need
                    # text. On Python 2 the body is already str and is left
                    # untouched.
                    charset = response.headers.get_content_charset() or 'utf-8'
                    body = body.decode(charset, 'replace')
            finally:
                # Always release the connection, even if read() fails.
                response.close()
        except urllib2.URLError as e:
            # ``reason`` may be an exception object rather than a string, so
            # convert explicitly instead of concatenating it directly.
            print("链接网络失败" + str(getattr(e, 'reason', e)))
            return None
        return body

    def getTitle(self):
        """Print and return the thread title taken from page 1.

        Returns:
            The title string, or ``None`` if the page could not be fetched
            or contains no title element.
        """
        html = self.getPage(1)
        if html is None:
            return None
        found = self._titlePattern.search(html)
        if found is None:
            return None
        title = found.group(1)
        print(title)
        return title

    def getContent(self, page):
        """Print every post found in ``page``, numbered from 1.

        Args:
            page: HTML text as returned by ``getPage``. ``None`` or an empty
                string prints nothing.
        """
        if not page:
            return
        for floor, post in enumerate(self._contentPattern.findall(page), 1):
            print(u'%d 楼--------------------------------------------\n' % floor)
            print(self.Tool.replace(post))


class Tool:
    """Reduce the HTML of one post to plain text."""

    # Remove <img> tags and runs of 1-7 spaces.
    # NOTE(review): the last alternative is a lone space, which the 1-7 space
    # alternative already covers; it may originally have been a
    # non-breaking-space entity -- confirm before changing.
    removeImg = re.compile('<img.*?>| {1,7}| ')
    # Remove hyperlink tags (the link text is kept).
    removeAddr = re.compile('<a.*?>|</a>')
    # Tags that end a line; each becomes "\n".
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # Table cell opener; becomes "\t".
    replaceTD = re.compile('<td>')
    # Single or doubled <br>; becomes "\n".
    replaceBR = re.compile('<br><br>|<br>')
    # Any tag still left after the steps above is dropped.
    removeExtraTag = re.compile('<.*?>')
    # Collapse runs of newlines into one.
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        """Return ``x`` with markup removed and surrounding whitespace stripped.

        The substitutions run in a fixed order: line-breaking tags must be
        turned into newlines before the catch-all tag removal, and newline
        runs are collapsed last.
        """
        steps = (
            (self.removeImg, ""),
            (self.removeAddr, ""),
            (self.replaceLine, "\n"),
            (self.replaceTD, "\t"),
            (self.replaceBR, "\n"),
            (self.removeExtraTag, ""),
            (self.removeNoneLine, "\n"),
        )
        for pattern, replacement in steps:
            x = pattern.sub(replacement, x)
        # strip() trims leftover whitespace at both ends
        return x.strip()
# Script entry: print the posts on page 2 of one hard-coded thread.
baseURL = 'http://tieba.baidu.com/p/3138733512'
bdtb = BDTB(baseURL, 2)
# getPage returns None when the request fails; only parse a real page.
page = bdtb.getPage(2)
if page is not None:
    bdtb.getContent(page)