【发布时间】:2012-04-16 12:25:37
【问题描述】:
下面是我的 twitter 爬虫机制的一小段代码:
from BeautifulSoup import BeautifulSoup
import re
import urllib2
url = 'http://mobile.twitter.com/NYTimesKrugman'
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
print tag.renderContents()
print ('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib2.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'stamp' + str(stamp)
print 'tweets' +str (tweets)
while (stamp is False) and (tweets is True):
b = getnewlink(soup)
print b
red = urllib2.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
问题是,在我的推特爬虫点击大约 3 个月的推文后,我希望它停止转到用户的下一页。但是,它似乎没有这样做。它似乎在不断地搜索下一页的推文。我相信这是由于 checkstamp 不断评估为 False 的事实。有没有人对我如何修改代码有任何建议,以便爬虫只要有更多推文(由 are_more_tweets 机制验证)并且还没有达到 3 个月的推文,就可以继续寻找下一页的推文? ??谢谢!
编辑 - 请参见下文:
from BeautifulSoup import BeautifulSoup
import re
import urllib
url = 'http://mobile.twitter.com/cleversallie'
output = open(r'C:\Python28\testrecursion.txt', 'a')
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
print(b)
print('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if not (test_stamp[0]) in '0123456789':
continue
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
while (not stamp) and (tweets):
b = getnewlink(soup)
print b
red = urllib.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
【问题讨论】:
-
正如我在您上一个问题中所说,
print test_stamp之前if声明显示了什么?它是否曾经显示3 months ago或类似的东西?另外,由于我正确回答了您的最后一个问题,请接受该答案。 -
嗨,我很抱歉。我认为上一篇文章有点混乱,所以我想在一个新问题中澄清一下。你最后的回答确实回答了我的问题。但是,它也暴露了我的代码的另一个缺陷,所以我认为这将有助于创建一个新帖子。
-
呃。没有足够的空间发表评论。您是否建议我将 print test_stamp 移到 if 语句之前,看看它是否能找到 3 个月前所说的内容?我假设是这样 - 所以我正在尝试测试它。
-
好的,所以我按照你的建议做了 - 并找到了这个。这是我的程序正在使用的假定 test_stamp 的示例:a2.twimg.com/twitter-mobile/…" />
-
开始一个新问题没有错,这是你应该做的。答案如下。
标签: python html twitter web-crawler