alexkh
import urllib2
from BeautifulSoup import BeautifulSoup
import random
import time

def checkIndex(url):
    """Report whether Baidu's first result entry for *url* mentions that URL.

    The URL is normalised (surrounding whitespace and one leading
    ``http://`` or ``https://`` scheme removed), sent to Baidu as the search
    term, and the first ``<span class="g">`` element of the returned page is
    searched for the normalised URL.  That span is presumably where Baidu
    prints a result's address -- verify against the current page markup.

    Args:
        url: Address to look up; may be a raw line from a file, including
            its trailing newline.

    Returns:
        The normalised URL if it occurs in any child of the first
        ``span.g`` element, otherwise ``None``.  Blank input (or input that
        consists of a scheme only) yields ``None`` without any request.

    Raises:
        urllib2.URLError: If the request to Baidu fails.
    """
    # Local import: only this function needs the query-string quoting helper.
    import urllib

    # Lines read from a file still carry '\n'; left in place it would make
    # the substring test below fail for every URL.
    url = url.strip()
    for scheme in ('http://', 'https://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break

    # An empty needle is "in" every string, so it would be a false positive.
    if not url:
        return None

    # Encode the search term so characters such as '&', '#' or '?' inside the
    # URL cannot be mistaken for part of Baidu's own query string.
    baiduUrl = 'http://www.baidu.com/s?wd=' + urllib.quote(url, safe='')

    webPage = urllib2.urlopen(baiduUrl)
    try:
        webCont = webPage.read()
    finally:
        webPage.close()

    # Drop the <b> highlight tags so a highlighted match is one contiguous
    # piece of text again.
    webCont = webCont.replace('<b>', '').replace('</b>', '')
    soup = BeautifulSoup(webCont)

    # find() yields only the first matching span (or None); iterating a tag
    # walks its direct children.
    firstHit = soup.find('span', {'class': 'g'})
    if firstHit is None:
        return None

    # Examine every child before giving up; the previous version returned
    # after looking at the first child only.
    for child in firstHit:
        if url in unicode(child):
            return url
    return None

# Batch driver: look up every URL listed in list.txt with checkIndex() and
# write one result line per input line to check.txt -- the URL returned by
# checkIndex(), or the text "None" when it returned None.

# Pause (in seconds) applied after each lookup; redrawn after every request
# so the delay between lookups is not a fixed interval.
waittime = random.randint(1, 20)

# 'with' guarantees both files are closed even if a lookup raises.
with open('list.txt') as urllist:
    with open('check.txt', 'w') as res:
        # Iterate the file lazily instead of loading it all via readlines().
        for eachurl in urllist:
            # Strip the line terminator so the lookup sees the bare URL.
            indexurl = unicode(checkIndex(eachurl.strip())) + '\n'
            res.write(indexurl)
            time.sleep(waittime)
            waittime = random.randint(1, 20)

# Parenthesised form prints the same text on Python 2 and Python 3.
print('over!')

 

分类:

技术点:

相关文章: