【问题标题】:google api search results(google api 搜索结果)
【发布时间】:2014-04-04 20:33:58
【问题描述】:

您好,我正在尝试使用谷歌应用引擎(Google App Engine)来创建一个学术项目。我想通过 google api 实现的主要功能是搜索用户输入的查询并返回结果,然后我可以对结果进行进一步的分析。xgoogle 是我的第一次尝试,但它在 Google App Engine 上出现导入错误(同样的代码我可以在本地运行)。即使 xgoogle 能正常工作,我也不知道如何获取不同网站的内容。我想知道是否有任何方法可以将谷歌搜索结果作为文档返回。谢谢。

代码部分使用正则表达式来查找 html 代码中的内容,但它不适用于 html 布局不同的网站。谢谢。

#!/usr/bin/env python
from html import HTML_PAGE
import webapp2
import jinja2
import os
import re
import sys

from os import walk
from google.appengine.api import search
from google.appengine.ext import ndb
from urllib import urlopen
from cgi import parse_qs

from xgoogle.search import GoogleSearch, SearchError


# Module-level HTML_PAGE instance (class imported from `html` above). The
# request handlers in this file write the return value of its pageChange()
# method to the HTTP response.
page = HTML_PAGE()

class MainPage(webapp2.RequestHandler):
    """Request handler whose GET response is the default page markup."""

    def get(self):
        """Write ``page.pageChange()`` (called without arguments) to the response."""
        markup = page.pageChange()
        self.response.out.write(markup)




class SearchFile():
    """Substring search over the files directly inside ``dataFolder``.

    The constructor records the path and bare name of every file found at the
    top level of ``dataFolder`` (relative to the current working directory);
    ``outPutData`` then scans those files for the search text.
    """

    def __init__(self,userInput=''):
        """Store the search text and index the top-level files of ``dataFolder``.

        userInput -- text to look for; the default '' is a substring of every
                     line, so it matches the first line of each non-empty file.
        """
        self.__input = userInput
        self.__result = {}
        self.__files = []
        self.__filenames = []
        for dirpath, _dirnames, filenames in walk("dataFolder"):
            for name in filenames:
                self.__files.append(os.path.join(dirpath, name))
                self.__filenames.append(name)
            # Only the first tuple yielded by walk() is used, so
            # sub-directories are deliberately not searched.
            break

    def outPutData(self):
        """Return a dict mapping file name -> first line containing the search text.

        Files with no matching line are absent from the dict. The same dict
        object is stored on the instance and returned on every call.
        """
        # __filenames and __files are filled in lockstep by __init__, so
        # zip() pairs each name with its path without manual indexing.
        for name, path in zip(self.__filenames, self.__files):
            # The with-statement closes the file; no explicit close() needed.
            with open(path) as f:
                for line in f:
                    if self.__input in line:
                        self.__result[name] = line
                        break
        return self.__result


class SearchFileHandle(webapp2.RequestHandler):
    """Request handler that runs a submitted query against three sources.

    For each POST it writes, in order: the page produced by
    ``page.pageChange(userInput)``, matches from ``SearchFile``, matching
    links from ``TakeNews`` and the results returned by ``WebSearch``.
    """

    def post(self):
        """Read the ``input`` request field and write the search results as HTML."""
        # cgi.escape is the Python 2 standard-library HTML escaper. It is
        # imported here because the name ``html`` in this file refers to the
        # module that provides HTML_PAGE.
        from cgi import escape

        userInput = str(self.request.get('input'))
        self.response.out.write(page.pageChange(userInput))

        # Named fileSearch rather than ``search`` so the imported
        # google.appengine.api ``search`` module is not shadowed.
        fileSearch = SearchFile(userInput)
        tramText = CropText()
        keywords = userInput.split(" ")

        for key, value in fileSearch.outPutData().iteritems():
            # File names and file contents are escaped before being embedded
            # in markup so they cannot inject tags or break the attributes.
            safeKey = escape(key, True)
            keyBold = "<b>%s</b><br>" % (safeKey)
            keyLink = "<a href = \"dataFolder/%s\" name =\"%s\"> %s </a>" % (safeKey, safeKey, keyBold)
            self.response.out.write(keyLink)
            resultContain = tramText.tram(value, userInput)
            for word in resultContain.split(" "):
                # Exact keyword match decides bold vs. plain. The earlier
                # substring test dropped words that were merely contained in
                # the query and repeated a word once per duplicate keyword.
                if word in keywords:
                    self.response.out.write(" <b>%s</b> " % (escape(word)))
                else:
                    self.response.out.write(" %s " % (escape(word)))
            self.response.out.write("<br><br><br>")

        news = TakeNews()
        # NOTE(review): titles and URLs below come straight from the remote
        # page and are written unescaped, as before -- confirm whether the
        # scraped titles already contain HTML entities before escaping them.
        for key, value in news.websiteRead(userInput).iteritems():
            keyBold = "<b>%s</b><br>" % (key)
            keyLink = "<a href = %s> %s </a>" % (value, keyBold)
            self.response.out.write(keyLink)
            self.response.out.write("<br><br><br>")

        googleSearch = WebSearch(userInput)
        results = googleSearch.returnResult()
        for res in results:
            self.response.out.write(res)
            self.response.out.write("<br><br><br>")


class CropText():
    """Cuts a snippet containing a search term out of a longer text."""

    def tram(self,text,word):
        """Return the first snippet of *text* around *word*, or '' if none.

        The snippet must consist of: a space, any text (as little as
        possible), a space, *word* taken literally, a space, and then any
        text up to and including the next period. Matching ignores case.
        """
        pattern = r"( .*? )" + re.escape(word) + r"( .*?\.)"
        found = re.search(pattern, text, re.IGNORECASE)
        return found.group() if found else ''


class TakeNews():
    """Collects headline links from a web page (default: http://www.bloomberg.com)."""

    def __init__(self):
        # Base URL that websiteRead() downloads and prefixes to found links.
        self.__website = 'http://www.bloomberg.com'
        # Results of the most recent websiteRead() call.
        self.__topNews = ''
        self.__topNewsTitle = ''

    def setWebsite(self,website):
        """Set the site to scrape, normalising it to a full URL.

        website -- a full ``http://`` / ``https://`` URL (kept as is), or a
                   host written as ``www.example.com``, ``.example.com`` or
                   ``example.com``; each of those becomes
                   ``http://www.example.com``.
        """
        if website.startswith(('http://', 'https://')):
            # Already a complete URL; prefixing it would corrupt it.
            pass
        elif website.startswith('www.'):
            website = 'http://' + website
        elif website.startswith('.'):
            # Keeps the one input form the previous implementation handled
            # correctly ('.example.com').
            website = 'http://www' + website
        else:
            website = 'http://www.' + website
        self.__website = website

    def websiteRead(self,userInput):
        """Download the configured page and return matching headlines.

        Returns a dict mapping headline title -> ``<website>/<href>`` for
        every headline anchor whose href contains *userInput*.
        """
        response = urlopen(self.__website)
        try:
            webpage = response.read()
        finally:
            # Release the connection even when read() fails.
            response.close()
        pathFinderTopNewsTitle = re.compile('<a class=\"icon-article-headline\".*<span class=\'headline\'>(.*)</span>')
        pathFinderTopNews = re.compile('<a class=\"icon-article-headline\" data-id=.* data-type=.* href=\"(.*)\"><span class=\'headline\'>')
        self.__topNewsTitle = pathFinderTopNewsTitle.findall(webpage)
        self.__topNews = pathFinderTopNews.findall(webpage)
        result = {}
        # The two patterns are matched independently and may yield different
        # counts; zip() stops at the shorter list instead of raising
        # IndexError.
        for title, link in zip(self.__topNewsTitle, self.__topNews):
            if userInput in link:
                result[title] = self.__website + "/" + link
        return result

class WebSearch():
    """Runs a stored query through ``GoogleSearch``."""

    def __init__(self,word):
        # Query text handed to GoogleSearch by returnResult().
        self.__search = word

    def returnResult(self):
        """Return ``get_results()`` of a ``GoogleSearch`` built from the stored query.

        ``results_per_page`` is set to 200 on the search object before the
        results are fetched.
        """
        engine = GoogleSearch(self.__search)
        engine.results_per_page = 200
        results = engine.get_results()
        return results


def main():
    """Call ``run()`` on the module-level ``app`` object."""
    app.run()


# WSGI application object: '/' is routed to MainPage and '/searchFile' to
# SearchFileHandle; created with debug enabled.
app = webapp2.WSGIApplication([('/',MainPage),
                            ('/searchFile',SearchFileHandle)
                            ],
                            debug =True)

# Call main() only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

【问题讨论】:

  • 请发布错误的完整回溯。你不能在应用引擎上使用os.walk
  • @aschmid00 实际上这部分代码正在工作。 os.walk 用于本地文档搜索。错误消息是: from xgoogle.search import GoogleSearch, SearchError ImportError: No module named xgoogle.search
  • 您的代码中没有提及xgoogle。请使用错误的完整回溯更新问题。
  • @aschmid00 已更新。谢谢
  • @aschmid00 我发现我没有将模块添加到app的文件夹中

标签: python google-app-engine google-api


【解决方案1】:

将bs4模块添加到app文件夹中

【讨论】: