【问题标题】:How to scrape the URL, Title, and Description of Google Search Results如何抓取 Google 搜索结果的 URL、标题和描述
【发布时间】:2023-02-05 22:01:43
【问题描述】:

我正在使用 selenium 首先向 Google 提问,然后抓取前几个结果。我正在尝试将所有 URL、标题和描述添加到一个 Dict 中,然后我可以稍后访问它。不幸的是,我无法让它工作 - 返回“未找到数据”。有谁知道可能是什么问题?

这是我在做什么:

options = Options()
options.add_argument("--headless")

def googleSearch(query):
    # specifing browser web driver
    driver = webdriver.Chrome(options=options, executable_path='chromedriver')
    
    # search query
    search_engine = "https://www.google.com/search?q="
    query = query.replace(" ","+")
    driver.get(search_engine + query + "&start=" + "0")

    # stored data
    # which will be returned by this function
    data = {}
    
    # number of search reasult count of first page
    s_len = 5
    
    for s_block in range(s_len):

        # result block
        content_block_xpath = f'''//*[@id="yuRUbf"]/div[{s_block}]/div/div'''

        # xpaths
        xpath_url = f"""{content_block_xpath}/div[1]/a"""
        xpath_title = f"""{content_block_xpath}/div[1]/a/h3"""
        xpath_description = f"""{content_block_xpath}/div[2]/span/span"""
        
        try:
                        
            # store data collected of each s_block to block {}
            block = {}

            # find url of content
            url = driver.find_element(By.XPATH, xpath_url)
            url = url.get_attribute('href')

            links.append(url.get('href'))
            # find domain name of web having content
            pattern =  r"""(https?:\/\/)?(([a-z0-9-_]+\.)?([a-z0-9-_]+\.[a-z0-9-_]+))"""
            domain = re.search(pattern, url)[0]
            print(links)

            # find title of content
#            title = driver.find_element_by_xpath(xpath_title)
            title = driver.find_element(By.XPATH, xpath_title)
            title = title.get_attribute("innerText")
            
            # find description of content
#            description = driver.find_element_by_xpath(xpath_description)
            description = driver.find_element(By.XPATH, xpath_description)
            description = description.get_attribute("innerText")

            # save all data to block {}
            block["domain"] = domain
            block["url"] = url
            block["title"] = title
            block["description"] = description

            # save block dictionary to main dictionary
            data[f'{s_block}'] = block
        
        except exceptions.NoSuchElementException:
            continue
        
        if len(data) == 0:
            raise Exception("No data found")

    driver.close()
    return data

def getQuery():
    query = str('How to change a car tire')
    link = googleSearch(query)
    print(link)

getQuery()

【问题讨论】:

    标签: python selenium web-scraping


    【解决方案1】:

    我看到两个问题:

    • 关于使用“yuRUbf”的类和 ID 混淆
    • xpath 中的索引从 1 而不是 0 开始
    我也没有得到与您相同的层次结构,但这只是一个调整。 以下为我产生了合理的结果:
    content_block_xpath = f'''(//*[@class="yuRUbf"])[{s_block}]'''
    
    xpath_url = f"""{content_block_xpath}/a"""
    xpath_title = f"""{content_block_xpath}/a/h3"""
    xpath_description = f"""{content_block_xpath}/a//cite/span"""
    

    【讨论】:

      【解决方案2】:

      您只能使用 BeautifulSoup 网络抓取库在没有 Selenium 网络驱动程序的情况下抓取 Google 搜索,因为数据不是通过 JS 处理的,它会加速脚本。

      以下是使用 bs4requests 包从 Google 搜索结果中提取标题、链接和 sn-p(描述)的方法:

      params = {
          "q": "How to change a car tire",    # query example
          "hl": "en",                         # language
          "gl": "uk",                         # country of the search, UK -> United Kingdom
          "start": 0,                         # number page by default up to 0
          #"num": 100                     # parameter defines the maximum number of results to return.
      }
      html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
      soup = BeautifulSoup(html.text, 'lxml')
          
      for result in soup.select(".tF2Cxc"):
          title = result.select_one(".DKV0Md").text
          try:
             snippet = result.select_one(".lEBKkf span").text
          except:
             snippet = None
          links = result.select_one(".yuRUbf a")["href"]
      

      您不仅可以提取第一页,还可以使用无限 while 循环的分页提取所有其余页面。

      在这种情况下,只要存在下一个按钮就可以分页(由页面上是否存在按钮选择器决定,在我们的例子中是 CSS 选择器.d6cvqb a[id=pnnext],您需要将 ["start"] 的值增加到10 访问下一页(这可能称为non-token pagination),如果存在,否则,我们需要退出 while 循环:

      if soup.select_one('.d6cvqb a[id=pnnext]'):
          params["start"] += 10
      else:
          break
      

      检查online IDE中的代码

      from bs4 import BeautifulSoup
      import requests, json, lxml
      
      # https://docs.python-requests.org/en/master/user/quickstart/#passing-parameters-in-urls
      params = {
          "q": "How to change a car tire",    # query example
          "hl": "en",                         # language
          "gl": "uk",                         # country of the search, UK -> United Kingdom
          "start": 0,                         # number page by default up to 0
          #"num": 100                         # parameter defines the maximum number of results to return.
      }
      
      # https://docs.python-requests.org/en/master/user/quickstart/#custom-headers
      headers = {
          "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
      }
      
      page_num = 0
      
      data = []
      
      while True:
          page_num += 1
          print(f"page: {page_num}")
              
          html = requests.get("https://www.google.com/search", params=params, headers=headers, timeout=30)
          soup = BeautifulSoup(html.text, 'lxml')
          
          for result in soup.select(".tF2Cxc"):
              title = result.select_one(".DKV0Md").text
              try:
                 snippet = result.select_one(".lEBKkf span").text
              except:
                 snippet = None
              links = result.select_one(".yuRUbf a")["href"]
            
              data.append({
                "title": title,
                "snippet": snippet,
                "links": links
              })
            
          if soup.select_one(".d6cvqb a[id=pnnext]"):
              params["start"] += 10
          else:
              break
      print(json.dumps(data, indent=2, ensure_ascii=False))
      

      示例输出:

      [
        {
          "title": "How Long Do Tires Last and When Should I Replace Them?",
          "snippet": "As a general rule, we recommend every 5,000-7,000 miles, but it depends on numerous factors, including your car's alignment. You can read more on The Drive's ...",
          "links": "https://www.thedrive.com/cars-101/35041/how-long-do-tires-last"
        },
        {
          "title": "Car Tire Valve Stem Replacement - iFixit Repair Guide",
          "snippet": "Step 1 Car Tire Valve Stem · Locate the stem valve and remove the cap. · Using the Schrader valve core bit in your 1/4" driver, unscrew the valve core from the ...",
          "links": "https://www.ifixit.com/Guide/Car+Tire+Valve+Stem+Replacement/121415"
        },
        other results ...
      ]
      

      您也可以使用来自 SerpApi 的 Google Search Engine Results API。它是带有免费计划的付费 API。 不同之处在于它将绕过来自 Google 的块(包括 CAPTCHA),无需创建解析器和维护它。

      代码示例:

      from serpapi import GoogleSearch
      from urllib.parse import urlsplit, parse_qsl
      import json, os
      
      params = {
        "api_key": "...",                  # serpapi key, https://serpapi.com/manage-api-key
        "engine": "google",                # serpapi parser engine
        "q": "How to change a car tire",   # search query
        "gl": "uk",                        # country of the search, UK -> United Kingdom
        "num": "100"                       # number of results per page (100 per page in this case)
        # other search parameters: https://serpapi.com/search-api#api-parameters
      }
      
      search = GoogleSearch(params)      # where data extraction happens
      
      organic_results_data = []
      page_num = 0
      
      while True:
          results = search.get_dict()    # JSON -> Python dictionary
          
          page_num += 1
          
          for result in results["organic_results"]:
              organic_results_data.append({
                  "title": result.get("title"),
                  "snippet": result.get("snippet"),
                  "link": result.get("link")
              })
          
          if "next_link" in results.get("serpapi_pagination", []):
              search.params_dict.update(dict(parse_qsl(urlsplit(results.get("serpapi_pagination").get("next_link")).query)))
          else:
              break
          
      print(json.dumps(organic_results_data, indent=2, ensure_ascii=False)) 
      

      输出:

      [
         {
          "title": "Today: can you safely change a tire with passengers on board?",
          "snippet": "RAY: In any case, the primary danger during a tire change is that the vehicle will slip off the jack and injure the tire changer.",
          "link": "https://www.cartalk.com/content/today-can-you-safely-change-tire-passengers-board"
        },
        {
          "title": "How to Change a Flat Tire - Mercedes-Benz Burlington",
          "snippet": "How to Switch a Tire in 5 Steps · Secure the wheel wedges against the tires on the opposite side of the flat tire. · Remove the hubcap or wheel ...",
          "link": "https://www.mercedes-benz-burlington.ca/how-to-change-a-flat-tire/"
        },
        other results...
      ]
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2020-06-23
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2020-04-14
        • 1970-01-01
        相关资源
        最近更新 更多