【Question Title】: Python image scraping
【Posted】: 2020-04-24 11:07:21
【Question Description】:

As part of my thesis, I am trying to build a large training-data corpus for deep learning.

My code runs on Conda 3.7 and works fine until I try to scrape more than 80 images. In my case, I need several hundred photos.

Python code

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import time
import io
import os
from PIL import Image
import hashlib

def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # Build the Google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # Load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # Get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # Try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # Extract image URLs
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more...")
            time.sleep(1)
            return
            load_more_button = wd.find_element_by_css_selector(".mye4qd")
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # Move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

def persist_image(folder_path:str, url:str):
    try:
        image_content = requests.get(url).content

    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

# As soon as the number of images is over 80, an error is shown
def search_and_download(search_term:str, target_path='./images', number_images=170):
    target_folder = os.path.join(target_path, '_'.join(search_term.lower().split(' ')))

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    with webdriver.Chrome() as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    for elem in res:
        persist_image(target_folder, elem)

# Change here to modify the search query

search_term = 'Hecht'

search_and_download(
    search_term = search_term,
)

And the error from the log

Found: 93 image links, looking for more...
Traceback (most recent call last):
  File "C:\Users\User\Desktop\Scraping\image-gathering-selenium\scrapy2.py", line 103, in <module>
    search_term = search_term,
  File "C:\Users\User\Desktop\Scraping\image-gathering-selenium\scrapy2.py", line 94, in search_and_download
    for elem in res:
TypeError: 'NoneType' object is not iterable

【Question Comments】:

  • Conda is a package manager. Do you mean Anaconda (a Python distribution)? Please respond by editing your question (as appropriate), not in comments (without "Edit:", "Update:", or similar - the question should read as if it had been written today).

Tags: python python-3.x selenium


【Solution 1】:

You are trying to iterate over res, but it is None. That is why the error is thrown.

Add an if condition:

if res:
    for elem in res:
       persist_image(target_folder, elem)

Or fall back to an empty list on that line:

res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5) or []
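To see why the fallback works: in Python, `None or []` evaluates to `[]`, so the loop simply iterates zero times instead of raising. A minimal sketch, using a hypothetical stub in place of fetch_image_urls:

```python
def fetch_image_urls_stub(found_any):
    # Hypothetical stand-in for fetch_image_urls: like the code in the
    # question, it returns None when it bails out early.
    return {"http://example.com/a.jpg"} if found_any else None

# Without the guard, `for elem in None` raises TypeError.
res = fetch_image_urls_stub(False) or []   # None or [] -> []
downloaded = [elem for elem in res]        # safely iterates zero times

res = fetch_image_urls_stub(True) or []
downloaded = [elem for elem in res]        # one URL this time
```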

【Discussion】:

  • Thank you! The error is gone, but somehow it still only finds a maximum of 80 images.
【Solution 2】:

Try removing the return:

    else:
        print("Found:", len(image_urls), "image links, looking for more ...")
        time.sleep(1)
        # return
        load_more_button = wd.find_element_by_css_selector(".mye4qd")
        if load_more_button:
            wd.execute_script("document.querySelector('.mye4qd').click();")
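Why this helps: the else clause of a Python for loop runs only when the loop finishes without hitting break, and in the question's code that branch executed a bare return before the load-more click, so those lines were dead code and the function returned None. A minimal sketch of the control flow (names are illustrative, not from the original):

```python
def collect(urls, target):
    collected = set()
    for u in urls:
        collected.add(u)
        if len(collected) >= target:
            break                  # enough links: the else clause is skipped
    else:
        # Reached only when the loop exhausts urls without break,
        # i.e. "looking for more".
        return collected           # a bare return here would make any code
        # placed below it (such as clicking the load-more button)
        # unreachable - exactly the bug in the question.
    return collected
```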

【Discussion】:

  • Please explain.