【问题标题】:Scraped images are corrupt(抓取的图像已损坏)
【发布时间】:2017-04-21 03:40:36
【问题描述】:

您好,我尝试使用以下代码抓取 digg.com 上的首页图片。问题是 0.jpg 到 6.jpg 是正常的。从 7.jpg 开始到 47.jpg 已损坏。不知道为什么。

这里是代码。 Github在这里:https://github.com/kenpeter/py_mm

# os
import os
# http request
import requests
#
import pprint

import time

# import html from lxml
from lxml import html

# Module-level state.
# global_page_num starts at 0; no code visible in this script reads it.
global_page_num = 0
# Pretty-printer with 4-space indent; only the commented-out debug lines use it.
pp = pprint.PrettyPrinter(indent=4)

# write to file
def download_image(img_urls, timeout=10):
    """Download each URL in ``img_urls`` and save it as ``img/<index>.jpg``.

    Args:
        img_urls: Sequence of image URLs. The position of a URL in the
            sequence becomes its file name (``img/0.jpg``, ``img/1.jpg``, ...).
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: If a request fails, times out, or the
            server answers with an HTTP error status.
    """
    # total img urls
    amount = len(img_urls)
    if not amount:
        # Nothing to download, so do not create the output directory either.
        return

    # Create the target directory once instead of on every iteration.
    os.makedirs('img', exist_ok=True)

    for index, value in enumerate(img_urls):
        filename = 'img/%s.jpg' % index

        print('--- start ---')
        print('filename: %s' % filename)
        print('Downloading: %s out of %s' % (index, amount))

        # Fetch before opening the file so a failed request does not leave
        # an empty or truncated file behind.
        response = requests.get(value, timeout=timeout)
        # Refuse to store an HTTP error page under a .jpg name.
        response.raise_for_status()

        with open(filename, 'wb') as f:
            f.write(response.content)


def get_page_number(num):
    """Fetch the digg.com front page, download its thumbnails, return their URLs.

    ``num`` is accepted but not used; only the front page is requested.
    The image URLs are read from the ``src`` attribute of each thumbnail
    ``<img>`` element.
    """
    page_bytes = requests.get('http://digg.com').content
    tree = html.fromstring(page_bytes)

    # src attribute of every thumbnail image, in document order
    thumb_srcs = tree.xpath("//div[@class='digg-story__image--thumb']/a/img/@src")

    # story descriptions; collected but not used any further
    descriptions = tree.xpath("//div[@itemprop='description']/text()")

    download_image(thumb_srcs)
    return thumb_srcs


if __name__ == '__main__':
    # The page number is hard-coded for now; reading it from user input
    # is left commented out in the original script.
    page_number = 4
    get_page_number(num=page_number)

【问题讨论】:

    标签: python python-3.x web-scraping digg


    【解决方案1】:

    图像“损坏”的原因是页面内的标记方式发生了变化:从某一张图片开始,真实的图像地址被“隐藏”在 data-src 属性中,而不是您的代码所抓取的 src 属性中(此时 src 里只是一张占位图)。请参阅下面同时具有这两个属性的页面源代码示例:

    <img
    class="digg-story__image-img js--digg-story__image-img lazy-image-img need-offset"
    data-src="http://static.digg.com/images/f0b92c2d8a2c4b7f829abbc0e58a408c_2oijd0Z_1_www_large_thumb.jpeg"
    src="http://static.digg.com/static/fe/944294/images/x_455x248.png"
    width="312"
    height="170"
    alt=""
    />
    

    换句话说,在创建图像 URL 列表时,您必须检查 src 和 data-src 两个属性,并使 data-src 优先于 src。

    此代码执行“技巧”并下载正确的图像:

    # os
    import os
    # http request
    import requests
    #
    import pprint
    
    import time
    
    # import html from lxml
    from lxml import html
    
    # Module-level state.
    # global_page_num starts at 0; no code visible in this script reads it.
    global_page_num = 0
    # Pretty-printer with 4-space indent; only the commented-out debug lines use it.
    pp = pprint.PrettyPrinter(indent=4)
    
    # write to file
    def download_image(img_urls, timeout=10):
        """Download each URL in ``img_urls`` and save it as ``img/<index>.jpg``.

        Args:
            img_urls: Sequence of image URLs. The position of a URL in the
                sequence becomes its file name (``img/0.jpg``, ``img/1.jpg``, ...).
            timeout: Seconds to wait for each HTTP request before giving up.

        Raises:
            requests.RequestException: If a request fails, times out, or the
                server answers with an HTTP error status.
        """
        # total img urls
        amount = len(img_urls)
        if not amount:
            # Nothing to download, so do not create the output directory either.
            return

        # Create the target directory once instead of on every iteration.
        os.makedirs('img', exist_ok=True)

        for index, value in enumerate(img_urls):
            filename = 'img/%s.jpg' % index

            print('--- start ---')
            print('filename: %s' % filename)
            print('Downloading: %s out of %s' % (index, amount))

            # Fetch before opening the file so a failed request does not leave
            # an empty or truncated file behind.
            response = requests.get(value, timeout=timeout)
            # Refuse to store an HTTP error page under a .jpg name.
            response.raise_for_status()

            with open(filename, 'wb') as f:
                f.write(response.content)
    
    
    def get_page_number(num):
        """Scrape the digg.com front page, download its thumbnails, return their URLs.

        The page lazy-loads some thumbnails: for those, ``src`` holds a
        placeholder image and the real address is in ``data-src``. Each
        ``<img>`` is therefore resolved individually, preferring ``data-src``
        and falling back to ``src``. This keeps the URLs in page order and
        yields exactly one URL per image.

        Args:
            num: Page number. Accepted for compatibility but not used, since
                only the front page is fetched.

        Returns:
            List of image URLs in the order the images appear on the page.

        Raises:
            requests.RequestException: If fetching the page or any image
                fails, times out, or returns an HTTP error status.
        """
        url = 'http://digg.com'
        response = requests.get(url, timeout=10)
        # Fail loudly instead of parsing an HTTP error page.
        response.raise_for_status()
        selector = html.fromstring(response.content)

        images = selector.xpath("//div[@class='digg-story__image--thumb']/a/img")

        img_urls = []
        for img in images:
            # data-src takes precedence; src is only used when data-src is absent.
            img_url = img.get('data-src') or img.get('src')
            if img_url:
                img_urls.append(img_url)

        download_image(img_urls)

        return img_urls
    
    
    if __name__ == '__main__':
        # The page number is hard-coded for now; reading it from user input
        # is left commented out in the original script.
        page_number = 4
        get_page_number(num=page_number)
    

    【讨论】:

      猜你喜欢
      • 2017-06-23
      • 1970-01-01
      • 1970-01-01
      • 2018-09-26
      • 2021-03-15
      • 2015-11-14
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多