【问题标题】:Extract specific pages and make new pdf from list of pdfs(提取特定页面并从 pdf 列表中制作新的 pdf)
【发布时间】:2021-11-21 14:38:59
【问题描述】:

我一直在尝试从每个 pdf 中提取特定页面,然后将所有提取出的页面合并成一个 pdf。

我有 pdf 列表

我正在使用 pdfrw 这个库,但在提取页面时出错

from pdfrw import PdfReader, PdfWriter
import os
# Collect the names of regular files in the current directory whose name ends
# with '.pdf' (case-sensitive). Selection is by name only; the file contents
# are not inspected here.
# NOTE(review): the reported "Invalid PDF header: '<!doctype html>'" suggests
# at least one matched file is actually an HTML document - confirm.
files = [f for f in os.listdir(
    '.') if os.path.isfile(f) and f.endswith('.pdf')]

print(files)


for pdf in files:
    # PdfReader parses the file immediately, so a non-PDF file fails here.
    pages = PdfReader(pdf).pages
    # Each part is a (start, stop) pair that is unpacked into range() below,
    # so `stop` is exclusive: (6, 7) selects page number 6 only.
    parts = [(6, 7)]
    for part in parts:
        # str.title() title-cases the file name, so the output name can differ
        # in letter case from the input; split('.')[0] keeps only the text
        # before the first dot.
        title = pdf.title().split('.')[0]
        outdata = PdfWriter(f'{title}_{part[0]}_.pdf')
        for pagenum in range(*part):
            # Page numbers are treated as 1-based, hence the -1 for indexing.
            outdata.addpage(pages[pagenum-1])
        # One output file is written per (pdf, part) combination.
        outdata.write()

如果可能请帮忙

raise PdfParseError('Invalid PDF header: %s' %
pdfrw.errors.PdfParseError: Invalid PDF header: '<!doctype html>'

【问题讨论】:

    标签: python python-3.x pdf automation


    【解决方案1】:

    玛纳斯,

    实现您的要求的一种方法是使用 API。例如,考虑以下代码片段,它会对上传的 PDF 文件进行拆分。

    import os
    import requests # pip install requests
    
    # The authentication key (API Key).
    # Get your own by registering at https://app.pdf.co
    API_KEY = "*********************************"
    
    # Base URL for PDF.co Web API requests
    BASE_URL = "https://api.pdf.co/v1"
    
    # Source PDF file
    SourceFile = ".\\sample.pdf"
    # Comma-separated list of page numbers (or ranges) to process. Example: '1,3-5,7-'.
    Pages = "1-2,3-"
    
    
    def main(args=None):
        """Upload the source PDF and, if the upload succeeded, split it.

        `args` is accepted but not used.
        """
        uploadedFileUrl = uploadFile(SourceFile)
        # uploadFile signals failure by returning None; compare by identity.
        if uploadedFileUrl is not None:
            splitPDF(uploadedFileUrl)
    
    
    def splitPDF(uploadedFileUrl):
        """Split an uploaded PDF using the PDF.co Web API and download the parts.

        Sends the module-level `Pages` selection together with
        `uploadedFileUrl` to the 'Split PDF' endpoint, then saves every
        returned result file in the current directory as Page1.pdf,
        Page2.pdf, ... Failures are reported with print(); nothing is
        returned.
        """
        # Request parameters (passed via `data=`).
        # See documentation: https://apidocs.pdf.co
        parameters = {"pages": Pages, "url": uploadedFileUrl}

        # Prepare URL for 'Split PDF' API request
        url = "{}/pdf/split".format(BASE_URL)

        # Execute request and get response as JSON
        response = requests.post(url, data=parameters, headers={"x-api-key": API_KEY})
        if response.status_code != 200:
            print(f"Request error: {response.status_code} {response.reason}")
            return

        result = response.json()
        if result["error"]:
            # Show service reported error
            print(result["message"])
            return

        # Download the generated PDF parts; local files are numbered from 1.
        for part, resultFileUrl in enumerate(result["urls"], start=1):
            localFileUrl = f"Page{part}.pdf"

            # Using the response as a context manager releases the streamed
            # connection once the download is finished or has failed.
            with requests.get(resultFileUrl, stream=True) as r:
                if r.status_code == 200:
                    with open(localFileUrl, 'wb') as file:
                        for chunk in r.iter_content(chunk_size=8192):
                            file.write(chunk)
                    print(f"Result file saved as \"{localFileUrl}\" file.")
                else:
                    # Report the failed download itself, not the split request.
                    print(f"Request error: {r.status_code} {r.reason}")
    
    
    def uploadFile(fileName):
        """Upload a local file to the cloud via the PDF.co Web API.

        First requests a presigned upload URL, then PUTs the file content
        to it.

        Returns the URL under which the uploaded file can be referenced in
        later API calls, or None if any step failed (the reason is printed).
        """
        # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.

        # The query string is passed via `params` so that file names
        # containing spaces, '&', '#', etc. are URL-encoded instead of
        # corrupting the request.
        url = "{}/file/upload/get-presigned-url".format(BASE_URL)
        query = {
            "contenttype": "application/octet-stream",
            "name": os.path.basename(fileName),
        }

        # Execute request and get response as JSON
        response = requests.get(url, params=query, headers={"x-api-key": API_KEY})
        if response.status_code != 200:
            print(f"Request error: {response.status_code} {response.reason}")
            return None

        result = response.json()
        if result["error"]:
            # Show service reported error
            print(result["message"])
            return None

        # URL to use for file upload
        uploadUrl = result["presignedUrl"]
        # URL for future reference
        uploadedFileUrl = result["url"]

        # 2. UPLOAD FILE TO CLOUD.
        with open(fileName, 'rb') as file:
            uploadResponse = requests.put(
                uploadUrl,
                data=file,
                headers={"x-api-key": API_KEY, "content-type": "application/octet-stream"},
            )

        # A failed PUT must not be reported as a successful upload.
        if not uploadResponse.ok:
            print(f"Request error: {uploadResponse.status_code} {uploadResponse.reason}")
            return None

        return uploadedFileUrl
    
    
    if __name__ == '__main__':
        main()
    
    

    现在,要合并 PDF 文件,您可以使用类似于以下的代码片段。

    import os
    import requests # pip install requests
    
    # The authentication key (API Key).
    # Get your own by registering at https://app.pdf.co
    API_KEY = "**********************************"
    
    # Base URL for PDF.co Web API requests
    BASE_URL = "https://api.pdf.co/v1"
    
    # Source PDF files
    SourceFile_1 = ".\\sample1.pdf"
    SourceFile_2 = ".\\sample2.pdf"
    
    # Destination PDF file name
    DestinationFile = ".\\result.pdf"
    
    def main(args=None):
        """Upload both source PDFs and merge them if both uploads succeeded.

        `args` is accepted but not used.
        """
        UploadedFileUrl_1 = uploadFile(SourceFile_1)
        UploadedFileUrl_2 = uploadFile(SourceFile_2)

        # uploadFile signals failure by returning None; compare by identity.
        if UploadedFileUrl_1 is not None and UploadedFileUrl_2 is not None:
            # Both source URLs are handed over as one comma-separated string.
            uploadedFileUrls = "{},{}".format(UploadedFileUrl_1, UploadedFileUrl_2)
            mergeFiles(uploadedFileUrls, DestinationFile)
    
    def mergeFiles(uploadedFileUrls, destinationFile):
        """Merge uploaded PDFs using the PDF.co Web API and download the result.

        `uploadedFileUrls` is the comma-separated string of source file URLs
        sent as the request's "url" parameter. The merged document is saved
        to `destinationFile`; its base name is also sent as the "name"
        parameter. Failures are reported with print(); nothing is returned.
        """
        # Request parameters (passed via `data=`).
        # See documentation: https://apidocs.pdf.co
        parameters = {
            "name": os.path.basename(destinationFile),
            "url": uploadedFileUrls,
        }

        # Prepare URL for 'Merge PDF' API request
        url = "{}/pdf/merge".format(BASE_URL)

        # Execute request and get response as JSON
        response = requests.post(url, data=parameters, headers={"x-api-key": API_KEY})
        if response.status_code != 200:
            print(f"Request error: {response.status_code} {response.reason}")
            return

        result = response.json()
        if result["error"]:
            # Show service reported error
            print(result["message"])
            return

        # Get URL of result file
        resultFileUrl = result["url"]

        # Download result file. Using the response as a context manager
        # releases the streamed connection once the download is finished or
        # has failed.
        with requests.get(resultFileUrl, stream=True) as r:
            if r.status_code == 200:
                with open(destinationFile, 'wb') as file:
                    for chunk in r.iter_content(chunk_size=8192):
                        file.write(chunk)
                print(f"Result file saved as \"{destinationFile}\" file.")
            else:
                # Report the failed download itself, not the merge request.
                print(f"Request error: {r.status_code} {r.reason}")
    
    
    def uploadFile(fileName):
        """Upload a local file to the cloud via the PDF.co Web API.

        First requests a presigned upload URL, then PUTs the file content
        to it.

        Returns the URL under which the uploaded file can be referenced in
        later API calls, or None if any step failed (the reason is printed).
        """
        # 1. RETRIEVE PRESIGNED URL TO UPLOAD FILE.

        # The query string is passed via `params` so that file names
        # containing spaces, '&', '#', etc. are URL-encoded instead of
        # corrupting the request.
        url = "{}/file/upload/get-presigned-url".format(BASE_URL)
        query = {
            "contenttype": "application/octet-stream",
            "name": os.path.basename(fileName),
        }

        # Execute request and get response as JSON
        response = requests.get(url, params=query, headers={"x-api-key": API_KEY})
        if response.status_code != 200:
            print(f"Request error: {response.status_code} {response.reason}")
            return None

        result = response.json()
        if result["error"]:
            # Show service reported error
            print(result["message"])
            return None

        # URL to use for file upload
        uploadUrl = result["presignedUrl"]
        # URL for future reference
        uploadedFileUrl = result["url"]

        # 2. UPLOAD FILE TO CLOUD.
        with open(fileName, 'rb') as file:
            uploadResponse = requests.put(
                uploadUrl,
                data=file,
                headers={"x-api-key": API_KEY, "content-type": "application/octet-stream"},
            )

        # A failed PUT must not be reported as a successful upload.
        if not uploadResponse.ok:
            print(f"Request error: {uploadResponse.status_code} {uploadResponse.reason}")
            return None

        return uploadedFileUrl
    
    
    if __name__ == '__main__':
        main()
    
    

    在这个示例中,我使用的是 pdf.co API。有关详细信息,请参阅以下链接。

    https://apidocs.pdf.co/30-pdf-split, https://apidocs.pdf.co/31-pdf-merge

    谢谢!

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2019-01-05
      • 2019-11-02
      • 2020-09-29
      • 2022-10-18
      • 2020-10-05
      • 1970-01-01
      • 2023-02-26
      • 2017-10-21
      相关资源
      最近更新 更多