检查文件是否已下载，如果已下载则跳过？答案

【问题标题】：Check list if file has downloaded and skip if it has?检查文件是否已下载，如果已下载则跳过？
【发布时间】：2020-06-14 15:44:24
【问题描述】：

我是 Python 新手，确信可以优化以下内容，但是我在脚本的最后一步遇到了问题。

目标不是下载之前已下载的文件。此时我将下载记录在一个名为 download_history.log 的文件中

因此，我需要在此处实施检查以执行以下检查日志 - 如果它存在于日志中，则不执行任何操作，如果不存在则移动到下一个文件下载文件并将其登录到文件中。

任何帮助将不胜感激。

#!/usr/bin/env python3

import boto
import sys, os
import zipfile
import shutil
import glob
import re
from boto.s3.key import Key
from boto.exception import S3ResponseError


#Make the download files
DOWNLOAD_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
if not os.path.exists(DOWNLOAD_LOCATION_PATH):
    print ("Making download directory")
    os.mkdir(DOWNLOAD_LOCATION_PATH)

#Delete Output Folder if it exsists
OUTPUT_FOLDER = os.path.expanduser("~") + "/AWSSplunk/Output/"
shutil.rmtree(OUTPUT_FOLDER)

#Define the AWS Bucket
def backup_s3_folder():
    BUCKET_NAME = "my-bucket-name"
    AWS_ACCESS_KEY_ID= os.getenv("##################")
    AWS_ACCESS_SECRET_KEY = os.getenv("#########################")
    conn  = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_ACCESS_SECRET_KEY)
    bucket = conn.get_bucket(BUCKET_NAME)

    #goto through the list of files
    bucket_list = bucket.list()    

    for l in bucket_list:
        key_string = str(l.key)
        s3_path = DOWNLOAD_LOCATION_PATH + key_string
        try:

            # Add files to the log file
            print ("Downloading file ", key_string)
            file_object = open('download_history.log', 'a')
            file_object.write(key_string)
            file_object.write("\n")

            # Working code
            file_object.close()
            l.get_contents_to_filename(s3_path)
        except (OSError,S3ResponseError) as e:
            pass
            # check if the file has been downloaded locally  
            if not os.path.exists(s3_path):
                try:
                    os.makedirs(s3_path)
                except OSError as exc:
                    # let guard againts race conditions
                    import errno
                    if exc.errno != errno.EEXIST:
                        raise

if __name__ == '__main__':
    backup_s3_folder()

# Start the unzipping process

print("Unzipping Starting")
dir_path = os.path.expanduser("~") + "/AWSSplunk/Downloads/"
for path, dir_list, file_list in os.walk(dir_path):
    for file_name in file_list:
        if file_name.endswith(".zip"):
            abs_file_path = os.path.join(path, file_name)

            parent_path = os.path.split(abs_file_path)[0]
            output_folder_name = os.path.splitext(abs_file_path)[0]
            output_path = os.path.join(parent_path, output_folder_name)

            zip_obj = zipfile.ZipFile(abs_file_path, 'r')
            zip_obj.extractall(output_path)
            zip_obj.close()
print("Unzipping Completed")

# Start moving files to output
print("Moving Files")

FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"

if not os.path.exists(FILE_LOCATION_PATH):
    print ("Making download directory")
    os.mkdir(FILE_LOCATION_PATH)

# .log files move
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.log'): 
            count = 1
            destination_file = os.path.join(FILE_LOCATION_PATH, file)
            while os.path.exists(destination_file):
                destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
                count += 1
            shutil.move(os.path.join(root, file), destination_file)

# .txt files move
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.txt'):
            count = 1
            destination_file = os.path.join(FILE_LOCATION_PATH, file)
            while os.path.exists(destination_file):
                destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
                count += 1
            shutil.move(os.path.join(root, file), destination_file)

# .json files move
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith('.json'):
            count = 1
            destination_file = os.path.join(FILE_LOCATION_PATH, file)
            while os.path.exists(destination_file):
                destination_file = os.path.join(FILE_LOCATION_PATH, f"{file}_{count}")
                count += 1
            shutil.move(os.path.join(root, file), destination_file)


print("Files Move Complete")
# Delete Directory
print("Cleaning up Downloads Directory")
shutil.rmtree(DOWNLOAD_LOCATION_PATH)

# Remove EFR Audit Logs stratinbg with 2020
print("Remove the encrypted Audit Logs")
pattern = "^(2020)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
    for file in filter(lambda x: re.match(pattern, x), files):
        os.remove(os.path.join(root, file))

# Remove EFR Audit Logs stratinbg with EFR
pattern = "^(EFR)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
    for file in filter(lambda x: re.match(pattern, x), files):
        os.remove(os.path.join(root, file))

# Remove EFR Audit Logs stratinbg with 2019
pattern = "^(2019)"
FILE_LOCATION_PATH = os.path.expanduser("~") + "/AWSSplunk/Output/"
for root, dirs, files in os.walk(FILE_LOCATION_PATH):
    for file in filter(lambda x: re.match(pattern, x), files):
        os.remove(os.path.join(root, file))

# Script clean up        
print("Script Complete")

#with open("download_history.log", "a") as myfile:
#    myfile.write('New Line\n')

【问题讨论】：

如果你想在 download_history.log 中搜索要下载的文件名，你可以像这个答案建议的那样做：stackoverflow.com/a/4940068/8446061

标签： python amazon-web-services amazon-s3 aws-sdk boto3

【解决方案1】：

使用 os 可以检查文件是否存在：

if not os.isfile(PATH_TO_EXPECTED_DOWNLOADED_FILE):
    #do download

为了您自己的安全，请将您的步骤分成功能并构建这些功能的管道。

【讨论】：

我应该在哪里将它添加到我的脚本中。
这说起来很复杂，因为您的代码没有遵循任何推荐的准则。其实我找不到你下载数据的地方？请记住，StackOverFlow 不是为了调整您的代码，而是为了回答您的问题或为社区提供最佳实践解决方案。
同意 - 上次我发布了一个问题，我被要求输入所有代码，所以这就是我所做的，下次会这样做。您的答案的问题是它检查文件是否存在 - 但是我解压缩文件然后删除拉链。不过，我已经解决了这个问题。