【问题标题】:Python Requests Stream Via Tor - Connection Dies（Python 通过 Tor 流式请求 —— 连接中断）
【发布时间】:2020-12-06 00:49:48
【问题描述】:

我正在使用 python 请求库以多处理方式在“onionurl”下载文件,以从 tor 服务下载许多文件。

下面是我目前使用的相关代码。

但是，这些文件下载一两分钟后连接就会中断。流式下载停止时没有抛出任何错误，只是打印出“closing text file”。这导致无法完整下载托管在这些 onion 服务器上的文件——每个文件都有数百 GB。

对于解决此问题的任何帮助将不胜感激。

    session = requests.session()
    session.proxies = {}
    session.proxies['http'] = 'socks5h://localhost:9050'
    session.proxies['https'] = 'socks5h://localhost:9050'
    #print(onionurlforrequest)
    

    url = onionurl

    try:
        if not os.path.isdir(foldername):
            os.makedirs(foldername)
        # download the body of response by chunk, not immediately
        with session.get(url, stream=True, verify=False, timeout=1000000) as response:
            # get the total file size
            file_size = int(response.headers.get("Content-Length", 0))
            print(file_size)
            # get the file name

            filename = dataloc


            with open(filename, "wb") as text_file: 
                for chunk in response.iter_content(chunk_size=1024):

                    text_file.write(chunk)
 
                    if (file_size  > 1000000):
                        filesizemb = file_size / 1000000
                    else:
                        filesizemb = 1
            print("closing text file")
            text_file.close()

【问题讨论】:

    标签: python python-3.x python-requests multiprocessing tor


    【解决方案1】:

    设法解决它,只需接受连接将断开并编写一个新函数以在确切的偏移量处恢复下载,该问题的理论在此问题中进行了解释 - How to resume file download in Python?

    我的代码(警告,混乱):

    def onionrequestthreadeddataleakdownloadresume(onionurl, resume_byte_pos):
        """Resume a partially downloaded file from a Tor onion service.

        Parameters:
            onionurl: two-element list ``[companyname, url]`` (the same shape
                the multiprocessing pool passes to the initial downloader).
            resume_byte_pos: number of bytes already on disk; sent as an HTTP
                ``Range`` header so the server continues from that offset.

        The data is appended to
        ``/media/archangel/Elements/clop/dataleaks/<companyname>/<url basename>``.
        All exceptions are logged/printed and swallowed so the caller's retry
        loop can simply re-check the on-disk size and call this again.
        """
        print("rerunning")
        companyname = onionurl[0]
        onionurl = onionurl[1]

        dataloc = '/media/archangel/Elements/clop/dataleaks/'
        foldername = dataloc
        dataloc = dataloc + companyname + "/"
        try:
            if not os.path.isdir(dataloc):
                os.mkdir(dataloc)
        except Exception as e:
            print(e)
            print("folder not created")

        # Final on-disk path: <company folder>/<basename of the URL>.
        dataloc = dataloc + os.path.basename(onionurl)

        try:
            # Route everything through the local Tor SOCKS proxy; socks5h
            # makes the proxy itself resolve the .onion hostname.
            session = requests.session()
            session.proxies = {
                'http': 'socks5h://localhost:9050',
                'https': 'socks5h://localhost:9050',
            }

            print("dataloc")
            print(dataloc)
            print("onionurl")
            print(onionurl)
            url = onionurl

            try:
                if not os.path.isdir(foldername):
                    os.makedirs(foldername)

                # Watchdog so a stalled Tor circuit cannot hang this worker
                # forever.  NOTE(review): ``Timeout`` is assumed to be
                # imported elsewhere in this file -- confirm its origin.
                try:
                    timeout = Timeout(20)
                    timeout.start()
                except Exception as ex:
                    print(ex)

                # 'Accept-Encoding': None disables transparent compression so
                # Content-Length matches the bytes written to disk; Range asks
                # the server to continue at the current offset.
                resume_header = {'Accept-Encoding': None,
                                 'Range': 'bytes=%d-' % resume_byte_pos}
                try:
                    with session.get(url, stream=True, verify=False,
                                     headers=resume_header, timeout=600) as response:
                        # NOTE(review): on a 206 partial response this is the
                        # size of the REMAINING bytes, not the whole file.
                        file_size = int(response.headers['Content-Length'])
                        print(file_size)
                        try:
                            # Append mode: the resumed bytes extend the
                            # partial file already on disk.
                            with open(dataloc, "ab") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    # Skip keep-alive chunks; flush each write
                                    # so progress survives a dropped circuit.
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)

                        print("exited with for file")
                except Exception as ex:
                    logging.error(f'Request failed with error: {ex}')
                    print(ex)
            except Exception as ex:
                logging.error(f'Attempt failed with error: {ex}')
                print(ex)

            print("closing text file")
        except Exception as e:
            print("FAILED DOWNLOAD 5")
            print(e)
    
    
    
    
    
    
    
    
    
    
    
    
    def _resume_until_complete(companyname, onionurl, filename, file_size):
        """Re-invoke the resume downloader until the file on disk reaches
        ``file_size`` bytes.

        Consolidates the three copy-pasted retry loops of the original.  Like
        the original, this can loop indefinitely if the server ignores Range
        requests or reports an inconsistent Content-Length.
        """
        file_size_offline = Path(filename).stat().st_size
        while file_size_offline != file_size:
            print(file_size_offline)
            print(file_size)
            print("file size incomplete")
            try:
                onionrequestthreadeddataleakdownloadresume(
                    [companyname, onionurl], file_size_offline)
            except Exception as ex:
                print("redownload failed")
                print(ex)
            file_size_offline = Path(filename).stat().st_size
        print("LOOP FINISHED")


    def onionrequestthreadeddataleakdownload2(onionurl):
        """Download one file from a Tor onion service, resuming until complete.

        Parameters:
            onionurl: two-element list ``[companyname, url]``.

        Returns:
            ``[local_path, basename, url, size_mb]`` on success, or ``None``
            when the download could not even be started.  Errors are
            printed/logged instead of raised so a multiprocessing pool worker
            never dies on a bad URL.

        Fixes over the original: the three duplicated retry loops are replaced
        by ``_resume_until_complete``, and ``file_size`` is checked before use
        so a failed first request no longer raises a silently-swallowed
        NameError.
        """
        companyname = onionurl[0]
        onionurl = onionurl[1]

        dataloc = '/media/archangel/Elements/clop/dataleaks/'
        foldername = dataloc
        dataloc = dataloc + companyname + "/"
        try:
            if not os.path.isdir(dataloc):
                os.mkdir(dataloc)
        except Exception as e:
            print(e)
            print("folder not created")

        filenamebasename = os.path.basename(onionurl)
        dataloc = dataloc + filenamebasename

        try:
            # All traffic through the local Tor SOCKS proxy; socks5h lets the
            # proxy resolve the .onion hostname.
            session = requests.session()
            session.proxies = {
                'http': 'socks5h://localhost:9050',
                'https': 'socks5h://localhost:9050',
            }

            print("dataloc")
            print(dataloc)
            print("onionurl")
            print(onionurl)
            url = onionurl

            try:
                print("url")
                print(url)
                if not os.path.isdir(foldername):
                    os.makedirs(foldername)

                # Watchdog so a stalled circuit cannot hang this worker.
                # NOTE(review): ``Timeout`` is assumed to be imported
                # elsewhere in this file -- confirm its origin.
                try:
                    timeout = Timeout(20)
                    timeout.start()
                except Exception as ex:
                    print(ex)

                # file_size stays None when the request never returns headers.
                file_size = None
                filesizemb = 1
                # Disable transparent compression so Content-Length matches
                # the bytes actually written to disk.
                headersac = {'Accept-Encoding': None}
                try:
                    with session.get(url, stream=True, verify=False,
                                     headers=headersac, timeout=600) as response:
                        file_size = int(response.headers['Content-Length'])
                        # Size in whole MB, clamped to a minimum of 1.
                        if file_size > 1000000:
                            filesizemb = file_size / 1000000
                        print(file_size)
                        try:
                            with open(dataloc, "wb") as text_file:
                                for chunk in response.iter_content(chunk_size=1024 * 1024):
                                    # Skip keep-alive chunks; flush so partial
                                    # progress survives a dropped connection.
                                    if chunk:
                                        text_file.write(chunk)
                                        text_file.flush()
                        except Exception as ex:
                            logging.error(f'write failed with error: {ex}')
                            print(ex)
                except Exception as ex:
                    logging.error(f'request failed with error: {ex}')
                    print(ex)
                    print("exited with for file")

                if file_size is None:
                    # Never got a response -- nothing on disk to resume from.
                    return None

                # Keep resuming until the on-disk size matches the server's.
                _resume_until_complete(companyname, onionurl, dataloc, file_size)

                print(file_size)
                print(dataloc)
                # [location on external drive, filename, onion url, size in MB]
                return [dataloc, filenamebasename, url, filesizemb]
            except Exception as e:
                print("FAILED DOWNLOAD 2")
                print(e)
        except Exception as e:
            print("FAILED DOWNLOAD 5")
            print(e)
    

    【讨论】:

      猜你喜欢
      • 2019-06-04
      • 2022-11-26
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2020-04-20
      • 2019-09-16
      • 2018-10-26
      • 2018-03-08
      相关资源
      最近更新 更多