【问题标题】:How to Use vectorize or Apply instead of iterrows on pandas dataframe in python如何在 python 中的 pandas 数据帧上使用矢量化或应用而不是 iterrows
【发布时间】:2017-12-06 17:11:45
【问题描述】:

我有 2000 多个包含两列的数据框。我想在列上使用 ngrams,然后使用 ngrams 创建一个新的数据框。这是我的代码。它工作正常。只是要花很多时间。

我目前正在使用 iterrows 来遍历每个文件中每个数据帧的每一行。有没有更简单的方法可以使用矢量化或应用来做到这一点？

import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import psutil
import numpy as np
import pandas as pd
import time

def create_combinations(file):
    """Generate every space/no-space spelling combination for each element.

    Reads a gzip-pickled DataFrame with 'category' and 'element' columns from
    ./to_process/, splits camelCase words, expands each element into all
    2**(word gaps) spacing variants, and writes the combined result to
    ./processed/ as both a gzip pickle and a CSV.

    Parameters
    ----------
    file : str
        File name (not path) of the pickle inside ./to_process/.
    """
    initial_path = './to_process/'
    final_path = './processed/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # Split camelCase by inserting a space before each case transition.
    # regex=True is required on pandas >= 2.0, where the default changed
    # to literal matching; without it the pattern is never applied.
    custom['element'] = custom['element'].str.replace(
        r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element : ' + file + 'Number of rows to combine: ' + str(total_rows))
    if total_rows > 0:
        # Accumulate plain tuples and build the frame once at the end:
        # repeated pd.concat / .loc writes in the original were quadratic.
        # (Index is a clean 0..N-1 here instead of restarting per source row.)
        records = []
        logging.warning('creating combinations')
        for _, data in custom.iterrows():
            words = data['element']
            logging.warning(words)
            # '%%' escapes literal percent signs; each space becomes a '%s'
            # slot that is later filled with either ' ' or ''.
            template = words.replace('%', '%%').replace(' ', '%s')
            logging.warning('Number of words to combine: ' + str(len(words.split())))
            for combo in itertools.product((' ', ''), repeat=words.count(' ')):
                records.append((data['category'], template % combo))
        combined_df = pd.DataFrame.from_records(records, columns=['category', 'element'])
        combined_df.to_pickle(final_path + file, compression='gzip')
        combined_df.to_csv(final_path + os.path.splitext(file)[0] + '.csv')
        logging.warning('completed ' + file)
    else:
        logging.warning('No rows to process')





if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1  # number of partitions to split dataframe
    cores = 2  # number of cores on your machine
    path = './to_process/'
    combi_path = './processed/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    # Collect only the pickles that have not been processed yet.
    pickle_files = []
    for any_file in files:
        if not any_file.endswith('.pickle'):
            continue
        if os.path.isfile(combi_path + any_file):
            logging.warning(any_file + ' already processed.')
        else:
            pickle_files.append(any_file)  # append, not insert(len(...), x)
    start = time.time()
    if pickle_files:
        # Guard: Pool(processes=0) raises ValueError when nothing is pending.
        p = multiprocessing.Pool(processes=len(pickle_files))
        async_result = p.map_async(create_combinations, pickle_files)
        p.close()
        p.join()
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end - start))

在此处输入代码

【问题讨论】:

    标签: python pandas dataframe vectorization apply


    【解决方案1】:
    # pylint: disable=I0011
    # pylint: disable=C0111
    # pylint: disable=C0301
    # pylint: disable=C0103
    # pylint: disable=W0612
    # pylint: disable=W0611
    import logging
    import os
    from os import listdir
    from os.path import isfile, join
    import math
    import pickle
    import itertools
    import multiprocessing
    import time
    import gc
    import numpy as np
    import pandas as pd
    
    
    def create_combinations(file):
        """Expand each element into all space/no-space variants, chunked to disk.

        Reads a gzip-pickled DataFrame ('category', 'element') from the trial
        input directory, splits camelCase, generates every spacing combination,
        and writes numbered pickle/CSV chunks of at most ~200k rows to the
        output directory. The source pickle (and its companion CSV, if present)
        is then moved to the processed directory.

        Parameters
        ----------
        file : str
            File name (not path) of the pickle in the input directory.

        Returns
        -------
        bool
            True on completion.
        """
        logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
        initial_path = './training/pickles/standard and documentation/custom_elements/trial/'
        final_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
        completed_file_path = './training/pickles/standard and documentation/custom_elements_processed_trial/'
        custom = pd.read_pickle(initial_path + file, compression='gzip')
        custom = custom.drop_duplicates(subset=['category', 'element'])
        # regex=True is required on pandas >= 2.0 (the default became literal).
        custom['element'] = custom['element'].str.replace(
            r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
        total_rows = len(custom.index)
        logging.warning('Processing element : ' + file + 'Number of rows to combine: ' + str(total_rows))
        logging.warning('creating combinations')
        # Bug fixes vs. the original: it called len() on a zip object (TypeError
        # on Python 3), deleted the accumulators after a flush and then appended
        # to them (NameError on the next row), and never cleared flushed rows,
        # duplicating them in every later chunk. A single list of tuples that is
        # reset after each flush fixes all three.
        records = []
        k = 1
        for _, data in custom.iterrows():
            words = data['element']
            logging.warning(words)
            # '%%' escapes literal percents; each space becomes a '%s' slot.
            template = words.replace('%', '%%').replace(' ', '%s')
            logging.warning('Number of words to combine: ' + str(len(words.split())))
            for combo in itertools.product((' ', ''), repeat=words.count(' ')):
                records.append((data['category'], template % combo))
            # Flush to disk in chunks so memory stays bounded.
            if len(records) > 200000:
                chunk = pd.DataFrame.from_records(records, columns=['category', 'element'])
                chunk.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
                chunk.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
                records = []  # reset, do not del — next row keeps appending
                del chunk
                gc.collect()
                k += 1
        # Final (possibly partial, possibly empty) chunk.
        combined_df = pd.DataFrame.from_records(records, columns=['category', 'element'])
        combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
        combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
        del combined_df
        del custom
        gc.collect()
        logging.warning('completed ' + file)
        os.rename(initial_path + file, completed_file_path + file)
        # The companion CSV is optional; the original os.rename crashed
        # with FileNotFoundError when it was absent.
        src_csv = initial_path + os.path.splitext(file)[0] + '.csv'
        if os.path.isfile(src_csv):
            os.rename(src_csv, completed_file_path + os.path.splitext(file)[0] + '.csv')
        return True
    
    
    if __name__ == "__main__":
        logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
        partitions = 1  # number of partitions to split dataframe
        cores = 6  # number of cores on your machine
        path = './training/pickles/standard and documentation/custom_elements/trial/'
        combi_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
        processed_file_path = './training/pickles/standard and documentation/custom_elements_processed_trial/'

        def _move_pair(name):
            """Move a pickle and its companion CSV (if any) to the processed dir."""
            os.rename(path + name, processed_file_path + name)
            csv_name = os.path.splitext(name)[0] + '.csv'
            # The CSV twin is optional; the original os.rename crashed
            # with FileNotFoundError when it did not exist.
            if os.path.isfile(path + csv_name):
                os.rename(path + csv_name, processed_file_path + csv_name)

        files = [f for f in listdir(path) if isfile(join(path, f))]
        # Queue only non-empty, not-yet-processed pickles; move the rest aside.
        pickle_files = []
        for any_file in files:
            if not any_file.endswith('.pickle'):
                continue
            if os.path.isfile(combi_path + any_file):
                _move_pair(any_file)
                logging.warning(any_file + ' already processed.')
            else:
                df = pd.read_pickle(path + any_file, compression='gzip')
                if len(df.index) > 0:
                    pickle_files.append(any_file)  # append, not insert(len(...), x)
                else:
                    _move_pair(any_file)
                del df
                gc.collect()
        # 'spawn' keeps worker memory clean; maxtasksperchild recycles workers.
        ctx = multiprocessing.get_context('spawn')
        p = ctx.Pool(processes=cores, maxtasksperchild=1000)
        start = time.time()
        async_result = p.map_async(create_combinations, pickle_files)
        p.close()
        p.join()
        print("Complete")
        end = time.time()
        print('total time (s)= ' + str(end - start))
    

    【讨论】:

      猜你喜欢
      • 2020-08-02
      • 2019-12-21
      • 1970-01-01
      • 2019-01-30
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2017-11-07
      • 2021-09-12
      相关资源
      最近更新 更多