【发布时间】:2017-12-06 17:11:45
【问题描述】:
我有 2000 多个包含两列的数据框。我想在列上使用 ngrams,然后使用 ngrams 创建一个新的数据框。这是我的代码。它工作正常。只是要花很多时间。
我目前正在使用 iterrows 来遍历每个文件中每个数据帧的每一行。有没有更简单的方法可以使用矢量化或应用来做到这一点？
import logging
import os
from os import listdir
from os.path import isfile, join
import math
import pickle
import itertools
import multiprocessing
import psutil
import numpy as np
import pandas as pd
import time
def create_combinations(file):
    """Expand one pickled DataFrame of (category, element) rows into all
    spacing variants of each element and persist the result.

    Reads ``./to_process/<file>`` (gzip pickle), splits camelCase words in
    the ``element`` column, then for every row generates every combination
    of keeping/removing each inter-word space. The expanded frame is written
    to ``./processed/<file>`` (gzip pickle) and a sibling ``.csv``.

    Parameters
    ----------
    file : str
        Bare filename (e.g. ``'foo.pickle'``) located under ``./to_process/``.
    """
    initial_path = './to_process/'
    final_path = './processed/'
    custom = pd.read_pickle(initial_path + file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # Insert a space at camelCase boundaries ("FooBar" -> "Foo Bar").
    # regex=True must be explicit: pandas 2.0 changed the str.replace
    # default to regex=False, which would silently skip this split.
    custom['element'] = custom['element'].str.replace(
        r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element : ' + file + 'Number of rows to combine: ' + str(total_rows))
    if total_rows > 0:
        logging.warning('creating combinations')
        # Accumulate plain dicts and build ONE DataFrame at the end.
        # The original concatenated a DataFrame per row inside the loop,
        # which is quadratic in the total number of generated rows.
        records = []
        for _, data in custom.iterrows():
            words = data['element']
            logging.warning(words)
            # Turn the string into a %-template: each space becomes a %s
            # slot; literal % is escaped first so the template stays valid.
            template = words.replace('%', '%%').replace(' ', '%s')
            logging.warning('Number of words to combine: ' + str(len(words.split())))
            # One product element per space: keep it (' ') or drop it ('').
            for seps in itertools.product((' ', ''), repeat=words.count(' ')):
                records.append({'category': data['category'],
                                'element': template % seps})
        combined_df = pd.DataFrame(records, columns=['category', 'element'])
        combined_df.to_pickle(final_path + file, compression='gzip')
        combined_df.to_csv(final_path + os.path.splitext(file)[0] + '.csv')
        logging.warning('completed ' + file)
    else:
        logging.warning('No rows to process')
if __name__ == "__main__":
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
partitions = 1 #number of partitions to split dataframe
cores = 2 #number of cores on your machine
path ='./to_process/'
combi_path = './processed/'
files = [f for f in listdir(path) if isfile(join(path, f))]
pickle_files=[]
for any_file in files:
if any_file.endswith('.pickle'):
if os.path.isfile(combi_path+any_file):
logging.warning(any_file +' already processed.')
else:
pickle_files.insert(len(pickle_files),any_file)
p = multiprocessing.Pool(processes = len(pickle_files))
start = time.time()
async_result = p.map_async(create_combinations, pickle_files)
p.close()
p.join()
print("Complete")
end = time.time()
print('total time (s)= ' + str(end-start))
在此处输入代码
【问题讨论】:
标签: python pandas dataframe vectorization apply