我不喜欢 pandas 中的 .apply() 方法,因为它确实效率不高。
下面是我的另一种解决方案,可以帮助您更高效地处理这个问题。我还做了一个基准测试,用来证明 .apply() 确实效率低下。在处理大数据时,应当只在确有必要时才使用它。
df['Date'] = df.loc[:, 'Release Date'][:12] + ' ' + df['Time']
这一行的本意是:对 'Release Date' 列的每一行,取索引 0 到 12(不含 12)的字符,加上一个空格,再拼接 'Time' 列(隐含的意思是所有行)。注意:要按字符切片必须使用 `.str[:12]`;直接在 Series 上写 `[:12]` 选取的是前 12 行,而不是每个字符串的前 12 个字符。
import pandas as pd
import timeit
from matplotlib import pyplot as plt
def IMCoins(df):
    """Add a 'Date' column using vectorised string operations.

    The new column is the first 12 characters of each 'Release Date'
    value, a single space, and the matching 'Time' value.

    Args:
        df: DataFrame with string columns 'Release Date' and 'Time'.
            It is modified in place.

    Returns:
        None. The result is written to ``df['Date']``.
    """
    # `.str[:12]` slices every string element-wise.  A bare `[:12]` on the
    # Series would select the first 12 *rows* instead, leaving the strings
    # untruncated and producing NaN for every row past the twelfth.
    df['Date'] = df['Release Date'].str[:12] + ' ' + df['Time']
def petezurich(df):
    """Add a 'Date' column using a per-element ``.apply()`` call.

    Each 'Release Date' string is cut to its first 12 characters by a
    Python-level lambda, then joined to the 'Time' column with a space.

    Args:
        df: DataFrame with string columns 'Release Date' and 'Time'.
            It is modified in place.

    Returns:
        None. The result is written to ``df['Date']``.
    """
    truncated = df['Release Date'].apply(lambda value: value[:12])
    df['Date'] = truncated + ' ' + df['Time']
def benchmark(x_ticks, time_arr_1, time_arr_2):
    """Plot the two timing series against each other and show the figure.

    Args:
        x_ticks: Labels for the x axis, one per measurement.
        time_arr_1: Timings in seconds, drawn in green as 'IMCoins'.
        time_arr_2: Timings in seconds, drawn in red as 'petezurich'.
    """
    # Measurements are drawn at evenly spaced integer positions; the
    # labels from x_ticks are attached to those positions further down.
    positions = range(len(time_arr_1))
    series = (
        (time_arr_1, 'g', 'IMCoins'),
        (time_arr_2, 'r', 'petezurich'),
    )

    plt.figure()
    for timings, colour, owner in series:
        plt.plot(positions, timings, marker='o', color=colour, label=owner)

    plt.ylabel('Time in seconds')
    plt.xlabel('Number of elements to iterate on')
    plt.xticks(list(range(len(x_ticks))), x_ticks, rotation=30)
    plt.legend()
    plt.tight_layout()
    plt.show()
if __name__ == '__main__':
    # Number of executions timeit performs for each measurement.
    n_iter = 10
    # Row count of the first DataFrame that gets measured.
    n_elements = 10
    # Number of data sizes measured; the row count is multiplied by
    # `factor` after each one.
    n_increase = 7
    factor = 10

    time_arr_1, time_arr_2, x_ticks = [], [], []
    for _ in range(n_increase):
        # The data is rebuilt on every pass because its size grows.
        data = {
            'Release Date': ['a' * 20 for _ in range(n_elements)],
            'Time': ['b' * 10 for _ in range(n_elements)],
        }
        df = pd.DataFrame(data)

        # Check that both functions give the same result.  They write the
        # 'Date' column in place and return None, so comparing their return
        # values would always pass; compare the produced columns instead,
        # on copies so the frame used for timing stays untouched.
        expected, actual = df.copy(), df.copy()
        petezurich(expected)
        IMCoins(actual)
        assert actual['Date'].equals(expected['Date']), 'results are different'

        t1 = timeit.timeit(stmt='IMCoins(df)',
                           setup='from __main__ import df, IMCoins',
                           number=n_iter)
        time_arr_1.append(t1)

        t2 = timeit.timeit(stmt='petezurich(df)',
                           setup='from __main__ import df, petezurich',
                           number=n_iter)
        time_arr_2.append(t2)

        # Remember the number of elements computed on, so the plot can
        # label its x axis with it.
        x_ticks.append(n_elements)

        # Grow the data for the next pass.
        n_elements *= factor

    benchmark(x_ticks, time_arr_1, time_arr_2)