我很好奇,于是做了下面的基准测试。两种提议的解决方案都带来了改进,其中 np.where 方案最快。不过,除非您需要多次执行此操作,或者要处理非常庞大的 DataFrame,否则这些差异都算不上大问题。
import perfplot
import pandas as pd
import numpy as np
def mask_both(df):
    """Overwrite ``var2`` with ``var3`` on rows where ``var1 == 'x'``.

    The boolean mask is deliberately computed twice (once to read ``var3``
    and once to select the rows of ``var2`` to write), which is what makes
    this the "mask twice" variant of the benchmark.

    The frame is modified in place; the updated ``var2`` column is returned.
    """
    # Read side: values of var3 on the matching rows (first mask evaluation).
    replacement = df.loc[df['var1'] == 'x', 'var3']
    # Write side: the mask is rebuilt from scratch (second evaluation).
    df.loc[df['var1'] == 'x', 'var2'] = replacement
    return df['var2']
def mask_once(df):
    """Overwrite ``var2`` with ``var3`` on rows where ``var1 == 'x'``.

    The boolean mask is computed a single time and reused for both the read
    of ``var3`` and the write into ``var2``.

    The frame is modified in place; the updated ``var2`` column is returned.
    """
    is_x = df['var1'] == 'x'
    # Same mask selects the source values and the destination rows.
    replacement = df.loc[is_x, 'var3']
    df.loc[is_x, 'var2'] = replacement
    return df['var2']
def numpy_where(df):
    """Overwrite ``var2`` with ``var3`` on rows where ``var1 == 'x'``.

    Uses ``np.where`` to pick, element by element, ``var3`` where the
    condition holds and the existing ``var2`` otherwise, then assigns the
    resulting array back as the whole ``var2`` column.

    The frame is modified in place; the updated ``var2`` column is returned.
    """
    is_x = df['var1'] == 'x'
    merged = np.where(is_x, df['var3'], df['var2'])
    df['var2'] = merged
    return df['var2']
def _random_frame(N):
    """Build a random N-row benchmark frame.

    Columns: ``var1`` drawn from {'x', 'y'}, ``var2`` drawn from
    ``range(100)`` and ``var3`` drawn from ``range(100, 200)``.
    """
    return pd.DataFrame({
        'var1': np.random.choice(['x', 'y'], N),
        'var2': np.random.choice(range(100), N),
        'var3': np.random.choice(range(100, 200), N),
    })


# Time the three candidate implementations on frames of 2**2 .. 2**22 rows.
# np.allclose is used to check that every kernel returns the same column.
perfplot.show(
    setup=_random_frame,
    kernels=[mask_both, mask_once, numpy_where],
    labels=['Mask Twice', 'Mask Once', 'Numpy Where'],
    n_range=[2 ** exponent for exponent in range(2, 23)],
    equality_check=np.allclose,
    xlabel="len(df)",
)