下面是一个使用 pd.Series.value_counts 和 pd.DataFrame.loc 的矢量化解决方案:
# Count how often each name occurs (no need to sort the counts).
s = df['name'].value_counts(sort=False)
# Names that occur exactly once.
once = s[s == 1].index
# Overwrite those names with the catch-all label, in place.
mask = df['name'].isin(once)
df.loc[mask, 'name'] = 'other'
print(df)
name pt
0 john 23
1 sam 32
2 john 45
3 john 65
4 dean 65
5 dean 45
6 maggi 32
7 other 45
8 maggi 90
9 other 10
10 sam 32
11 sam 22
性能基准测试
如果存在大量重复的名称,可以将该列转换为分类数据(Categorical Data)以提高性能。以下是在 Python 3.6、Pandas 0.19 下的计时结果。
def jpp(df):
    """Relabel names that occur exactly once as 'other' via value_counts.

    The 'name' column of ``df`` is modified in place and the same
    DataFrame object is returned.
    """
    # Occurrence count per name; ordering of the counts is irrelevant here.
    s = df['name'].value_counts(sort=False)
    # Boolean mask of rows whose name has a count of exactly one.
    is_single = df['name'].isin(s[s == 1].index)
    df.loc[is_single, 'name'] = 'other'
    return df
def jez(df):
    """Relabel names that occur exactly once as 'other' via groupby sizes.

    The 'name' column of ``df`` is replaced with the relabelled values and
    the same DataFrame object is returned.
    """
    # Size of the group each row belongs to, aligned with the rows of df.
    group_size = df.groupby('name')['name'].transform('size')
    # Keep the original name unless its group contains a single row.
    df['name'] = np.where(group_size == 1, 'other', df['name'])
    return df
def jon(df):
    """Relabel names that occur exactly once as 'other' via a Counter.

    The 'name' column of ``df`` is replaced with the relabelled values and
    the same DataFrame object is returned.
    """
    # Tally the names once up front; the lambda below only does lookups.
    counts = Counter(df['name'])
    df['name'] = df['name'].apply(
        lambda name: name if counts[name] > 1 else 'other'
    )
    return df
# Sanity check that the three implementations agree before timing them.
# NOTE(review): jpp, jez and jon each assign into the frame they are given
# and return that same frame, so both sides of each `equals` call below are
# the same object; compare results computed on separate copies if a real
# cross-check is wanted.
assert jpp(df).equals(jez(df))
assert jpp(df).equals(jon(df))
# IPython %timeit magics; the trailing comments record the timing reported
# for each implementation.
%timeit jpp(df) # 49.4 ms per loop
%timeit jez(df) # 56.2 ms per loop
%timeit jon(df) # 274 ms per loop
设置
# Base sample: twelve rows, of which 'ram' and 'ana' occur only once.
names = ['john', 'sam', 'john', 'john', 'dean', 'dean', 'maggi',
         'ram', 'maggi', 'ana', 'sam', 'sam']
points = [23, 32, 45, 65, 65, 45, 32, 45, 90, 10, 32, 22]
df = pd.DataFrame({'name': names, 'pt': points})

# Store the names as a categorical and register 'other' as a category.
df['name'] = df['name'].astype('category').cat.add_categories('other')

# Enlarge the frame for benchmarking: append 100000 copies of the first
# five rows after the base sample.
repeated_head = pd.concat([df.iloc[:5]] * 100000)
df = pd.concat([df, repeated_head])