使用 .agg(或 .apply)配合 .value_counts 和 .sum
- col.value_counts().gt(1) 会创建一个布尔型 Series。
- True 按 1 计算,False 按 0 计算,因此 .sum() 得到的正是出现次数大于 1 的值的个数。
# Per column: tally each distinct value, flag tallies above 1, then sum the
# flags to get the number of values that occur more than once.
dupe_count = df.agg(lambda col: col.value_counts().gt(1).sum())
A 2
B 2
C 1
dtype: int64
for-loop
- 通常不建议对 DataFrame 进行迭代,尤其是逐行迭代。不过这里遍历的是列,并对每一列应用向量化函数,这与 .apply 内部的做法相同。
def col_vc(df):
    """Count, for each column, the distinct values that occur more than once.

    Args:
        df: A pandas DataFrame.

    Returns:
        dict mapping each column label to the number of distinct values in
        that column whose occurrence count is greater than 1.
    """
    dupe_count = {}
    # Iterating over columns (not rows) keeps the per-column work vectorized.
    for col in df.columns:
        # value_counts() tallies each distinct value, gt(1) flags tallies above
        # one, and summing the booleans counts the flagged values.
        dupe_count[col] = df[col].value_counts().gt(1).sum()
    return dupe_count
col_vc(df)
[result]:
{'A': 2, 'B': 2, 'C': 1}
# Dict comprehension: maps each column label to the number of distinct values
# in that column that occur more than once.
dupe_count = {col: df[col].value_counts().gt(1).sum() for col in df.columns}
[result]:
{'A': 2, 'B': 2, 'C': 1}
# Optional: turn the dict into a DataFrame; orient='index' makes the dict keys
# (column labels) the row index, with the counts in a single column.
dupe_count = pd.DataFrame.from_dict(dupe_count, orient='index')
0
A 2
B 2
C 1
使用 %%timeit 进行性能比较
import pandas as pd
import numpy as np
# Benchmark fixture: 1M rows x 5 integer columns, reproducible via a fixed seed.
np.random.seed(365)
rows = 1_000_000
# Half-open [low, high) bounds per column. Dict order fixes the order of the
# random draws, so the seeded data is the same column for column.
bounds = {
    'a': (0, 10000),
    'b': (15, 25000),
    'c': (30, 40000),
    'd': (450, 550000),
    'e': (6000, 70000),
}
data = {
    name: np.random.randint(low, high, size=rows)
    for name, (low, high) in bounds.items()
}
df = pd.DataFrame(data)
-
使用 .agg(或 .apply)配合 .value_counts 和 .sum
%%timeit
df.agg(lambda x: x.value_counts().gt(1).sum())
[out]:
112 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
{col: df[col].value_counts().gt(1).sum() for col in df.columns}
[out]:
111 ms ± 983 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
col_vc(df)
[out]:
115 ms ± 4.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit
df.agg(lambda x: sum(x.value_counts() > 1))
[out]:
194 ms ± 17.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)