试试:
# Drop the ";" separators, then test whether what is left consists of one
# character repeated (the back-reference \\1 re-matches the first character).
# grepl() yields FALSE/TRUE, so adding 1 selects "No" or "Yes" respectively.
transform(
  df,
  same = c("No", "Yes")[
    1 + grepl("^(.)\\1*$", gsub(";", "", gradelist))
  ]
)
输出:
id gradelist same
1 1 a;b;b No
2 2 c;c Yes
3 3 d;d;d;f No
4 4 f;f;f;f;f;f Yes
5 5 a;a;a;a Yes
6 6 f;b;b;b;b;b;b;b No
7 7 c;c;d;d;a;a No
你也可以改用 strsplit 的方式,如下:
# Split each gradelist on ";" and label the row "Yes" when only one distinct
# grade remains, otherwise "No".
# vapply() pins the per-row result to a single logical, so the "+ 1" index
# arithmetic also works when df has zero rows (sapply() would return a list
# there and the addition would fail). Each strsplit() element is already a
# character vector, so no unlist() is needed.
transform(
  df,
  same = c("No", "Yes")[
    vapply(
      strsplit(gradelist, split = ";"),
      function(grades) length(unique(grades)) == 1,
      logical(1)
    ) + 1
  ]
)
基准测试
我们先把每个字符串重复 20 次(用 ; 连接),再把 df 的每一行重复 15000 次,这样最终得到略多于 10 万行(7 × 15000 = 105000 行);同时把 @ThomasIsCoding 使用的函数赋值给 f。
# Lengthen every string: repeat each original gradelist 20 times, joined by ";".
# vapply() (instead of sapply()) guarantees a character vector even when df has
# no rows, and rep() replaces replicate(), which is meant for repeated
# evaluation of an expression rather than for repeating a value.
df$gradelist <- vapply(
  df$gradelist,
  function(x) paste(rep(x, 20), collapse = ";"),
  character(1)
)
# Repeat each row 15000 times to get a benchmark-sized data frame.
df <- df[rep(seq_len(nrow(df)), each = 15000), ]
# Benchmark candidate attributed to @ThomasIsCoding; kept verbatim so that the
# timings reported below still describe this exact code.
# For a single string x: split it on ";" and return "yes" when exactly one
# distinct piece remains, otherwise "no". Vectorize() wraps that scalar
# function so it can be called on a whole vector of strings.
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
对所有 base R 方案,我们都使用 transform,以模拟 tidyverse 方案中 mutate 的行为;每个方案用 microbenchmark 运行 10 次:
# Time every candidate solution 10 times (times = 10) on the enlarged df and
# keep the timings in mBench. The candidate expressions are left exactly as
# submitted so the reported results stay valid.
# NOTE(review): %>%, mutate, map_chr, str_extract_all, n_distinct, case_when,
# separate_rows, distinct, group_by, summarise and left_join are not loaded in
# the visible code - presumably dplyr/purrr/stringr/tidyr are attached; confirm.
# The candidates do not all return the same representation: some give
# "no"/"yes", some "No"/"Yes", `db` gives a logical and `M--` a factor.
mBench <- microbenchmark::microbenchmark(
# akrun1: extract every lowercase letter per row, then index c("no", "yes")
# by whether the row has exactly one distinct letter.
akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
# akrun2: same extraction as akrun1, but the label is chosen with case_when().
akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
# akrun3: reshape to one grade per row, de-duplicate, count distinct grades
# per id, then join the label back onto df.
# NOTE(review): separate_rows() is called without an explicit separator -
# presumably its default splits on ";"; confirm.
akrun3 = { df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df) },
# db: delete every occurrence of the string's first character and of ";";
# the row is uniform when nothing is left. Yields TRUE/FALSE, not a label.
db = { transform(df, same = sapply(gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
# M--: split on ";", count unique pieces per row, and relabel the logical
# comparison as a factor with labels "No"/"Yes".
# NOTE(review): two labels are supplied, so this presumably needs both TRUE
# and FALSE to occur in the data - confirm.
`M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
# ThomasIsCoding1: applies the Vectorize()d helper f defined before this call.
ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
# ThomasIsCoding2: pull out each word character with gregexpr()/regmatches()
# and label the row by the number of distinct characters.
ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
# arg0naut91_1: strip ";" and test for a single repeated character via a
# back-reference regex.
arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\\1*$", gsub(';', '', df$gradelist)) + 1]) },
# arg0naut91_2: split on ";" and test for exactly one unique piece per row.
arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },
times = 10
)
结果:
Unit: seconds
expr min lq mean median uq max neval
akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420 10
akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535 10
akrun3 6.378463 7.190472 7.379439 7.373730 7.704365 8.321929 10
db 3.738271 3.785858 3.935769 3.911479 3.926385 4.523876 10
M-- 3.551592 3.648720 3.723315 3.741075 3.798664 3.915588 10
ThomasIsCoding1 4.453528 4.498858 4.702160 4.613088 4.823517 5.379984 10
ThomasIsCoding2 3.368358 3.532593 3.752111 3.610664 3.773345 4.969414 10
arg0naut91_1 1.638212 1.683986 1.699327 1.704614 1.716077 1.759059 10
arg0naut91_2 3.665604 3.739662 3.774542 3.750144 3.774753 4.071887 10
图表: