OP 已经尝试过 data.table 方案。这里的优势在于:可以在分组的同时按引用(by reference)更新数据。
library(data.table)
# Convert `group` to a data.table by reference.
setDT(group)
# Add `diff` by reference: per-Subject maximum of `pt` minus each row's `pt`.
group[, diff := max(pt) - pt, by = Subject]
# `:=` returns invisibly, so print the updated table explicitly.
group[]
Subject pt diff
1: 1 2 3
2: 1 3 2
3: 1 5 0
4: 2 2 15
5: 2 5 12
6: 2 8 9
7: 2 17 0
8: 3 3 2
9: 3 5 0
数据
# Example data: subjects 1, 2 and 3 with 3, 4 and 2 measurements respectively.
ID <- rep(c(1, 2, 3), times = c(3, 4, 2))
Value <- c(2, 3, 5, 2, 5, 8, 17, 3, 5)
group <- data.frame(Subject = ID, pt = Value)
基准测试
在撰写本文时,已有 5 个答案,其中包括 Frank 关于 data.table 方法效率的评论。因此,我想知道这五个解决方案中哪一个最快。
- r2evans
- 我的
- Frank
- harelhan
- JonMinton
一些解决方案会就地修改 data.frame。为确保公平比较,并且由于 OP 要求创建一个名为“diff”的新列,所有结果都应返回包含三列的 group。部分答案已据此做了相应修改。harelhan 的答案需要大量修改才能消除错误。
由于 group 会被修改,每次基准测试运行都从只含两列的 group 的全新副本开始。
基准测试以行数和分组占比(组数与行数之比)为参数,也就是说,组的数量会随问题规模同步变化,以便考察各方案的扩展性。
library(data.table)
library(dplyr)
library(bench)
# Benchmark the five answers over a grid of problem sizes. `n_row` is the
# number of rows; `grp_share` is the number of groups relative to the number
# of rows, so the number of groups scales with the problem size.
bm <- press(
  # n_row = c(1E2, 1E4, 1E5, 1E6),  # larger grid, left disabled
  n_row = c(1E2, 1E4, 1E5),
  grp_share = c(0.01, 0.1, 0.5, 0.9),
  {
    n_grp <- grp_share * n_row
    # Fixed seed so every grid cell is reproducible.
    set.seed(1)
    group0 <- data.frame(
      Subject = sample(n_grp, n_row, TRUE),
      pt = as.numeric(rpois(n_row, 100))
    )
    mark(
      # Every expression starts from a fresh two-column copy of `group0`,
      # because some solutions modify `group` in place, and returns `group`
      # with the additional `diff` column.
      r2Evans = {
        group <- copy(group0)
        group <- group %>%
          group_by(Subject) %>%
          mutate(diff = max(pt) - pt)
        group
      },
      Uwe = {
        group <- copy(group0)
        setDT(group)[, diff := max(pt) - pt, by = Subject]
        group
      },
      Frank = {
        group <- copy(group0)
        setDT(group)[, mx := max(pt), by = Subject][
          , diff := mx - pt][
          , mx := NULL]
        group
      },
      harelhan = {
        group <- copy(group0)
        max_group <- group %>%
          group_by(Subject) %>%
          summarize(max_val = max(pt))
        group <- left_join(
          group,
          max_group[, c("Subject", "max_val")],
          by = "Subject"
        )
        group$diff <- group$max_val - group$pt
        group <- group %>% select(-max_val)
        group
      },
      JonMinton = {
        group <- copy(group0)
        group <- group %>%
          group_by(Subject) %>%
          mutate(max_group_val = max(pt)) %>%
          ungroup() %>%
          mutate(diff = max_group_val - pt) %>%
          select(-max_group_val)
        group
      },
      # The expressions return different classes (grouped tibble, data.table,
      # data.frame, tibble). The default `check = TRUE` compares results with
      # all.equal(), which can reject them on class/attribute differences even
      # though the data agree. Compare the contents as plain data frames
      # instead; the check is not part of the timed code.
      check = function(x, y) {
        isTRUE(all.equal(as.data.frame(x), as.data.frame(y)))
      }
    )
  }
)
# Plot the timings for every combination of n_row and grp_share.
ggplot2::autoplot(bm)