【问题标题】：Summarize values by group, but keep original data（按组汇总值，但保留原始数据）
【发布时间】：2019-11-19 11:38:29
【问题描述】：

我试图弄清楚如何按 file 将属于 category a 和 b 的 values 相加，同时保留原始数据。

library(dplyr)
# Example data: 20 rows with random values, categories a-e recycled down the
# rows, and a 5-level `file` factor holding 4 consecutive rows per level.
df <- data.frame(
  ID = 1:20,
  values = runif(20),
  category = rep(letters[1:5], 4),
  file = factor(rep(1:5, each = 4))
)


   ID     values category file
1   1 0.65699229        a    1
2   2 0.70506478        b    1
3   3 0.45774178        c    1
4   4 0.71911225        d    1
5   5 0.93467225        e    1
6   6 0.25542882        a    2
7   7 0.46229282        b    2
8   8 0.94001452        c    2
9   9 0.97822643        d    2
10 10 0.11748736        e    2
11 11 0.47499708        a    3
12 12 0.56033275        b    3
13 13 0.90403139        c    3
14 14 0.13871017        d    3
15 15 0.98889173        e    3
16 16 0.94666823        a    4
17 17 0.08243756        b    4
18 18 0.51421178        c    4
19 19 0.39020347        d    4
20 20 0.90573813        e    4

这样

df[1,2] 将与 df[2,2] 相加，作为文件 1 的类别 'ab'
df[6,2] 将与 df[7,2] 相加，作为文件 2 的类别 'ab'
等等。

到目前为止，我有这个：

# Keep only the rows whose category is "a" or "b", then total `values`
# separately for each file (one output row per file).
df %>%
  filter(category == "a" | category == "b") %>%
  group_by(file) %>%
  summarise(values = sum(values))

问题

我想将求和值的类别更改为“ab”，并将其附加到同一管道中的原始数据帧中。

期望的输出：

   ID     values category file
1   1 0.65699229        a    1
2   2 0.70506478        b    1
3   3 0.45774178        c    1
4   4 0.71911225        d    1
5   5 0.93467225        e    1
6   6 0.25542882        a    2
7   7 0.46229282        b    2
8   8 0.94001452        c    2
9   9 0.97822643        d    2
10 10 0.11748736        e    2
11 11 0.47499708        a    3
12 12 0.56033275        b    3
13 13 0.90403139        c    3
14 14 0.13871017        d    3
15 15 0.98889173        e    3
16 16 0.94666823        a    4
17 17 0.08243756        b    4
18 18 0.51421178        c    4
19 19 0.39020347        d    4
20 20 0.90573813        e    4
21 21 1.25486225       ab    1
22 22 1.87216325       ab    2
23 23 1.36548126       ab    3

【问题讨论】：

也许 setDT(df)[category %chin% c('a','b'), summed:=sum(values), file]
@chinsoon12 %chin% 实际上在这里不起作用（假设默认 stringsAsFactor 选项），因为类别是一个因素而不是字符。不过，您应该将此作为答案发布
我是用手机评论的，没有测试过，所以最好不要作为答案发布

标签： r dplyr

【解决方案1】：

这将为您提供结果

# Build one combined row per file from its "a"/"b" rows, append those rows to
# the original data, then renumber ID over the combined result.
df %>% bind_rows(
  df %>%
    filter(category %in% c("a", "b")) %>%
    group_by(file) %>%
    mutate(
      # Every row of the group gets the group total and the concatenated
      # category labels of the group (e.g. "ab").
      values = sum(values),
      category = paste0(category, collapse = "")
    ) %>%
    # Keep only the first row of a group, and only for groups that have more
    # than one row.
    filter(n() > 1, row_number() == 1)
) %>% mutate(ID = row_number())

顺便说一句，生成示例中所示数据框的代码应该是这个：

# Example data: 20 rows with random values, categories a-e recycled down the
# rows, and a 4-level `file` factor holding 5 consecutive rows per level.
df <- data.frame(
  ID = 1:20,
  values = runif(20),
  category = rep(letters[1:5], 4),
  file = factor(rep(1:4, each = 5))
)

现在假设您想对多列求和，您需要在向量中提供列表：

# Names of the columns to be summed within each file.
cols <- c("values")

# For each file, sum every column named in `cols` over its "a"/"b" rows, keep a
# single combined row per file, append it to the original data and renumber ID.
df %>% bind_rows(
  df %>%
    filter(category %in% c("a", "b")) %>%
    group_by(file) %>%
    # across() replaces the superseded mutate_at(); all_of() makes explicit
    # that `cols` is an external character vector, not a column of df.
    mutate(across(all_of(cols), sum)) %>%
    mutate(category = paste0(category, collapse = "")) %>%
    # Keep only the first row of a group, and only for groups that have more
    # than one row.
    filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())

【讨论】：

+1，因为用 paste 的 collapse 参数自动生成了新的类别名称
如果想对一整组列求和，您知道该怎么做吗？我的真实数据有 80 个参数列。目前我只需要其中 2 列的值，这可以手动处理，但我希望代码能自动适用于任意数量的列
编辑了代码，以便您可以提供要求和的列向量
好的，成功地编写了允许计算合并数据集（多变量分析中的集群）的新总数、最小值和最大值的代码。现在试图弄清楚如何正确地重新计算平均值。为此发布一个新问题
分享链接 ;)

【解决方案2】：

library(dplyr)

# Sum `values` over the "a" and "b" rows of each file that contains both
# categories, label the totals "ab", give them IDs continuing after the largest
# existing ID, and append them below the original rows.
df %>%
  filter(category %in% c("a", "b")) %>%
  group_by(file) %>%
  filter(n_distinct(category) > 1) %>%
  summarise(values = sum(values)) %>%
  mutate(category = "ab",
         # seq_len() rather than 1:n(): when no file has both categories the
         # summary has zero rows and 1:0 would produce c(1, 0).
         ID = max(df$ID) + seq_len(n())) %>%
  bind_rows(df, .)
#> Warning in bind_rows_(x, .id): binding factor and character vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#>    ID     values category file
#> 1   1 0.62585921        a    1
#> 2   2 0.61865851        b    1
#> 3   3 0.05274456        c    1
#> 4   4 0.68156961        d    1
.
.
.
#> 19 19 0.43239411        d    5
#> 20 20 0.85886314        e    5
#> 21 21 1.24451773       ab    1
#> 22 22 0.99001810       ab    2
#> 23 23 1.25331943       ab    3

【讨论】：

【解决方案3】：

这种data.table 方法使用自连接来获取所有可能的两个字符组合。

library(data.table)
# Convert df to a data.table by reference.
setDT(df)

# Self-join df on `file` so each row is paired with the rows of the same file,
# drop the pairs whose two categories are equal, then build a combined
# category label and the summed values for every remaining pair.
# allow.cartesian = TRUE permits the join result to be larger than the inputs.
df_self_join <- df[df, on = .(file), allow.cartesian = TRUE
                   ][category != i.category,
                     .(category = paste0(i.category, category), values = values + i.values, file)
                     ][order(category), .(ID = .I + nrow(df), values, category, file)]

# Stack the original rows and the pair rows.
rbindlist(list(df, df_self_join))

    ID     values category file
 1:  1 0.76984382        a    1
 2:  2 0.54311583        b    1
 3:  3 0.23462016        c    1
 4:  4 0.60179043        d    1
...
20: 20 0.03534223        e    5
21: 21 1.31295965       ab    1
22: 22 0.51666175       ab    2
23: 23 1.02305754       ab    3
24: 24 1.00446399       ac    1
25: 25 0.96910373       ac    2
26: 26 0.87795389       ac    4
#total of 80 rows

下面是一个与之非常接近的 dplyr 版本：

library(dplyr)
# Convert df to a tibble.
tib <- as_tibble(df)

# Pair the rows that share a file, drop each row's pairing with itself, add the
# paired values under a concatenated category label, append the pairs to the
# original rows and renumber ID.
inner_join(tib, tib, by = "file") %>%
  filter(ID.x != ID.y) %>%
  transmute(
    category = paste0(category.x, category.y),
    values = values.x + values.y,
    file
  ) %>%
  arrange(category) %>%
  bind_rows(tib, .) %>%
  mutate(ID = row_number()) %>%
  filter(category == "ab") # filter added to show the "ab" files

# A tibble: 3 x 4
     ID values category file 
  <int>  <dbl> <chr>    <fct>
1    21  1.31  ab       1    
2    22  0.517 ab       2    
3    23  1.02  ab       3

【讨论】：