【发布时间】:2022-01-06 21:46:24
【问题描述】:
我正在使用 R 语言处理数据。
我有以下5个数据集(data_1、data_2、data_3、data_4、data_5):
# Levels used for the two categorical columns.
v1 <- c("2010-2011", "2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")

# Simulate one data set with `n` rows: two normally distributed numeric
# columns plus two factor columns sampled (with replacement) from v1 and v2.
# The draws happen in the order var_1, var_2, dates, types.
make_data <- function(n) {
  out <- data.frame(var_1 = rnorm(n, 10, 10), var_2 = rnorm(n, 5, 5))
  out$dates <- as.factor(
    sample(v1, n, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1))
  )
  out$types <- as.factor(
    sample(v2, n, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1))
  )
  out
}

# The five data sets differ only in their number of rows.
data_1 <- make_data(871)
data_2 <- make_data(412)
data_3 <- make_data(332)
data_4 <- make_data(611)
data_5 <- make_data(789)

# Preview the first rows of one of the data sets.
head(data_1)
var_1 var_2 dates types
1 8.523382 4.945344 2010-2011 E
2 14.137515 3.223525 2012-2013 A
3 19.610770 7.762698 2011-2012 D
4 11.334196 10.879946 2012-2013 E
5 -1.406475 2.498347 2011-2012 E
6 11.116458 9.988073 2011-2012 E
根据上面的数据,我做了一个提供汇总的表格:
# One row per data set: its name, its number of rows, and the mean of each
# numeric variable.
# NOTE: the sample output printed below was produced by an earlier version of
# this snippet in which the first entry of mean_var_2 was mistakenly
# mean(data_2$var_1); that is why row 1 of mean_var_2 in that output equals
# row 2 of mean_var_1. The code here uses mean(data_1$var_2) as intended.
summary_table <- data.frame(
  names = c("data_1", "data_2", "data_3", "data_4", "data_5"),
  counts = c(
    nrow(data_1), nrow(data_2), nrow(data_3), nrow(data_4), nrow(data_5)
  ),
  mean_var_1 = c(
    mean(data_1$var_1), mean(data_2$var_1), mean(data_3$var_1),
    mean(data_4$var_1), mean(data_5$var_1)
  ),
  mean_var_2 = c(
    mean(data_1$var_2), mean(data_2$var_2), mean(data_3$var_2),
    mean(data_4$var_2), mean(data_5$var_2)
  )
)
names counts mean_var_1 mean_var_2
1 data_1 871 9.426475 9.853399
2 data_2 412 9.853399 4.680188
3 data_3 332 10.275049 5.256084
4 data_4 611 10.094421 5.323108
5 data_5 789 9.960050 4.946458
我想在上面的表格中添加 5 个新列,分别保存每个数据集在各个年度区间(即 dates 列的每个取值)中的行数。这看起来像这样(这是一个空模板):
# Column names of the desired result: the data set name, its total row count,
# and one count column per date range.
x <- c(
  "names", "counts",
  "counts 2010-2011", "counts 2011-2012", "counts 2012-2013",
  "counts 2013-2014", "counts 2014-2015"
)
# Empty (zero-row) template with one column per name in `x`.
df <- setNames(data.frame(matrix(nrow = 0, ncol = 7)), x)
我知道如何手动执行此操作,但需要很长时间:
library(dplyr)
# Number of rows per `dates` level, computed separately for each data set.
# count(..., name = "my_counts") is the one-call form of
# group_by(dates) followed by summarise(my_counts = n()).
summary_1 <- data.frame(count(data_1, dates, name = "my_counts"))
summary_2 <- data.frame(count(data_2, dates, name = "my_counts"))
summary_3 <- data.frame(count(data_3, dates, name = "my_counts"))
summary_4 <- data.frame(count(data_4, dates, name = "my_counts"))
summary_5 <- data.frame(count(data_5, dates, name = "my_counts"))

# Show the result for the first data set.
summary_1
dates my_counts
1 2010-2011 407
2 2011-2012 189
3 2012-2013 79
4 2013-2014 101
5 2014-2015 95
但我必须手动创建 5 个新列并手动复制这 25 个计数 (5 x 5 = 25)。
有人可以告诉我一个更快的方法吗?
谢谢!
所需输出示例
【问题讨论】:
标签: r dplyr count data-manipulation