有趣的问题——我设计了一个小测试,用来比较不同函数在速度和内存使用方面的表现。
library(dplyr)
library(rbenchmark)
library(data.table)
数据
#' Build a list of synthetic data.tables used as benchmark input.
#'
#' @param nobvs Number of observations per year; each table has
#'   `2 * nobvs` rows.
#' @param nelements Number of tables in the returned list.
#' @return A list of `nelements` data.tables with columns `year` (1980 and
#'   1981, each repeated `nobvs` times) and `day` (draws from `rnorm()`).
dataList <- function(nobvs = 500, nelements = 2) {
  # seq_len() yields an empty list for nelements = 0, whereas 1:0 would
  # expand to c(1, 0) and silently build two tables.
  lapply(seq_len(nelements), function(k) {
    data.table(
      year = rep(1980:1981, each = nobvs),
      day = rnorm(2 * nobvs)
    )
  })
}
代码
比较函数
#' Stack every table with rbindlist(), then keep only the 1980 rows.
#'
#' @param ll List of tables that share a `year` column. The default refers
#'   to a `data_list` object, which must exist in the enclosing environment
#'   when the argument is omitted.
#' @return A single data.table containing the rows where `year == 1980`.
fnRbindlist <- function(ll = data_list) {
  # Spell out TRUE: T is an ordinary variable and can be reassigned.
  stacked <- rbindlist(l = ll, use.names = TRUE, fill = TRUE)
  stacked[year == 1980]
}
#' Stack every table with bind_rows(), then keep only the 1980 rows.
#'
#' @param ll List of tables that share a `year` column. The default refers
#'   to a `data_list` object, which must exist in the enclosing environment
#'   when the argument is omitted.
#' @return The combined table restricted to rows where `year == 1980`.
fnBindRows <- function(ll = data_list) {
  combined <- bind_rows(ll)
  filter(combined, year == 1980)
}
#' Filter each table to 1980 first, then row-bind the pieces via map_df().
#'
#' @param ll List of tables that share a `year` column. The default refers
#'   to a `data_list` object, which must exist in the enclosing environment
#'   when the argument is omitted.
#' @return The row-bound result of filtering every element to `year == 1980`.
fnPurr <- function(ll = data_list) {
  keep_1980 <- function(tbl) {
    dplyr::filter(tbl, year == 1980)
  }
  purrr::map_df(ll, keep_1980)
}
#' Subset each data.table to 1980 first, then stack the pieces.
#'
#' @param ll List of data.tables that share a `year` column. The default
#'   refers to a `data_list` object, which must exist in the enclosing
#'   environment when the argument is omitted.
#' @return A single data.table containing the rows where `year == 1980`.
fnSubsetRbind <- function(ll = data_list) {
  mm <- lapply(ll, function(k) k[year == 1980])
  # Spell out TRUE: T is an ordinary variable and can be reassigned.
  rbindlist(mm, use.names = TRUE, fill = TRUE)
}
比较速度
# Time the four approaches at three input sizes; one result table per size.
# NOTE: every timed expression also calls dataList(), so the elapsed times
# include generating the input data, not just binding and filtering.
speed_results <- lapply(c(100, 1000, 10000), function(nobvs) {
  nelements <- 2
  dat <- benchmark(
    fnRbindlist(dataList(nobvs, nelements)),
    fnBindRows(dataList(nobvs, nelements)),
    fnPurr(dataList(nobvs, nelements)),
    fnSubsetRbind(dataList(nobvs, nelements)),
    replications = 1000,
    order = "elapsed",
    columns = c("test", "replications", "elapsed")
  )
  # Rows are ordered by elapsed time, so the row index serves as the rank.
  # seq_len() stays correct for a zero-row result, unlike 1:nrow(dat).
  dat$rank <- seq_len(nrow(dat))
  dat$nobvs <- nobvs
  dat$nelements <- nelements
  dat
})
比较内存使用情况
# Record the memory change reported by pryr::mem_change() for each approach
# at three input sizes; one single-row table per size.
memory_results <- lapply(c(100, 1000, 10000), function(nobvs) {
  nelements <- 2
  # Measurements are taken in the same order as the columns are listed;
  # the size columns are appended last.
  data.table(
    memRbindlist = pryr::mem_change(fnRbindlist(dataList(nobvs, nelements))),
    memBindRows = pryr::mem_change(fnBindRows(dataList(nobvs, nelements))),
    memPurr = pryr::mem_change(fnPurr(dataList(nobvs, nelements))),
    memSubsetRbind = pryr::mem_change(fnSubsetRbind(dataList(nobvs, nelements))),
    nobvs = nobvs,
    nelements = nelements
  )
})
结果
> rbindlist(l = speed_results, use.names = T, fill = T)[order(test)]
test replications elapsed rank nobvs nelements
1: fnBindRows(dataList(nobvs, nelements)) 1000 1.75 1 100 2
2: fnBindRows(dataList(nobvs, nelements)) 1000 2.23 1 1000 2
3: fnBindRows(dataList(nobvs, nelements)) 1000 6.95 1 10000 2
4: fnPurr(dataList(nobvs, nelements)) 1000 2.56 3 100 2
5: fnPurr(dataList(nobvs, nelements)) 1000 3.02 3 1000 2
6: fnPurr(dataList(nobvs, nelements)) 1000 8.89 4 10000 2
7: fnRbindlist(dataList(nobvs, nelements)) 1000 2.56 2 100 2
8: fnRbindlist(dataList(nobvs, nelements)) 1000 2.85 2 1000 2
9: fnRbindlist(dataList(nobvs, nelements)) 1000 8.17 2 10000 2
10: fnSubsetRbind(dataList(nobvs, nelements)) 1000 3.77 4 100 2
11: fnSubsetRbind(dataList(nobvs, nelements)) 1000 4.04 4 1000 2
12: fnSubsetRbind(dataList(nobvs, nelements)) 1000 8.77 3 10000 2
> rbindlist(l = memory_results, use.names = T, fill = T)
memRbindlist memBindRows memPurr memSubsetRbind nobvs nelements
1: -34944 69272 70104 220672 100 2
2: 51312 69272 70104 220672 1000 2
3: 51416 69272 70104 220672 10000 2
基于上述结果,我认为 rbindlist 是最好的选择:它的速度排名很稳定(在三种数据规模下都排第二,仅次于 bind_rows),而且内存使用量最少(这正是 OP 关心的主要问题)。
希望对您有所帮助!