这是基本 R 解决方案,比其他答案快 5-94 倍。
您可以使用这样的功能,即使您更改最内层列表的名称也可以使用:
# turns a deep nested list to a data.frame.
#
# Args:
# x: list of nested lists. All needs to have identical setup and
# names.
# cnam: character vector with column names for the columns which are
# from the non-terminal lists.
deep_nested_to_df <- function(x, cnam)
.deep_nested_to_df(x, cnam)
# do not call this function
.deep_nested_to_df <- function(x, cnam, idx = 1L){
# check if all elements are lists
is_all_lists <- all(sapply(x, is.list))
if(is_all_lists){
# create data.frames out of elements
out <- lapply(x, .deep_nested_to_df, cnam = cnam, idx = idx + 1L)
# check that all column names match
my_cnam <- colnames(out[[1L]])
stopifnot(all(length(out[[1L]]) == sapply(out, length)),
all(sapply(out, function(x) all(colnames(x) == my_cnam))))
# create the new colum
new_col <- c(mapply(rep, x = names(x), times = sapply(out, NROW)))
# combine to one data.frame
out <- do.call(rbind, out)
# add the new column
out <- do.call(cbind, list(
as.data.frame(new_col, stringsAsFactors = FALSE), out))
colnames(out)[1L] <- cnam[idx]
if(idx == 1L)
rownames(out) <- 1:NROW(out)
return(out)
}
as.data.frame(x, stringsAsFactors = FALSE)
}
# use the function
res <- deep_nested_to_df(ls, c("time", "seed", "treatment"))
head(res, 16)
#R> time seed treatment Gmax.val G2.val Gmax.vec G2.vec
#R> 1 10 123 0.1 -0.63 0.18 -0.836 1.512
#R> 2 10 123 0.1 -0.63 0.18 1.595 0.390
#R> 3 10 123 0.1 -0.63 0.18 0.330 -0.621
#R> 4 10 123 0.1 -0.63 0.18 -0.820 -2.215
#R> 5 10 123 0.1 -0.63 0.18 0.487 1.125
#R> 6 10 123 0.1 -0.63 0.18 0.738 -0.045
#R> 7 10 123 0.1 -0.63 0.18 0.576 -0.016
#R> 8 10 123 0.1 -0.63 0.18 -0.305 0.944
#R> 9 10 123 0.2 0.82 0.59 0.919 -0.478
#R> 10 10 123 0.2 0.82 0.59 0.782 0.418
#R> 11 10 123 0.2 0.82 0.59 0.075 1.359
#R> 12 10 123 0.2 0.82 0.59 -1.989 -0.103
#R> 13 10 123 0.2 0.82 0.59 0.620 0.388
#R> 14 10 123 0.2 0.82 0.59 -0.056 -0.054
#R> 15 10 123 0.2 0.82 0.59 -0.156 -1.377
#R> 16 10 123 0.2 0.82 0.59 -1.471 -0.415
str(res)
#R> 'data.frame': 64 obs. of 7 variables:
#R> $ time : chr "10" "10" "10" "10" ...
#R> $ seed : chr "123" "123" "123" "123" ...
#R> $ treatment: chr "0.1" "0.1" "0.1" "0.1" ...
#R> $ Gmax.val : num -0.626 -0.626 -0.626 -0.626 -0.626 ...
#R> $ G2.val : num 0.184 0.184 0.184 0.184 0.184 ...
#R> $ Gmax.vec : num -0.836 1.595 0.33 -0.82 0.487 ...
#R> $ G2.vec : num 1.512 0.39 -0.621 -2.215 1.125 ...
该功能可能会更快。我怀疑对as.data.frame 和rbind 的调用会大大减慢函数的运行速度。尽管如此,它仍然有效。
更快的版本
更快的版本类似于:
deep_nested_to_df_fast <- function(x, cnam)
.deep_nested_to_df_fast(x, cnam)
.deep_nested_to_df_fast <- function(x, cnam, idx = 1L){
# check if all elements are list
is_all_lists <- all(sapply(x, is.list))
if(is_all_lists){
# create data.frames out of elements
out <- lapply(x, .deep_nested_to_df_fast, cnam = cnam, idx = idx + 1L)
# check that all column names match
my_cnam <- colnames(out[[1L]])
stopifnot(all(length(out[[1L]]) == sapply(out, length)),
all(sapply(out, function(x) all(colnames(x) == my_cnam))))
# create the new colum
new_col <- mapply(
rep, x = names(x), times = sapply(out, function(x) length(x[[1L]])),
SIMPLIFY = FALSE)
new_col <- do.call(c, new_col)
# combine to a list instead of a data.frame
out <- do.call(mapply, c(list(FUN = c, SIMPLIFY = FALSE), out))
# add the new colum
out <- c(list(new_col), out)
names(out)[1L] <- cnam[idx]
if(idx == 1L)
# turn it into a data.frame
out <- structure(
lapply(out, unname), names = names(out),
row.names = 1:length(out[[1L]]), class = "data.frame")
return(out)
}
# create list of element with an equal number
ele_length <- sapply(x, length)
# check that all have either the maximum number of elements or one
# element
max_len <- max(ele_length)
stopifnot(all(ele_length %in% c(1L, max_len)))
# return list like data.frame
lapply(x, rep, length.out = max_len)
}
在这种情况下,它给出了相同的结果并且速度更快:
# gives the same
res_fast <- deep_nested_to_df_fast(ls, c("time", "seed", "treatment"))
all.equal(res, res_fast)
#R> [1] TRUE
# check computation time. We also compare with other answers
library(rrapply)
library(tidyverse)
library(data.table)
Joris_ans <- function()
rrapply(ls, how = "melt") %>%
pivot_wider(names_from = "L4") %>%
unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%
rename(time = L1, seed = L2, treatment = L3)
Andrew_ans <- function(data = ls)
tibble(time = names(data), data = data) %>%
unnest_wider(data) %>%
pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
unnest_wider(treatment) %>%
pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
unnest_wider(g_data) %>%
mutate(row_n = row_number()) %>%
pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
unnest_longer(g_val) %>%
group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
mutate(sub_n = row_number()) %>%
pivot_wider(names_from = g, values_from = g_val) %>%
ungroup() %>%
select(-row_n, -sub_n)
hello_friend_ans <- function(){
flat_long_df <- stack(data.frame(do.call("c", lapply(Map(function(x){
do.call(c, x)}, ls), data.frame))))
long_df <- cbind(within(flat_long_df, rm(ind)),
do.call("rbind", lapply(strsplit(as.character(flat_long_df$ind), "\\."), function(x){
data.frame(cbind(time = as.numeric(gsub("X", "", x[1])),
seed = as.numeric(gsub("X", "", x[2])),
treatment = as.numeric(paste(x[3], x[4], sep = ".")),
var = paste(x[5], x[6], sep = ".")))
}
)
)
)
wide_df <- setNames(reshape(long_df,
idvar= c("time", "seed", "treatment"), timevar="var", direction="wide"),
c("time", "seed", "treatment", "Gmax.val", "G2.val", "Gmax.vec", "G2.vec"))
wide_df
}
det_ans <- function(){
nested_list <- function(ls, list_names = NULL){
if(all(map_chr(ls, class) %in% c("numeric", "character", "integer", "logical"))){
dt <- as.data.table(ls)
dt[, (str_c("NAME_", seq_along(list_names))) := as.list(list_names)]
return(dt)
}
ls %>% imap(~nested_list(.x, c(list_names, .y)))
}
NestedList <- function(ls, new_names){
dt <- nested_list(ls) %>%
{do.call(c, unlist(., recursive = FALSE))} %>%
rbindlist()
setnames(dt, str_subset(names(dt), "NAME_"), new_names)
dt
}
NestedList(ls, c("time", "seed", "tretment"))
}
bench::mark(
first = deep_nested_to_df (ls, c("time", "seed", "treatment")),
fast = deep_nested_to_df_fast(ls, c("time", "seed", "treatment")),
Joris_ans(), Andrew_ans(), hello_friend_ans(), det_ans(),
min_time = 1, check = FALSE, relative = TRUE)
#R> # A tibble: 6 x 13
#R> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
#R> <bch:expr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis> <lis>
#R> 1 first 4.44 4.39 20.1 1.19 1 207 7 925.3ms <NULL> <Rpro… <bch… <tib…
#R> 2 fast 1 1 89.2 1 1.28 923 9 930.3ms <NULL> <Rpro… <bch… <tib…
#R> 3 Joris_ans() 9.90 9.85 9.01 1.92 1.03 90 7 897.9ms <NULL> <Rpro… <bch… <tib…
#R> 4 Andrew_ans() 60.2 59.1 1.62 10.9 1.51 11 7 611.5ms <NULL> <Rpro… <bch… <tib…
#R> 5 hello_friend_ans() 103. 94.9 1 203. 14.7 1 10 89.9ms <NULL> <Rpro… <bch… <tib…
#R> 6 det_ans() 5.87 5.73 15.4 106. 1.01 157 7 914ms <NULL> <Rpro… <bch… <tib…
比第一个版本快 4 倍,比其他答案快 5-94 倍。
我们可以通过使用预先分配我们将写入的最终向量的 C++/C 实现来加快速度。这样,我们就避免了上面重复的内存分配。
数据
set.seed(1L)
ls <- list('10' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8)),
'0.2' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8))),
'456' = list ('0.1' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8)),
'0.2' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8)))),
'20' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8)),
'0.2' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8))),
'456' = list ('0.1' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8)),
'0.2' = list(Gmax.val = rnorm(1),
G2.val = rnorm(1),
Gmax.vec = rnorm(8),
G2.vec = rnorm(8)))))
您可能想要使用除 ls 之外的另一个变量名,因为有一个 ls 函数。