将深度嵌套列表转换为数据框答案

【问题标题】：Converting a deeply nested list to a dataframe将深度嵌套列表转换为数据框
【发布时间】：2020-09-15 05:16:27
【问题描述】：

我有一个深度嵌套的列表，我想将其转换为数据框。结构如下：

ls <- list('10' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8))),
                       '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)),
                                     '0.2' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)))),
           '20' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8))),
                       '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)),
                                     '0.2' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)))))


> ls[['10']][['123']][['0.1']]
$Gmax.val
[1] -0.1982298

$G2.val
[1] -0.2761515

$Gmax.vec
[1] -0.4732736 -0.5714809 -0.1600405 -0.7138532  0.3503852 -0.7367241  0.3024992 -0.4931045

$G2.vec
[1] -0.2374231 -0.7927135 -0.9554769  0.8733201 -0.4126742  1.8689940  0.1576750 -0.2184344

每个子列表名称都是不同变量的值：在本例中，可能是：

ls[[]] = time; 10 or 20
ls[[]][[]] = seed; 123 or 456
ls[[]][[]][[]] = treatment; 0.1 or 0.2

理想情况下，我希望将子列表的名称用作它们自己列中的值。我希望数据框看起来像这样：

#  time seed treatment  Gmax.val     G2.val    Gmax.vec     G2.vec
#1   10  123       0.1 0.1972457 -0.1224265  0.06121407  1.5102516
#2   10  123       0.1 0.1972457 -0.1224265 -2.53026477 -0.1320042
#3   10  123       0.1 0.1972457 -0.1224265  0.06648820 -0.2477285
#4   10  123       0.1 0.1972457 -0.1224265 -0.45594701 -0.8577670
#5   10  123       0.1 0.1972457 -0.1224265  0.90828911 -1.0710828
#6   10  123       0.1 0.1972457 -0.1224265  0.56427976  1.5086222

感谢您的帮助。

【问题讨论】：

这里有什么问题？你有没有尝试过这种渴望的陈述？这是如何获得 8 票的？请参阅how to ask。

标签： r dataframe nested-lists

【解决方案1】：

另一种方法是：

在 rrapply-package 中使用 rrapply() 将嵌套列表融合到 data.frame（或使用 reshape2::melt() 类似）。
使用 tidyr 的 pivot_wider() 和 unnest() 将 data.frame 重塑为所需的格式。

library(rrapply)
library(tidyverse)

rrapply(ls, how = "melt") %>%                            ## melt to long df
  pivot_wider(names_from = "L4") %>%                     ## reshape to wide df
  unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%      ## unnest list columns
  rename(time = L1, seed = L2, treatment = L3)           ## rename columns

#> # A tibble: 64 x 7
#>    time  seed  treatment Gmax.val G2.val Gmax.vec  G2.vec
#>    <chr> <chr> <chr>        <dbl>  <dbl>    <dbl>   <dbl>
#>  1 10    123   0.1         -0.626  0.184   -0.836  1.51  
#>  2 10    123   0.1         -0.626  0.184    1.60   0.390 
#>  3 10    123   0.1         -0.626  0.184    0.330 -0.621 
#>  4 10    123   0.1         -0.626  0.184   -0.820 -2.21  
#>  5 10    123   0.1         -0.626  0.184    0.487  1.12  
#>  6 10    123   0.1         -0.626  0.184    0.738 -0.0449
#>  7 10    123   0.1         -0.626  0.184    0.576 -0.0162
#>  8 10    123   0.1         -0.626  0.184   -0.305  0.944 
#>  9 10    123   0.2          0.821  0.594    0.919 -0.478 
#> 10 10    123   0.2          0.821  0.594    0.782  0.418 
#> # … with 54 more rows

或者使用data.table的dcast()将长表重塑为宽格式：

library(data.table)

long_dt <- as.data.table(rrapply(ls, how = "melt"))
wide_dt <- dcast(long_dt, L1 + L2 + L3 ~ L4)
wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))

一些基准测试

microbenchmark::microbenchmark(
  tidyr = {
    rrapply(ls, how = "melt") %>%                            
      pivot_wider(names_from = "L4") %>%                     
      unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%      
      rename(time = L1, seed = L2, treatment = L3)
  },
  data.table = {
    wide_dt <- dcast(as.data.table(rrapply(ls, how = "melt")), L1 + L2 + L3 ~ L4)
    wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
    setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
    wide_dt
  },
  times = 25
)
#> Unit: milliseconds
#>        expr       min        lq      mean    median        uq       max neval
#>       tidyr 17.959197 20.072647 23.662698 21.278771 25.633581 40.593022    25
#>  data.table  2.061861  2.655782  2.966581  2.784425  2.988044  5.032524    25

【讨论】：

rrapply 很棒，感谢您在这里展示。

【解决方案2】：

这是一个使用tidyr 中一些较新的"rectangling" 方法的解决方案。我发布这篇文章主要是为了获得和分享对这些功能的一些熟悉感——我的感觉是，这种方法肯定可以，嗯，整理一下。尽管如此，这是在宽/长列表拆包之间来回摆动的好方法。

library(tidyverse)
set.seed(1L)

tibble(time = names(data), data = data) %>%
  unnest_wider(data) %>%
  pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
  unnest_wider(treatment) %>%
  pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
  unnest_wider(g_data) %>%
  mutate(row_n = row_number()) %>%
  pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
  unnest_longer(g_val) %>%
  group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
  mutate(sub_n = row_number()) %>%
  pivot_wider(names_from = g, values_from = g_val) %>%
  ungroup() %>%
  select(-row_n, -sub_n) 

  # A tibble: 64 x 7
   time  seed  treatment Gmax.val G2.val Gmax.vec  G2.vec
   <chr> <chr> <chr>        <dbl>  <dbl>    <dbl>   <dbl>
 1 10    123   0.1         -0.626  0.184   -0.836  1.51  
 2 10    123   0.1         -0.626  0.184    1.60   0.390 
 3 10    123   0.1         -0.626  0.184    0.330 -0.621 
 4 10    123   0.1         -0.626  0.184   -0.820 -2.21  
 5 10    123   0.1         -0.626  0.184    0.487  1.12  
 6 10    123   0.1         -0.626  0.184    0.738 -0.0449
 7 10    123   0.1         -0.626  0.184    0.576 -0.0162
 8 10    123   0.1         -0.626  0.184   -0.305  0.944 
 9 10    123   0.2          0.821  0.594    0.919 -0.478 
10 10    123   0.2          0.821  0.594    0.782  0.418 
# … with 54 more rows

【讨论】：

【解决方案3】：

这是基本 R 解决方案，比其他答案快 5-94 倍。

您可以使用这样的功能，即使您更改最内层列表的名称也可以使用：

# turns a deep nested list to a data.frame.
#
# Args:
#   x: list of nested lists. All needs to have identical setup and
#      names.
#   cnam: character vector with column names for the columns which are
#         from the non-terminal lists.
deep_nested_to_df <- function(x, cnam)
  .deep_nested_to_df(x, cnam)

# do not call this function
.deep_nested_to_df <- function(x, cnam, idx = 1L){
  # check if all elements are lists
  is_all_lists <- all(sapply(x, is.list))

  if(is_all_lists){
    # create data.frames out of elements
    out <- lapply(x, .deep_nested_to_df, cnam = cnam, idx = idx + 1L)

    # check that all column names match
    my_cnam <- colnames(out[[1L]])
    stopifnot(all(length(out[[1L]]) == sapply(out, length)),
              all(sapply(out, function(x) all(colnames(x) == my_cnam))))

    # create the new colum
    new_col <- c(mapply(rep, x = names(x), times = sapply(out, NROW)))

    # combine to one data.frame
    out <- do.call(rbind, out)

    # add the new column
    out <- do.call(cbind, list(
      as.data.frame(new_col, stringsAsFactors = FALSE), out))
    colnames(out)[1L] <- cnam[idx]
    if(idx == 1L)
      rownames(out) <- 1:NROW(out)
    return(out)
  }

  as.data.frame(x, stringsAsFactors = FALSE)
}

# use the function
res <- deep_nested_to_df(ls, c("time", "seed", "treatment"))
head(res, 16)
#R>    time seed treatment Gmax.val G2.val Gmax.vec G2.vec
#R> 1    10  123       0.1    -0.63   0.18   -0.836  1.512
#R> 2    10  123       0.1    -0.63   0.18    1.595  0.390
#R> 3    10  123       0.1    -0.63   0.18    0.330 -0.621
#R> 4    10  123       0.1    -0.63   0.18   -0.820 -2.215
#R> 5    10  123       0.1    -0.63   0.18    0.487  1.125
#R> 6    10  123       0.1    -0.63   0.18    0.738 -0.045
#R> 7    10  123       0.1    -0.63   0.18    0.576 -0.016
#R> 8    10  123       0.1    -0.63   0.18   -0.305  0.944
#R> 9    10  123       0.2     0.82   0.59    0.919 -0.478
#R> 10   10  123       0.2     0.82   0.59    0.782  0.418
#R> 11   10  123       0.2     0.82   0.59    0.075  1.359
#R> 12   10  123       0.2     0.82   0.59   -1.989 -0.103
#R> 13   10  123       0.2     0.82   0.59    0.620  0.388
#R> 14   10  123       0.2     0.82   0.59   -0.056 -0.054
#R> 15   10  123       0.2     0.82   0.59   -0.156 -1.377
#R> 16   10  123       0.2     0.82   0.59   -1.471 -0.415
str(res)
#R> 'data.frame':   64 obs. of  7 variables:
#R>  $ time     : chr  "10" "10" "10" "10" ...
#R>  $ seed     : chr  "123" "123" "123" "123" ...
#R>  $ treatment: chr  "0.1" "0.1" "0.1" "0.1" ...
#R>  $ Gmax.val : num  -0.626 -0.626 -0.626 -0.626 -0.626 ...
#R>  $ G2.val   : num  0.184 0.184 0.184 0.184 0.184 ...
#R>  $ Gmax.vec : num  -0.836 1.595 0.33 -0.82 0.487 ...
#R>  $ G2.vec   : num  1.512 0.39 -0.621 -2.215 1.125 ...

该功能可能会更快。我怀疑对as.data.frame 和rbind 的调用会大大减慢函数的运行速度。尽管如此，它仍然有效。

更快的版本

更快的版本类似于：

deep_nested_to_df_fast <- function(x, cnam)
  .deep_nested_to_df_fast(x, cnam)

.deep_nested_to_df_fast <- function(x, cnam, idx = 1L){
  # check if all elements are list
  is_all_lists <- all(sapply(x, is.list))

  if(is_all_lists){
    # create data.frames out of elements
    out <- lapply(x, .deep_nested_to_df_fast, cnam = cnam, idx = idx + 1L)

    # check that all column names match
    my_cnam <- colnames(out[[1L]])
    stopifnot(all(length(out[[1L]]) == sapply(out, length)),
              all(sapply(out, function(x) all(colnames(x) == my_cnam))))

    # create the new colum
    new_col <- mapply(
      rep, x = names(x), times = sapply(out, function(x) length(x[[1L]])),
      SIMPLIFY = FALSE)
    new_col <- do.call(c, new_col)

    # combine to a list instead of a data.frame
    out <- do.call(mapply, c(list(FUN = c, SIMPLIFY = FALSE), out))

    # add the new colum
    out <- c(list(new_col), out)
    names(out)[1L] <- cnam[idx]

    if(idx == 1L)
      # turn it into a data.frame
      out <- structure(
        lapply(out, unname), names = names(out),
        row.names = 1:length(out[[1L]]), class = "data.frame")

    return(out)
  }

  # create list of element with an equal number
  ele_length <- sapply(x, length)

  # check that all have either the maximum number of elements or one
  # element
  max_len <- max(ele_length)
  stopifnot(all(ele_length %in% c(1L, max_len)))

  # return list like data.frame
  lapply(x, rep, length.out = max_len)
}

在这种情况下，它给出了相同的结果并且速度更快：

# gives the same
res_fast <- deep_nested_to_df_fast(ls, c("time", "seed", "treatment"))
all.equal(res, res_fast)
#R> [1] TRUE

# check computation time. We also compare with other answers
library(rrapply)
library(tidyverse)
library(data.table)

Joris_ans <- function()
  rrapply(ls, how = "melt") %>%
    pivot_wider(names_from = "L4") %>%
    unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%
    rename(time = L1, seed = L2, treatment = L3)

Andrew_ans <- function(data = ls)
  tibble(time = names(data), data = data) %>%
    unnest_wider(data) %>%
    pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
    unnest_wider(treatment) %>%
    pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
    unnest_wider(g_data) %>%
    mutate(row_n = row_number()) %>%
    pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
    unnest_longer(g_val) %>%
    group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
    mutate(sub_n = row_number()) %>%
    pivot_wider(names_from = g, values_from = g_val) %>%
    ungroup() %>%
    select(-row_n, -sub_n)

hello_friend_ans <- function(){
  flat_long_df <- stack(data.frame(do.call("c", lapply(Map(function(x){
    do.call(c, x)}, ls), data.frame))))
  
  long_df <- cbind(within(flat_long_df, rm(ind)), 
                   do.call("rbind", lapply(strsplit(as.character(flat_long_df$ind), "\\."), function(x){
                     data.frame(cbind(time = as.numeric(gsub("X", "", x[1])),
                                      seed = as.numeric(gsub("X", "", x[2])),
                                      treatment = as.numeric(paste(x[3], x[4], sep = ".")),
                                      var = paste(x[5], x[6], sep = ".")))
                   }
                   )
                   )
  )
  
  wide_df <- setNames(reshape(long_df, 
                              idvar= c("time", "seed", "treatment"), timevar="var", direction="wide"),
                      c("time", "seed", "treatment", "Gmax.val", "G2.val", "Gmax.vec", "G2.vec"))
  wide_df
}

det_ans <- function(){
  nested_list <- function(ls, list_names = NULL){
    
    if(all(map_chr(ls, class) %in% c("numeric", "character", "integer", "logical"))){
      
      dt <- as.data.table(ls)
      dt[, (str_c("NAME_", seq_along(list_names))) := as.list(list_names)]
      
      return(dt)
    }
    
    ls %>% imap(~nested_list(.x, c(list_names, .y)))
  }
  
  NestedList <- function(ls, new_names){
    
    dt <- nested_list(ls) %>%
      {do.call(c, unlist(., recursive = FALSE))} %>%
      rbindlist() 
    
    setnames(dt, str_subset(names(dt), "NAME_"), new_names)
    dt
  }
  
  NestedList(ls, c("time", "seed", "tretment"))
}


bench::mark(
  first = deep_nested_to_df     (ls, c("time", "seed", "treatment")),
  fast  = deep_nested_to_df_fast(ls, c("time", "seed", "treatment")),
  Joris_ans(), Andrew_ans(), hello_friend_ans(), det_ans(),
  min_time = 1, check = FALSE, relative = TRUE)
#R> # A tibble: 6 x 13
#R>   expression            min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory time  gc   
#R>   <bch:expr>          <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list> <list> <lis> <lis>
#R> 1 first                4.44   4.39     20.1       1.19     1      207     7    925.3ms <NULL> <Rpro… <bch… <tib…
#R> 2 fast                 1      1        89.2       1        1.28   923     9    930.3ms <NULL> <Rpro… <bch… <tib…
#R> 3 Joris_ans()          9.90   9.85      9.01      1.92     1.03    90     7    897.9ms <NULL> <Rpro… <bch… <tib…
#R> 4 Andrew_ans()        60.2   59.1       1.62     10.9      1.51    11     7    611.5ms <NULL> <Rpro… <bch… <tib…
#R> 5 hello_friend_ans() 103.    94.9       1       203.      14.7      1    10     89.9ms <NULL> <Rpro… <bch… <tib…
#R> 6 det_ans()            5.87   5.73     15.4     106.       1.01   157     7      914ms <NULL> <Rpro… <bch… <tib…

比第一个版本快 4 倍，比其他答案快 5-94 倍。

我们可以通过使用预先分配我们将写入的最终向量的 C++/C 实现来加快速度。这样，我们就避免了上面重复的内存分配。

数据

set.seed(1L)
ls <- list('10' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8))),
                       '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)),
                                     '0.2' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)))),
           '20' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8))),
                       '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)),
                                     '0.2' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)))))

您可能想要使用除 ls 之外的另一个变量名，因为有一个 ls 函数。

【讨论】：

【解决方案4】：

基础 R 解决方案：

# Flatten the nested list: flat_long_df => data.frame
flat_long_df <- stack(data.frame(do.call("c", lapply(Map(function(x){
  do.call(c, x)}, ls), data.frame))))

# Derive the features: data.frame => console (stdout)
long_df <- cbind(within(flat_long_df, rm(ind)), 
      do.call("rbind", 
        lapply(strsplit(as.character(flat_long_df$ind), "\\."), function(x){
      data.frame(cbind(time = as.numeric(gsub("X", "", x[1])),
      seed = as.numeric(gsub("X", "", x[2])),
      treatment = as.numeric(paste(x[3], x[4], sep = ".")),
      var = paste(x[5], x[6], sep = ".")))
      }
    )
  )
)

# Split data by the var vector: df_list => list of data.frames
df_list <- split(long_df, long_df$var)

# Reshape from long to wide: wide_df => data.frame 
wide_df <- do.call("cbind", lapply(seq_along(df_list), function(i){
          setNames(within(df_list[[i]], rm(var)), 
             c(names(df_list)[i], "time", "seed", "treatment"))
    }
  )
)

【讨论】：

【解决方案5】：

使用递归：

nested_list <- function(ls, list_names = NULL){
  
  if(all(map_chr(ls, class) %in% c("numeric", "character", "integer", "logical"))){
      
    dt <- as.data.table(ls)
    dt[, (str_c("NAME_", seq_along(list_names))) := as.list(list_names)]
      
    return(dt)
  }

  ls %>% imap(~nested_list(.x, c(list_names, .y)))
}

NestedList <- function(ls, new_names){
  
  dt <- nested_list(ls) %>%
    {do.call(c, unlist(., recursive = FALSE))} %>%
    rbindlist() 
  
  setnames(dt, str_subset(names(dt), "NAME_"), new_names)
  dt
}

NestedList(ls, c("time", "seed", "tretment"))

【讨论】：