【问题标题】:Converting a deeply nested list to a dataframe将深度嵌套列表转换为数据框
【发布时间】:2020-09-15 05:16:27
【问题描述】:

我有一个深度嵌套的列表,我想将其转换为数据框。 结构如下:

ls <- list('10' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8))),
                       '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)),
                                     '0.2' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)))),
           '20' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8)),
                                    '0.2' = list(Gmax.val = rnorm(1),
                                                 G2.val = rnorm(1),
                                                 Gmax.vec = rnorm(8),
                                                 G2.vec = rnorm(8))),
                       '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)),
                                     '0.2' = list(Gmax.val = rnorm(1),
                                                  G2.val = rnorm(1),
                                                  Gmax.vec = rnorm(8),
                                                  G2.vec = rnorm(8)))))


> ls[['10']][['123']][['0.1']]
$Gmax.val
[1] -0.1982298

$G2.val
[1] -0.2761515

$Gmax.vec
[1] -0.4732736 -0.5714809 -0.1600405 -0.7138532  0.3503852 -0.7367241  0.3024992 -0.4931045

$G2.vec
[1] -0.2374231 -0.7927135 -0.9554769  0.8733201 -0.4126742  1.8689940  0.1576750 -0.2184344

每个子列表名称都是不同变量的值:在本例中,可能是:

ls[[]] = time; 10 or 20
ls[[]][[]] = seed; 123 or 456
ls[[]][[]][[]] = treatment; 0.1 or 0.2 

理想情况下,我希望将子列表的名称用作它们自己列中的值。我希望数据框看起来像这样:

#  time seed treatment  Gmax.val     G2.val    Gmax.vec     G2.vec
#1   10  123       0.1 0.1972457 -0.1224265  0.06121407  1.5102516
#2   10  123       0.1 0.1972457 -0.1224265 -2.53026477 -0.1320042
#3   10  123       0.1 0.1972457 -0.1224265  0.06648820 -0.2477285
#4   10  123       0.1 0.1972457 -0.1224265 -0.45594701 -0.8577670
#5   10  123       0.1 0.1972457 -0.1224265  0.90828911 -1.0710828
#6   10  123       0.1 0.1972457 -0.1224265  0.56427976  1.5086222

感谢您的帮助。

【问题讨论】:

  • 这里有什么问题?你有没有尝试过这种渴望的陈述?这是如何获得 8 票的?请参阅how to ask

标签: r dataframe nested-lists


【解决方案1】:

另一种方法是:

  1. 在 rrapply-package 中使用 rrapply() 将嵌套列表融合到 data.frame(或使用 reshape2::melt() 类似)。
  2. 使用 tidyr 的 pivot_wider()unnest() 将 data.frame 重塑为所需的格式。
library(rrapply)
library(tidyverse)

rrapply(ls, how = "melt") %>%                            ## melt to long df
  pivot_wider(names_from = "L4") %>%                     ## reshape to wide df
  unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%      ## unnest list columns
  rename(time = L1, seed = L2, treatment = L3)           ## rename columns

#> # A tibble: 64 x 7
#>    time  seed  treatment Gmax.val G2.val Gmax.vec  G2.vec
#>    <chr> <chr> <chr>        <dbl>  <dbl>    <dbl>   <dbl>
#>  1 10    123   0.1         -0.626  0.184   -0.836  1.51  
#>  2 10    123   0.1         -0.626  0.184    1.60   0.390 
#>  3 10    123   0.1         -0.626  0.184    0.330 -0.621 
#>  4 10    123   0.1         -0.626  0.184   -0.820 -2.21  
#>  5 10    123   0.1         -0.626  0.184    0.487  1.12  
#>  6 10    123   0.1         -0.626  0.184    0.738 -0.0449
#>  7 10    123   0.1         -0.626  0.184    0.576 -0.0162
#>  8 10    123   0.1         -0.626  0.184   -0.305  0.944 
#>  9 10    123   0.2          0.821  0.594    0.919 -0.478 
#> 10 10    123   0.2          0.821  0.594    0.782  0.418 
#> # … with 54 more rows

或者使用data.table的dcast()将长表重塑为宽格式:

library(data.table)

long_dt <- as.data.table(rrapply(ls, how = "melt"))
wide_dt <- dcast(long_dt, L1 + L2 + L3 ~ L4)
wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))

一些基准测试

microbenchmark::microbenchmark(
  tidyr = {
    rrapply(ls, how = "melt") %>%                            
      pivot_wider(names_from = "L4") %>%                     
      unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%      
      rename(time = L1, seed = L2, treatment = L3)
  },
  data.table = {
    wide_dt <- dcast(as.data.table(rrapply(ls, how = "melt")), L1 + L2 + L3 ~ L4)
    wide_dt <- wide_dt[, lapply(.SD, unlist), by = list(L1, L2, L3), .SDcols = c("Gmax.val", "G2.val", "Gmax.vec", "G2.vec")]
    setnames(wide_dt, old = c("L1", "L2", "L3"), new = c("time", "seed", "treatment"))
    wide_dt
  },
  times = 25
)
#> Unit: milliseconds
#>        expr       min        lq      mean    median        uq       max neval
#>       tidyr 17.959197 20.072647 23.662698 21.278771 25.633581 40.593022    25
#>  data.table  2.061861  2.655782  2.966581  2.784425  2.988044  5.032524    25

【讨论】:

  • rrapply 很棒,感谢您在这里展示。
【解决方案2】:

这是一个使用tidyr 中一些较新的"rectangling" 方法的解决方案。我发布这篇文章主要是为了获得和分享对这些功能的一些熟悉感——我的感觉是,这种方法肯定可以,嗯,整理一下。尽管如此,这是在宽/长列表拆包之间来回摆动的好方法。

library(tidyverse)
set.seed(1L)

tibble(time = names(data), data = data) %>%
  unnest_wider(data) %>%
  pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
  unnest_wider(treatment) %>%
  pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
  unnest_wider(g_data) %>%
  mutate(row_n = row_number()) %>%
  pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
  unnest_longer(g_val) %>%
  group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
  mutate(sub_n = row_number()) %>%
  pivot_wider(names_from = g, values_from = g_val) %>%
  ungroup() %>%
  select(-row_n, -sub_n) 

  # A tibble: 64 x 7
   time  seed  treatment Gmax.val G2.val Gmax.vec  G2.vec
   <chr> <chr> <chr>        <dbl>  <dbl>    <dbl>   <dbl>
 1 10    123   0.1         -0.626  0.184   -0.836  1.51  
 2 10    123   0.1         -0.626  0.184    1.60   0.390 
 3 10    123   0.1         -0.626  0.184    0.330 -0.621 
 4 10    123   0.1         -0.626  0.184   -0.820 -2.21  
 5 10    123   0.1         -0.626  0.184    0.487  1.12  
 6 10    123   0.1         -0.626  0.184    0.738 -0.0449
 7 10    123   0.1         -0.626  0.184    0.576 -0.0162
 8 10    123   0.1         -0.626  0.184   -0.305  0.944 
 9 10    123   0.2          0.821  0.594    0.919 -0.478 
10 10    123   0.2          0.821  0.594    0.782  0.418 
# … with 54 more rows

【讨论】:

    【解决方案3】:

    这是基本 R 解决方案,比其他答案快 5-94 倍。

    您可以使用这样的功能,即使您更改最内层列表的名称也可以使用:

    # turns a deep nested list to a data.frame.
    #
    # Args:
    #   x: list of nested lists. All needs to have identical setup and
    #      names.
    #   cnam: character vector with column names for the columns which are
    #         from the non-terminal lists.
    deep_nested_to_df <- function(x, cnam)
      .deep_nested_to_df(x, cnam)
    
    # do not call this function
    .deep_nested_to_df <- function(x, cnam, idx = 1L){
      # check if all elements are lists
      is_all_lists <- all(sapply(x, is.list))
    
      if(is_all_lists){
        # create data.frames out of elements
        out <- lapply(x, .deep_nested_to_df, cnam = cnam, idx = idx + 1L)
    
        # check that all column names match
        my_cnam <- colnames(out[[1L]])
        stopifnot(all(length(out[[1L]]) == sapply(out, length)),
                  all(sapply(out, function(x) all(colnames(x) == my_cnam))))
    
        # create the new colum
        new_col <- c(mapply(rep, x = names(x), times = sapply(out, NROW)))
    
        # combine to one data.frame
        out <- do.call(rbind, out)
    
        # add the new column
        out <- do.call(cbind, list(
          as.data.frame(new_col, stringsAsFactors = FALSE), out))
        colnames(out)[1L] <- cnam[idx]
        if(idx == 1L)
          rownames(out) <- 1:NROW(out)
        return(out)
      }
    
      as.data.frame(x, stringsAsFactors = FALSE)
    }
    
    # use the function
    res <- deep_nested_to_df(ls, c("time", "seed", "treatment"))
    head(res, 16)
    #R>    time seed treatment Gmax.val G2.val Gmax.vec G2.vec
    #R> 1    10  123       0.1    -0.63   0.18   -0.836  1.512
    #R> 2    10  123       0.1    -0.63   0.18    1.595  0.390
    #R> 3    10  123       0.1    -0.63   0.18    0.330 -0.621
    #R> 4    10  123       0.1    -0.63   0.18   -0.820 -2.215
    #R> 5    10  123       0.1    -0.63   0.18    0.487  1.125
    #R> 6    10  123       0.1    -0.63   0.18    0.738 -0.045
    #R> 7    10  123       0.1    -0.63   0.18    0.576 -0.016
    #R> 8    10  123       0.1    -0.63   0.18   -0.305  0.944
    #R> 9    10  123       0.2     0.82   0.59    0.919 -0.478
    #R> 10   10  123       0.2     0.82   0.59    0.782  0.418
    #R> 11   10  123       0.2     0.82   0.59    0.075  1.359
    #R> 12   10  123       0.2     0.82   0.59   -1.989 -0.103
    #R> 13   10  123       0.2     0.82   0.59    0.620  0.388
    #R> 14   10  123       0.2     0.82   0.59   -0.056 -0.054
    #R> 15   10  123       0.2     0.82   0.59   -0.156 -1.377
    #R> 16   10  123       0.2     0.82   0.59   -1.471 -0.415
    str(res)
    #R> 'data.frame':   64 obs. of  7 variables:
    #R>  $ time     : chr  "10" "10" "10" "10" ...
    #R>  $ seed     : chr  "123" "123" "123" "123" ...
    #R>  $ treatment: chr  "0.1" "0.1" "0.1" "0.1" ...
    #R>  $ Gmax.val : num  -0.626 -0.626 -0.626 -0.626 -0.626 ...
    #R>  $ G2.val   : num  0.184 0.184 0.184 0.184 0.184 ...
    #R>  $ Gmax.vec : num  -0.836 1.595 0.33 -0.82 0.487 ...
    #R>  $ G2.vec   : num  1.512 0.39 -0.621 -2.215 1.125 ...
    

    该功能可能会更快。我怀疑对as.data.framerbind 的调用会大大减慢函数的运行速度。尽管如此,它仍然有效。

    更快的版本

    更快的版本类似于:

    deep_nested_to_df_fast <- function(x, cnam)
      .deep_nested_to_df_fast(x, cnam)
    
    .deep_nested_to_df_fast <- function(x, cnam, idx = 1L){
      # check if all elements are list
      is_all_lists <- all(sapply(x, is.list))
    
      if(is_all_lists){
        # create data.frames out of elements
        out <- lapply(x, .deep_nested_to_df_fast, cnam = cnam, idx = idx + 1L)
    
        # check that all column names match
        my_cnam <- colnames(out[[1L]])
        stopifnot(all(length(out[[1L]]) == sapply(out, length)),
                  all(sapply(out, function(x) all(colnames(x) == my_cnam))))
    
        # create the new colum
        new_col <- mapply(
          rep, x = names(x), times = sapply(out, function(x) length(x[[1L]])),
          SIMPLIFY = FALSE)
        new_col <- do.call(c, new_col)
    
        # combine to a list instead of a data.frame
        out <- do.call(mapply, c(list(FUN = c, SIMPLIFY = FALSE), out))
    
        # add the new colum
        out <- c(list(new_col), out)
        names(out)[1L] <- cnam[idx]
    
        if(idx == 1L)
          # turn it into a data.frame
          out <- structure(
            lapply(out, unname), names = names(out),
            row.names = 1:length(out[[1L]]), class = "data.frame")
    
        return(out)
      }
    
      # create list of element with an equal number
      ele_length <- sapply(x, length)
    
      # check that all have either the maximum number of elements or one
      # element
      max_len <- max(ele_length)
      stopifnot(all(ele_length %in% c(1L, max_len)))
    
      # return list like data.frame
      lapply(x, rep, length.out = max_len)
    }
    

    在这种情况下,它给出了相同的结果并且速度更快:

    # gives the same
    res_fast <- deep_nested_to_df_fast(ls, c("time", "seed", "treatment"))
    all.equal(res, res_fast)
    #R> [1] TRUE
    
    # check computation time. We also compare with other answers
    library(rrapply)
    library(tidyverse)
    library(data.table)
    
    Joris_ans <- function()
      rrapply(ls, how = "melt") %>%
        pivot_wider(names_from = "L4") %>%
        unnest(c(Gmax.val, G2.val, Gmax.vec, G2.vec)) %>%
        rename(time = L1, seed = L2, treatment = L3)
    
    Andrew_ans <- function(data = ls)
      tibble(time = names(data), data = data) %>%
        unnest_wider(data) %>%
        pivot_longer(-time, names_to = "seed", values_to = "treatment") %>%
        unnest_wider(treatment) %>%
        pivot_longer(-c(time, seed), names_to = "treatment", values_to = "g_data") %>%
        unnest_wider(g_data) %>%
        mutate(row_n = row_number()) %>%
        pivot_longer(c(Gmax.vec, G2.vec), names_to = "g", values_to = "g_val") %>%
        unnest_longer(g_val) %>%
        group_by(row_n, time, seed, treatment, Gmax.val, G2.val, g) %>%
        mutate(sub_n = row_number()) %>%
        pivot_wider(names_from = g, values_from = g_val) %>%
        ungroup() %>%
        select(-row_n, -sub_n)
    
    hello_friend_ans <- function(){
      flat_long_df <- stack(data.frame(do.call("c", lapply(Map(function(x){
        do.call(c, x)}, ls), data.frame))))
      
      long_df <- cbind(within(flat_long_df, rm(ind)), 
                       do.call("rbind", lapply(strsplit(as.character(flat_long_df$ind), "\\."), function(x){
                         data.frame(cbind(time = as.numeric(gsub("X", "", x[1])),
                                          seed = as.numeric(gsub("X", "", x[2])),
                                          treatment = as.numeric(paste(x[3], x[4], sep = ".")),
                                          var = paste(x[5], x[6], sep = ".")))
                       }
                       )
                       )
      )
      
      wide_df <- setNames(reshape(long_df, 
                                  idvar= c("time", "seed", "treatment"), timevar="var", direction="wide"),
                          c("time", "seed", "treatment", "Gmax.val", "G2.val", "Gmax.vec", "G2.vec"))
      wide_df
    }
    
    det_ans <- function(){
      nested_list <- function(ls, list_names = NULL){
        
        if(all(map_chr(ls, class) %in% c("numeric", "character", "integer", "logical"))){
          
          dt <- as.data.table(ls)
          dt[, (str_c("NAME_", seq_along(list_names))) := as.list(list_names)]
          
          return(dt)
        }
        
        ls %>% imap(~nested_list(.x, c(list_names, .y)))
      }
      
      NestedList <- function(ls, new_names){
        
        dt <- nested_list(ls) %>%
          {do.call(c, unlist(., recursive = FALSE))} %>%
          rbindlist() 
        
        setnames(dt, str_subset(names(dt), "NAME_"), new_names)
        dt
      }
      
      NestedList(ls, c("time", "seed", "tretment"))
    }
    
    
    bench::mark(
      first = deep_nested_to_df     (ls, c("time", "seed", "treatment")),
      fast  = deep_nested_to_df_fast(ls, c("time", "seed", "treatment")),
      Joris_ans(), Andrew_ans(), hello_friend_ans(), det_ans(),
      min_time = 1, check = FALSE, relative = TRUE)
    #R> # A tibble: 6 x 13
    #R>   expression            min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory time  gc   
    #R>   <bch:expr>          <dbl>  <dbl>     <dbl>     <dbl>    <dbl> <int> <dbl>   <bch:tm> <list> <list> <lis> <lis>
    #R> 1 first                4.44   4.39     20.1       1.19     1      207     7    925.3ms <NULL> <Rpro… <bch… <tib…
    #R> 2 fast                 1      1        89.2       1        1.28   923     9    930.3ms <NULL> <Rpro… <bch… <tib…
    #R> 3 Joris_ans()          9.90   9.85      9.01      1.92     1.03    90     7    897.9ms <NULL> <Rpro… <bch… <tib…
    #R> 4 Andrew_ans()        60.2   59.1       1.62     10.9      1.51    11     7    611.5ms <NULL> <Rpro… <bch… <tib…
    #R> 5 hello_friend_ans() 103.    94.9       1       203.      14.7      1    10     89.9ms <NULL> <Rpro… <bch… <tib…
    #R> 6 det_ans()            5.87   5.73     15.4     106.       1.01   157     7      914ms <NULL> <Rpro… <bch… <tib…
    

    比第一个版本快 4 倍,比其他答案快 5-94 倍。

    我们可以通过使用预先分配我们将写入的最终向量的 C++/C 实现来加快速度。这样,我们就避免了上面重复的内存分配。

    数据

    set.seed(1L)
    ls <- list('10' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                     G2.val = rnorm(1),
                                                     Gmax.vec = rnorm(8),
                                                     G2.vec = rnorm(8)),
                                        '0.2' = list(Gmax.val = rnorm(1),
                                                     G2.val = rnorm(1),
                                                     Gmax.vec = rnorm(8),
                                                     G2.vec = rnorm(8))),
                           '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                      G2.val = rnorm(1),
                                                      Gmax.vec = rnorm(8),
                                                      G2.vec = rnorm(8)),
                                         '0.2' = list(Gmax.val = rnorm(1),
                                                      G2.val = rnorm(1),
                                                      Gmax.vec = rnorm(8),
                                                      G2.vec = rnorm(8)))),
               '20' = list('123' = list('0.1' = list(Gmax.val = rnorm(1),
                                                     G2.val = rnorm(1),
                                                     Gmax.vec = rnorm(8),
                                                     G2.vec = rnorm(8)),
                                        '0.2' = list(Gmax.val = rnorm(1),
                                                     G2.val = rnorm(1),
                                                     Gmax.vec = rnorm(8),
                                                     G2.vec = rnorm(8))),
                           '456' = list ('0.1' = list(Gmax.val = rnorm(1),
                                                      G2.val = rnorm(1),
                                                      Gmax.vec = rnorm(8),
                                                      G2.vec = rnorm(8)),
                                         '0.2' = list(Gmax.val = rnorm(1),
                                                      G2.val = rnorm(1),
                                                      Gmax.vec = rnorm(8),
                                                      G2.vec = rnorm(8)))))
    

    您可能想要使用除 ls 之外的另一个变量名,因为有一个 ls 函数。

    【讨论】:

      【解决方案4】:

      基础 R 解决方案:

      # Flatten the nested list: flat_long_df => data.frame
      flat_long_df <- stack(data.frame(do.call("c", lapply(Map(function(x){
        do.call(c, x)}, ls), data.frame))))
      
      # Derive the features: data.frame => console (stdout)
      long_df <- cbind(within(flat_long_df, rm(ind)), 
            do.call("rbind", 
              lapply(strsplit(as.character(flat_long_df$ind), "\\."), function(x){
            data.frame(cbind(time = as.numeric(gsub("X", "", x[1])),
            seed = as.numeric(gsub("X", "", x[2])),
            treatment = as.numeric(paste(x[3], x[4], sep = ".")),
            var = paste(x[5], x[6], sep = ".")))
            }
          )
        )
      )
      
      # Split data by the var vector: df_list => list of data.frames
      df_list <- split(long_df, long_df$var)
      
      # Reshape from long to wide: wide_df => data.frame 
      wide_df <- do.call("cbind", lapply(seq_along(df_list), function(i){
                setNames(within(df_list[[i]], rm(var)), 
                   c(names(df_list)[i], "time", "seed", "treatment"))
          }
        )
      )
      

      【讨论】:

        【解决方案5】:

        使用递归:

        nested_list <- function(ls, list_names = NULL){
          
          if(all(map_chr(ls, class) %in% c("numeric", "character", "integer", "logical"))){
              
            dt <- as.data.table(ls)
            dt[, (str_c("NAME_", seq_along(list_names))) := as.list(list_names)]
              
            return(dt)
          }
        
          ls %>% imap(~nested_list(.x, c(list_names, .y)))
        }
        
        NestedList <- function(ls, new_names){
          
          dt <- nested_list(ls) %>%
            {do.call(c, unlist(., recursive = FALSE))} %>%
            rbindlist() 
          
          setnames(dt, str_subset(names(dt), "NAME_"), new_names)
          dt
        }
        
        NestedList(ls, c("time", "seed", "tretment"))
        

        【讨论】:

          猜你喜欢
          • 1970-01-01
          • 1970-01-01
          • 2020-06-22
          • 2021-06-17
          • 1970-01-01
          • 2013-02-18
          • 1970-01-01
          • 1970-01-01
          相关资源
          最近更新 更多