【问题标题】:R convert tidy hierarchical data frame to hierarchical listR将整洁的分层数据框转换为分层列表
【发布时间】:2018-03-17 21:11:01
【问题描述】:

转换这个

g1    g2    desc    val
A     a     1       v1
A     a     2       v2
A     b     3       v3

收件人:

desc    val
A
a
1       v1
2       v2
b
3       v3

我已使用 for 循环将具有两个分组级别的分层数据框转换为结构化列表。这会在列表中显示带有关联变量的描述,并按顺序穿插在组级别中。

目的是将分层数据显示为一个列表,以便可以使用 openxlsx 以格式打印以区分不同的分组级别。

是否有更有效的基础 R、tidyverse 或其他方法来实现这一目标?

循环代码

tib <-  tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
          g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
          desc = 1:12,
          val = paste0("v", 1:12))

# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)

# create empty output tibble
output <- 
    as_tibble(matrix(nrow = n_rows, ncol = 2)) %>% 
    rename(desc = V1, val = V2) %>% 
    mutate(desc = NA_character_,
           val = NA_real_)

# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1

for(i in seq_len(nrow(tib))){

  # level 1 headings
  if(tib$g1[[i]] != level_1) {
    output$desc[[output_row]] <- tib$g1[[i]]
    output_row <- output_row + 1
    }

  # level 2 headings
  if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
    output$desc[[output_row]] <- tib$g2[[i]]
    output_row <- output_row + 1
  }

  level_1 <- tib$g1[[i]]
  level_2 <- tib$g2[[i]]

  # Description and data
  output$desc[[output_row]] <- tib$desc[[i]]
  output$val[[output_row]] <- tib$val[[i]]
  output_row <- output_row + 1

}

【问题讨论】:

    标签: r list hierarchical-data


    【解决方案1】:

    使用tidyverse 中的几个包,我们可以做到:

    library(tidyverse)
    
    # or explicitly load what you need
    library(purrr)
    library(dplyr)
    library(tidyr)
    library(stringr)
    
    transpose(df) %>% 
      unlist() %>% 
      stack() %>% 
      distinct(values, ind) %>% 
      mutate(detect_var = str_detect(values, "^v"),
             ind = lead(case_when(detect_var == TRUE ~ values)),
             values = case_when(detect_var == TRUE ~ NA_character_,
                                TRUE ~ values)) %>% 
      drop_na(values) %>% 
      select(values, ind) %>% 
      replace_na(list(ind = ""))
    

    返回:

      values ind
    1      A    
    2      a    
    3      1  v1
    5      2  v2
    7      b    
    8      3  v3
    

    使用tib数据集,我的解决方案似乎比Plamen的慢一点:

    Unit: milliseconds
           expr       min        lq      mean    median        uq        max neval
            old 17.658398 18.492957 21.292965 19.396304 21.770249 133.215223   100
     new_simple  6.742158  7.013732  7.638155  7.190095  7.759104  12.640237   100
       new_fast  4.064907  4.266243  4.837131  4.507865  4.871533   9.442904   100
      tidyverse  4.980664  5.326694  6.004602  5.552611  6.215129   9.923524   100
    

    【讨论】:

    • mutate(g2 = paste(g1, g2, sep = "_")) 添加到开头,将mutate(values = str_remove(values, "\\D_")) 添加到末尾,即可完成这项工作。实际数据包括许多与 desc 变量相关的不同类型的变量。我正在努力使这段代码适应实际数据,所以问了另一个问题。
    【解决方案2】:

    我相信您可以像这样简化和稍微优化您的代码:

    library(dplyr)
    library(tidyr)
    library(microbenchmark)
    
    microbenchmark(
      old = {
    
        tib <-  tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
                       g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
                       desc = 1:12,
                       val = paste0("v", 1:12))
    
        # Number of rows in final table
        n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
    
        # create empty output tibble
        output <- 
          as_tibble(matrix(nrow = n_rows, ncol = 2)) %>% 
          rename(desc = V1, val = V2) %>% 
          mutate(desc = NA_character_,
                 val = NA_real_)
    
        # loop counters
        level_1 <- 0
        level_2 <- 0
        output_row <- 1
    
        for(i in seq_len(nrow(tib))){
    
          # level 1 headings
          if(tib$g1[[i]] != level_1) {
            output$desc[[output_row]] <- tib$g1[[i]]
            output_row <- output_row + 1
          }
    
          # level 2 headings
          if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
            output$desc[[output_row]] <- tib$g2[[i]]
            output_row <- output_row + 1
          }
    
          level_1 <- tib$g1[[i]]
          level_2 <- tib$g2[[i]]
    
          # Description and data
          output$desc[[output_row]] <- tib$desc[[i]]
          output$val[[output_row]] <- tib$val[[i]]
          output_row <- output_row + 1
    
        }
    
      }
      ,
      new_simple = {
    
        tib <-  tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
                       g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
                       desc = 1:12,
                       val = paste0("v", 1:12)) %>%
          unite('g1g2', g1, g2, remove = F)
    
        tib_list <- split(tib, tib$g1g2)
    
        convert_group <- function(sub_df){
          tibble(
            desc = c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
            , val = c(NA, NA, sub_df$val)
          )
        }
    
        res_df <- bind_rows(lapply(tib_list, convert_group))
      }
      ,
      new_fast = {
    
        tib <-  tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
                       g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
                       desc = 1:12,
                       val = paste0("v", 1:12)) %>%
          unite('g1g2', g1, g2, remove = F)
    
        tib_list <- split(tib, tib$g1g2)
    
        convert_desc <- function(sub_df){ 
          c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
        }
    
        convert_val <- function(sub_df){ c(NA, NA, sub_df$val) }
    
        res_df <- tibble(
          desc = sapply(tib_list, convert_desc)
          , val = sapply(tib_list, convert_val)
        )
      }
    )
    

    这给了我以下输出:

    Unit: milliseconds
           expr      min       lq     mean   median       uq       max neval
            old 41.06535 43.52606 49.42744 47.29305 52.74399  76.98021   100
     new_simple 57.08038 60.65657 68.11021 63.38157 71.62398 112.24893   100
       new_fast 24.16624 26.30785 31.07178 28.38764 31.91647 148.06442   100
    

    【讨论】:

    • 这会产生 g1-1, g2-1, desc1, desc2, g1-1, g2-1, desc3... 多余的 1 级实例被突出显示。需要的输出是g1-1,g2-1,desc1,desc2,g2-1,desc3...
    猜你喜欢
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2015-04-14
    • 2010-12-16
    • 2020-09-23
    • 1970-01-01
    相关资源
    最近更新 更多