【问题标题】:uniting multiple columns that contain empty NA values合并包含空 NA 值的多个列
【发布时间】:2019-03-11 01:16:01
【问题描述】:

我有一个数据集,其中创建了多个列,但数据相同(boxID)。我想合并列,以便我只有 boxID(一个字母数字代码:两个字母的状态缩写和 2 个数字)而不是 NA 值,这就是现在我使用 unite() 函数时发生的情况dplyr。是否有类似的功能可以做到这一点,还是我需要根据与 stringr 的模式匹配来提取 boxID?

dat <- structure(list(boxId = c("CA04", "CA04", "CA01", "CA02", "CA04", 
"CA02", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), boxId__1 = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "NM01", "NM14", "NM15", 
"NM16", "NM17", "NM18", "NM19", "NM20", "NM02", "NM03", "NM04", 
"NM05", "NM06", "NM07", "NM08", "NM09", "NM10", "NM11", "NM12", 
"NM13"), boxId__2 = c(NA, NA, NA, NA, NA, NA, "FL01", "FL02", 
"FL03", "FL09", "FL08", "FL07", "FL04", "FL05", "FL06", "FL10", 
"FL11", "FL13", "FL12", "FL20", "FL19", "FL18", "FL17", "FL16", 
"FL14", "FL15", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), boxID = c(NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), boxID__1 = c(NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
    boxID__2 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), boxID__3 = c(NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, "IN05", NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
    ), boxID__4 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), boxID__5 = c(NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
    boxID__6 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), boxID__7 = c(NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
    boxID__8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA), boxID__9 = c(NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, "WA11", NA, NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
    )), row.names = c(NA, -48L), class = c("tbl_df", "tbl", "data.frame"
))

数据如下所示:

# A tibble: 48 x 13
   boxId boxId__1 boxId__2 boxID boxID__1 boxID__2 boxID__3 boxID__4 boxID__5 boxID__6
   <chr> <chr>    <chr>    <lgl> <lgl>    <lgl>    <chr>    <lgl>    <lgl>    <lgl>   
 1 CA04  NA       NA       NA    NA       NA       NA       NA       NA       NA      
 2 CA04  NA       NA       NA    NA       NA       NA       NA       NA       NA      
 3 CA01  NA       NA       NA    NA       NA       NA       NA       NA       NA      
 4 CA02  NA       NA       NA    NA       NA       NA       NA       NA       NA      
 5 CA04  NA       NA       NA    NA       NA       NA       NA       NA       NA      
 6 CA02  NA       NA       NA    NA       NA       NA       NA       NA       NA      
 7 NA    NA       FL01     NA    NA       NA       NA       NA       NA       NA      
 8 NA    NA       FL02     NA    NA       NA       NA       NA       NA       NA      
 9 NA    NA       FL03     NA    NA       NA       NA       NA       NA       NA      
10 NA    NA       FL09     NA    NA       NA       NA       NA       NA       NA      
# … with 38 more rows, and 3 more variables: boxID__7 <lgl>, boxID__8 <lgl>, boxID__9 <chr>

当我使用 unite() 时,它看起来像这样:

dat %>%
  unite('newID')

我坚持使用这些 NA 值:

# A tibble: 48 x 1
   newID                                   
   <chr>                                   
 1 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
 2 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
 3 CA01_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
 4 CA02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
 5 CA04_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
 6 CA02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
 7 NA_NA_FL01_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA
 8 NA_NA_FL02_NA_NA_NA_NA_NA_NA_NA_NA_NA_NA

【问题讨论】:

    标签: r dplyr stringr


    【解决方案1】:

    你知道每一行只有一个非 NA 吗?如果是,那么您可以旋转表格并删除所有 NA。您将获得一个行数与原始表相同的表。

    library("tidyverse")
    
    dat %>%
      # Adding row ID for clarity
      mutate(row = row_number()) %>%
      gather(box, name, - row) %>%
      drop_na()
    #> # A tibble: 48 x 3
    #>      row box      name 
    #>    <int> <chr>    <chr>
    #>  1     1 boxId    CA04 
    #>  2     2 boxId    CA04 
    #>  3     3 boxId    CA01 
    #>  4     4 boxId    CA02 
    #>  5     5 boxId    CA04 
    #>  6     6 boxId    CA02 
    #>  7    29 boxId__1 NM01 
    #>  8    30 boxId__1 NM14 
    #>  9    31 boxId__1 NM15 
    #> 10    32 boxId__1 NM16 
    #> # ... with 38 more rows
    

    reprex package (v0.2.1) 于 2019-03-11 创建

    【讨论】:

      【解决方案2】:

      基本的 R 方法是 unlist 数据框中的所有值,并仅选择非 NA 值以创建具有一列的新数据框。

      x <- unlist(dat)
      data.frame(new_id = x[!is.na(x)])
      
      #           new_id
      #boxId1       CA04
      #boxId2       CA04
      #boxId3       CA01
      #boxId4       CA02
      #boxId5       CA04
      #boxId6       CA02
      #boxId__129   NM01
      #boxId__130   NM14
      #boxId__131   NM15
      #......
      

      【讨论】:

        【解决方案3】:

        coalesce:

        dat %>% 
          mutate_all(as.character) %>% 
          transmute(newID = coalesce(!!! syms(names(.))))
        
        # # A tibble: 48 x 1
        #    newID
        #    <chr>
        #  1 CA04 
        #  2 CA04 
        #  3 CA01 
        #  4 CA02 
        #  5 CA04 
        #  6 CA02 
        #  7 FL01 
        #  8 FL02 
        #  9 FL03 
        # 10 FL09 
        # # … with 38 more rows
        

        【讨论】:

          猜你喜欢
          • 1970-01-01
          • 2017-05-22
          • 2012-07-21
          • 2020-12-02
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 2020-09-22
          相关资源
          最近更新 更多