【问题标题】:查找一列逗号分隔类别的组合
【发布时间】:2022-01-23 13:00:28
【问题描述】:

我有一列以逗号分隔的多个类别,类似这样:

id categories
1 A, B, C, D
2 A, F, X, G
3 B, Y, X, D

如何为每个 id 生成其类别的所有两两组合,并分别放入两列中,类似这样:

id category 1 category 2
1 A B
1 A C
1 A D
1 B C
1 B D
1 C D
2 A F
2 A X
2 A G

等等。

提前致谢!

【问题讨论】:

    标签: r combinations


    【解决方案1】:

    使用gregexpr

    # Pull every category token out of each row, then list all unordered pairs
    # per row. "\\w+" keeps multi-character categories intact (a bare "\\w"
    # would split them into single letters).
    z <- dat$categories
    t(do.call(cbind, lapply(regmatches(z, gregexpr("\\w+", z)), combn, 2)))
    #      [,1] [,2]
    #  [1,] "A"  "B" 
    #  [2,] "A"  "C" 
    #  [3,] "A"  "D" 
    #  [4,] "B"  "C" 
    #  [5,] "B"  "D" 
    #  [6,] "C"  "D" 
    #  [7,] "A"  "F" 
    #  [8,] "A"  "X" 
    #  [9,] "A"  "G" 
    # [10,] "F"  "X" 
    # [11,] "F"  "G" 
    # [12,] "X"  "G" 
    # [13,] "B"  "Y" 
    # [14,] "B"  "X" 
    # [15,] "B"  "D" 
    # [16,] "Y"  "X" 
    # [17,] "Y"  "D" 
    # [18,] "X"  "D" 
    

    按 id 分组

    # Group on id rather than on the categories string: rows that happen to
    # share the same categories text would otherwise land in one group and
    # their ids would be recycled against the wrong pairs.
    do.call(rbind.data.frame, by(dat, dat$id, \(x) {
      z <- x$categories
      # "\\w+" keeps multi-character categories whole.
      cbind(id = x$id,
            t(do.call(cbind, lapply(regmatches(z, gregexpr("\\w+", z)), combn, 2))))
    }))
    #     id X1 X2
    # 1.1  1  A  B
    # 1.2  1  A  C
    # 1.3  1  A  D
    # 1.4  1  B  C
    # 1.5  1  B  D
    # 1.6  1  C  D
    # 2.1  2  A  F
    # 2.2  2  A  X
    # 2.3  2  A  G
    # 2.4  2  F  X
    # 2.5  2  F  G
    # 2.6  2  X  G
    # 3.1  3  B  Y
    # 3.2  3  B  X
    # 3.3  3  B  D
    # 3.4  3  Y  X
    # 3.5  3  Y  D
    # 3.6  3  X  D
    

    注意:"R version 4.1.2 (2021-11-01)"


    数据:

    # Sample input: one row per id, categories stored as a comma-separated string.
    dat <- data.frame(
      id = 1:3,
      categories = c("A, B, C, D", "A, F, X, G", "B, Y, X, D"),
      stringsAsFactors = FALSE
    )
    

    【讨论】:

      【解决方案2】:

      data.table 选项

      > # strip.white = TRUE drops the blank that follows each comma (otherwise the
      > # values are " B", " C", ...); quiet = TRUE silences scan()'s "Read N items".
      > setDT(df)[, data.table(t(combn(scan(text = categories, what = "character", sep = ",", strip.white = TRUE, quiet = TRUE), 2))), id]
          id V1 V2
       1:  1  A  B
       2:  1  A  C
       3:  1  A  D
       4:  1  B  C
       5:  1  B  D
       6:  1  C  D
       7:  2  A  F
       8:  2  A  X
       9:  2  A  G
      10:  2  F  X
      11:  2  F  G
      12:  2  X  G
      13:  3  B  Y
      14:  3  B  X
      15:  3  B  D
      16:  3  Y  X
      17:  3  Y  D
      18:  3  X  D
      

      或者,我们可以像下面这样使用dplyr 管道

      # For each id, replace the categories string with a nested data frame of
      # all category pairs, then expand that frame into rows.
      df %>%
        group_by(id) %>%
        mutate(
          categories = list(
            categories %>%
              strsplit(", ") %>%
              unlist() %>%
              combn(2) %>%
              t() %>%
              data.frame()
          )
        ) %>%
        unnest(categories) %>%
        ungroup()
      

      结果如下

            id X1    X2
         <int> <chr> <chr>
       1     1 A     B
       2     1 A     C
       3     1 A     D
       4     1 B     C
       5     1 B     D
       6     1 C     D
       7     2 A     F
       8     2 A     X
       9     2 A     G
      10     2 F     X
      11     2 F     G
      12     2 X     G
      13     3 B     Y
      14     3 B     X
      15     3 B     D
      16     3 Y     X
      17     3 Y     D
      18     3 X     D
      

      数据

      > dput(df)
      structure(list(id = 1:3, categories = c("A, B, C, D", "A, F, X, G",
      "B, Y, X, D")), class = "data.frame", row.names = c(NA, -3L))
      

      【讨论】:

        【解决方案3】:

        使用 tidyverse 的解决方案:

        1. 使用 strsplit()(或 stringr::str_split())从原始数据中获取每个类别。
        2. 按 id 拆分数据,然后用每种可能的组合为该 id 生成一个子数据框。
        3. 将表格重新连接在一起(此步骤可以方便地使用与步骤 2 相同的功能,使用 purrr::map_df())。
        library(tidyverse)
        # Split the categories string into a list column, break the table up by
        # id, and row-bind one tibble of category pairs per id.
        data %>%
          mutate(all = str_split(categories, ", ")) %>%
          split(.$id) %>%
          map_df(\(grp) {
            # combn() returns one pair per column: row 1 holds the first
            # member of each pair, row 2 the second.
            pair_mat <- combn(unlist(grp$all), m = 2)
            tibble(id = grp$id, cat_1 = pair_mat[1, ], cat_2 = pair_mat[2, ])
          })
        

        输出

        # A tibble: 18 x 3
              id cat_1 cat_2
           <dbl> <chr> <chr>
         1     1 A     B    
         2     1 A     C    
         3     1 A     D    
         4     1 B     C    
         5     1 B     D    
         6     1 C     D    
         7     2 A     F    
         8     2 A     X    
         9     2 A     G    
        10     2 F     X    
        11     2 F     G    
        12     2 X     G    
        13     3 B     Y    
        14     3 B     X    
        15     3 B     D    
        16     3 Y     X    
        17     3 Y     D    
        18     3 X     D    
        

        【讨论】:

          【解决方案4】:

          另一个 base R 方案

          # Split the rows by id, pair up each id's categories, and stack the
          # per-id matrices (cbind() coerces the id to character here).
          split(df, df$id) |>
            lapply(\(grp) {
              cats <- strsplit(grp$categories, ", ")[[1]]
              cbind(grp$id, t(combn(cats, 2)))
            }) |>
            do.call(what = rbind)
          
                [,1] [,2] [,3]
           [1,] "1"  "A"  "B" 
           [2,] "1"  "A"  "C" 
           [3,] "1"  "A"  "D" 
           [4,] "1"  "B"  "C" 
           [5,] "1"  "B"  "D" 
           [6,] "1"  "C"  "D" 
           [7,] "2"  "A"  "F" 
           [8,] "2"  "A"  "X" 
           [9,] "2"  "A"  "G" 
          [10,] "2"  "F"  "X" 
          [11,] "2"  "F"  "G" 
          [12,] "2"  "X"  "G" 
          [13,] "3"  "B"  "Y" 
          [14,] "3"  "B"  "X" 
          [15,] "3"  "B"  "D" 
          [16,] "3"  "Y"  "X" 
          [17,] "3"  "Y"  "D" 
          [18,] "3"  "X"  "D"
          

          【讨论】:

            【解决方案5】:

            可以先用 strsplit 拆分,再使用 combn,即

            # Split each row into its categories, build a two-column data frame
            # of all pairs per row, then stack the per-row frames.
            do.call(
              rbind,
              lapply(
                strsplit(df$categories, ", "),
                \(cats) data.frame(t(combn(cats, 2)))
              )
            )
            
            #   X1 X2
            #1   A  B
            #2   A  C
            #3   A  D
            #4   B  C
            #5   B  D
            #6   C  D
            #7   A  F
            #8   A  X
            #9   A  G
            #10  F  X
            #11  F  G
            #12  X  G
            #13  B  Y
            #14  B  X
            #15  B  D
            #16  Y  X
            #17  Y  D
            #18  X  D
            

            【讨论】:

              猜你喜欢
              • 1970-01-01
              • 1970-01-01
              • 2019-05-19
              • 2022-01-07
              • 2013-10-26
              • 1970-01-01
              • 2015-07-29
              • 1970-01-01
              • 1970-01-01
              相关资源
              最近更新 更多