【问题标题】:data.table - split multiple columns(data.table - 拆分多列)
【发布时间】:2014-12-31 11:43:04
【问题描述】:

我有一个data.table G 如下。

# Raw input: one character vector of "ID:WORD" tokens per group.
d <- list(
  c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"),
  c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"),
  c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"),
  c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"),
  c("SD455:TOMATO", "SD86:RAY", "SD39:MATURE", "SD86:COSMIC"),
  c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),
  c("SD32:TRUMPET", "SD1:FIELD", "SD40:GREEN", "SD40:PARK")
)

# For every group keep only the ID, i.e. the text before the ":" of each token.
# vapply() pins the result type to character, unlike sapply().
d2 <- lapply(d, function(x) {
  vapply(strsplit(x, ":", fixed = TRUE), `[`, character(1), 1L)
})

# One row per group with two plain character columns:
#   sdw - the unique tokens of the group, joined with ", "
#   sd  - the unique IDs of the group, joined with ", "
# Building the data frame directly replaces the former list-matrix round trip
# and the as.character() conversions.
d <- data.frame(
  sdw = vapply(d, function(x) paste0(unique(x), collapse = ", "), character(1)),
  sd = vapply(d2, function(x) paste0(unique(x), collapse = ", "), character(1)),
  stringsAsFactors = FALSE
)

# Turn the data frame into a data.table keyed on "sd"; the printed table below
# shows the rows ordered by that key.
G <- data.table(d, key = "sd")
# Number the rows 1..n in their current order. seq_len() also copes with a
# zero-row table, where seq(1, 0) would produce c(1, 0).
G[, GNO := seq_len(nrow(G))]
# Put the new row id first.
setcolorder(G, neworder = c("GNO", "sd", "sdw"))

G
    GNO                     sd                                              sdw
1:   1 SD1, SD44, SD384, SD32 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET
2:   2     SD12, SD315, SD387       SD12:PAINTING, SD315:MONEY31, SD387:SPRING
3:   3       SD23, SD1, SD567                SD23:SWITCH, SD1:LUSH, SD567:TREK
4:   4        SD32, SD1, SD40   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK
5:   5    SD345, SD340, SD387               SD345:HOLE, SD340:DUST, SD387:ROLL
6:   6     SD42, SD345, SD183               SD42:CRAYON, SD345:FOX, SD183:WIRE
7:   7      SD455, SD86, SD39 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC

我正在尝试拆分sd列中的每组元素,并根据GNO列获取修改后的data.table,如下所示。

# Expand the ", "-separated IDs in `sd` into one row per ID, carrying GNO and
# the untouched `sdw` string along as grouping columns. The separator includes
# the blank after the comma so the resulting IDs carry no leading whitespace.
G2 <- G[, list(sd = unlist(strsplit(sd, ", ", fixed = TRUE))), by = list(GNO, sdw)]

G2
   GNO                                              sdw     sd
 1:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET    SD1
 2:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET   SD44
 3:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET  SD384
 4:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET   SD32
 5:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING   SD12
 6:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING  SD315
 7:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING  SD387
 8:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK   SD23
 9:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK    SD1
10:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK  SD567
11:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK   SD32
12:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK    SD1
13:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK   SD40
14:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD345
15:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD340
16:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD387
17:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE   SD42
18:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE  SD345
19:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE  SD183
20:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC  SD455
21:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC   SD86
22:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC   SD39

我也想拆分 sdw 中的元素。但是当我尝试这样做时,由于 sd 和 sdw 中的元素数量不一定相同,结果变得一团糟。

# Attempt to split both columns at once, grouped by GNO. Within a group the two
# unlist() results can differ in length (the warning below reports 3 vs 4 items
# for group 4), so data.table recycles the shorter column and warns.
# NOTE(review): the result is stored under the name `t`, which is also the name
# of base::t(); a more descriptive name would be clearer.
t <- G[ , list(sd = unlist(strsplit(sd, "," )),
               sdw = unlist(strsplit(sdw, "," ))) , by = list(GNO) ]

它给出警告

Warning message:
In `[.data.table`(G, , list(sd = unlist(strsplit(sd, ",")), sdw = unlist(strsplit(sdw,  :
  Column 1 of result for group 4 is length 3 but the longest column in this result is 4. Recycled leaving remainder of 1 items. This warning is once only for the first group with this issue.

我要找的输出如下。

# Desired result (this looks like dput() output): a data.frame with one row per
# GNO / sd pair. `sd` and `sdw` are factors; `sdw` holds the tokens that belong
# to that ID within the group, joined with ", " when the ID occurs more than
# once (e.g. "SD40:GREEN, SD40:PARK").
out <- structure(list(GNO = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 
5, 5, 5, 6, 6, 6, 7, 7, 7), sd = structure(c(1L, 14L, 9L, 6L, 
2L, 5L, 10L, 4L, 1L, 16L, 6L, 1L, 12L, 8L, 7L, 10L, 13L, 8L, 
3L, 15L, 17L, 11L), .Label = c("SD1", "SD12", "SD183", "SD23", 
"SD315", "SD32", "SD340", "SD345", "SD384", "SD387", "SD39", 
"SD40", "SD42", "SD44", "SD455", "SD567", "SD86"), class = "factor"), 
    sdw = structure(c(2L, 17L, 11L, 7L, 3L, 6L, 13L, 5L, 2L, 
    19L, 7L, 1L, 15L, 10L, 8L, 12L, 16L, 9L, 4L, 18L, 20L, 14L
    ), .Label = c("SD1:FIELD", "SD1:LUSH", "SD12:PAINTING", "SD183:WIRE", 
    "SD23:SWITCH", "SD315:MONEY31", "SD32:TRUMPET", "SD340:DUST", 
    "SD345:FOX", "SD345:HOLE", "SD384:FR563", "SD387:ROLL", "SD387:SPRING", 
    "SD39:MATURE", "SD40:GREEN, SD40:PARK", "SD42:CRAYON", "SD44:CANCEL", 
    "SD455:TOMATO", "SD567:TREK", "SD86:COSMIC, SD86:RAY"), class = "factor")), .Names = c("GNO", 
"sd", "sdw"), row.names = c(NA, -22L), class = "data.frame")

out
   GNO    sd                   sdw
1    1   SD1              SD1:LUSH
2    1  SD44           SD44:CANCEL
3    1 SD384           SD384:FR563
4    1  SD32          SD32:TRUMPET
5    2  SD12         SD12:PAINTING
6    2 SD315         SD315:MONEY31
7    2 SD387          SD387:SPRING
8    3  SD23           SD23:SWITCH
9    3   SD1              SD1:LUSH
10   3 SD567            SD567:TREK
11   4  SD32          SD32:TRUMPET
12   4   SD1             SD1:FIELD
13   4  SD40 SD40:GREEN, SD40:PARK
14   5 SD345            SD345:HOLE
15   5 SD340            SD340:DUST
16   5 SD387            SD387:ROLL
17   6  SD42           SD42:CRAYON
18   6 SD345             SD345:FOX
19   6 SD183            SD183:WIRE
20   7 SD455          SD455:TOMATO
21   7  SD86 SD86:COSMIC, SD86:RAY
22   7  SD39           SD39:MATURE

如何得到这个输出?

【问题讨论】:

    标签: r data.table


    【解决方案1】:

    这是我仅使用 data.table 包的方法(也不需要先创建 G2)

    这里我们首先按 GNO 拆分 sdw,然后按 GNO 和 ":" 之前的部分分组聚合,并把结果拼接到 stw 列中

    # Step 1: explode `sdw` into one "ID:WORD" token per row within each GNO.
    # Step 2: regroup by GNO and the ID (the token with ":" and everything after
    #         it removed), joining the tokens of each ID with toString().
    G[, .(token = unlist(strsplit(sdw, ", ", fixed = TRUE))), by = GNO][
      , .(stw = toString(token)), by = .(GNO, sd = gsub(":.*", "", token))]
    
     #    GNO    sd                   stw
     # 1:   1   SD1              SD1:LUSH
     # 2:   1  SD44           SD44:CANCEL
     # 3:   1 SD384           SD384:FR563
     # 4:   1  SD32          SD32:TRUMPET
     # 5:   2  SD12         SD12:PAINTING
     # 6:   2 SD315         SD315:MONEY31
     # 7:   2 SD387          SD387:SPRING
     # 8:   3  SD23           SD23:SWITCH
     # 9:   3   SD1              SD1:LUSH
    # 10:   3 SD567            SD567:TREK
    # 11:   4  SD32          SD32:TRUMPET
    # 12:   4   SD1             SD1:FIELD
    # 13:   4  SD40 SD40:GREEN, SD40:PARK
    # 14:   5 SD345            SD345:HOLE
    # 15:   5 SD340            SD340:DUST
    # 16:   5 SD387            SD387:ROLL
    # 17:   6  SD42           SD42:CRAYON
    # 18:   6 SD345             SD345:FOX
    # 19:   6 SD183            SD183:WIRE
    # 20:   7 SD455          SD455:TOMATO
    # 21:   7  SD86 SD86:RAY, SD86:COSMIC
    # 22:   7  SD39           SD39:MATURE
    

    【讨论】:

      【解决方案2】:

      这可能对您的数据性质做出过大的假设,但看起来“sd”列实际上有些无关紧要,因为该信息已经嵌入在“sdw”列中。

      因此,这就是我想出的,使用我的“splitstackshape”包中的两个cSplits:

      library(splitstackshape)
      # 1) Stack the comma-separated "sdw" tokens: one row per token.
      tokens_long <- cSplit(G, "sdw", sep = ",", direction = "long")
      # 2) Split every token at ":" into the columns sdw_1 (ID) and sdw_2 (word).
      tokens_wide <- cSplit(tokens_long, "sdw", sep = ":")
      # 3) Per GNO and ID, glue the ID back onto each unique word and join the
      #    pieces with ", ". Columns are referenced by name, so no positional
      #    subset such as [, c(1, 3, 4)] is needed; that form depends on column
      #    order and on how the installed data.table version treats a numeric j.
      temp <- tokens_wide[
        , list(paste(unique(sdw_1), unique(sdw_2), sep = ":", collapse = ", ")),
        by = list(GNO, sdw_1)
      ]
      setnames(temp, c("GNO", "sd", "sdw"))
      temp
      #     GNO    sd                   sdw
      #  1:   1   SD1              SD1:LUSH
      #  2:   1  SD44           SD44:CANCEL
      #  3:   1 SD384           SD384:FR563
      #  4:   1  SD32          SD32:TRUMPET
      #  5:   2  SD12         SD12:PAINTING
      #  6:   2 SD315         SD315:MONEY31
      #  7:   2 SD387          SD387:SPRING
      #  8:   3  SD23           SD23:SWITCH
      #  9:   3   SD1              SD1:LUSH
      # 10:   3 SD567            SD567:TREK
      # 11:   4  SD32          SD32:TRUMPET
      # 12:   4   SD1             SD1:FIELD
      # 13:   4  SD40 SD40:GREEN, SD40:PARK
      # 14:   5 SD345            SD345:HOLE
      # 15:   5 SD340            SD340:DUST
      # 16:   5 SD387            SD387:ROLL
      # 17:   6  SD42           SD42:CRAYON
      # 18:   6 SD345             SD345:FOX
      # 19:   6 SD183            SD183:WIRE
      # 20:   7 SD455          SD455:TOMATO
      # 21:   7  SD86 SD86:RAY, SD86:COSMIC
      # 22:   7  SD39           SD39:MATURE
      #     GNO    sd                   sdw
      

      与你想要的输出比较

      library(compare)
      # Check `temp` against the requested `out`, letting compare() apply all of
      # its permitted transformations; the printed report lists those it used.
      comparison <- compare(out, temp, allowAll = TRUE)
      comparison
      # TRUE 
      #   [GNO] coerced from <integer> to <numeric>
      #   [sdw] coerced from <character> to <factor>
      #   dropped attributes
      #   [3] dropped attributes
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2016-04-15
        • 1970-01-01
        • 2016-08-07
        • 1970-01-01
        • 2013-08-11
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多