【问题标题】:Using separate from tidyr with different length vectors(对长度不同的向量使用 tidyr 的 separate)
【发布时间】:2015-01-04 10:41:17
【问题描述】:

我想使用 tidyr 的 separate 将一列字符串(例如 [1, 58, 10])拆分成多列。我的问题是有时其中的“向量”更短(永远不会更长)。我在同一个数据框中有很多列存在这个问题。

加载包

require(tidyr)
require(dplyr)
require(stringr)

数据

在这里,我使用来自真实数据的样本制作了一个数据框。 “向量”在 col1 中的长度为 10,在 col2 中的长度为 9 或 10。有一个时间列只是为了表明还有其他列。

# Sample data: a timestamp column plus two columns holding bracketed,
# comma-separated integer lists. Every col1 entry has ten values; the fourth
# col2 entry has only nine.
df <- data.frame(
    time = as.POSIXct(1:5, origin = Sys.time()),
    col1 = c(
        "[0,355,0,0,0,1227,0,0,382059,116]",
        "[0,31,0,0,0,5,0,0,925,1]",
        "[0,1,0,0,0,471,0,0,130339,3946]",
        "[0,0,0,0,0,223,0,0,37666,12]",
        "[0,19,0,0,0,667,0,0,336956,53]"
    ),
    col2 = c(
        "[0,355,0,0,0,1227,0,0,382059,116]",
        "[0,355,0,0,0,1227,0,0,382059,116]",
        "[0,0,0,0,0,223,0,0,37666,12]",
        "[0,19,0,0,0,667,0,0,336956]",
        "[0,355,0,0,0,1227,0,0,382059,116]"
    )
)

我想要的样子

对于所有“向量”长度相等的第一列,我可以使用 separate() 来获得我想要的。

# Strip the enclosing brackets from col1, split it on commas into
# col1.1 ... col1.10, and drop the still-unparsed col2.
a1 <- df %>%
    mutate(col1 = str_sub(col1, 2, -2)) %>%
    separate(col1, paste("col1", 1:10, sep = "."), ",") %>%
    select(-col2)

# Making sure the numbers are numeric. Only the split columns (everything
# after time) are converted, so time keeps its POSIXct class instead of being
# flattened to a number and rebuilt, and col2 is never pushed through
# as.numeric().
a1[-1] <- lapply(a1[-1], as.numeric)

这会导致

> a1
                 time col1.1 col1.2 col1.3 col1.4 col1.5 col1.6 col1.7 col1.8
1 2014-11-07 12:21:45      0    355      0      0      0   1227      0      0
2 2014-11-07 12:21:46      0     31      0      0      0      5      0      0
3 2014-11-07 12:21:47      0      1      0      0      0    471      0      0
4 2014-11-07 12:21:48      0      0      0      0      0    223      0      0
5 2014-11-07 12:21:49      0     19      0      0      0    667      0      0
  col1.9 col1.10
1 382059     116
2    925       1
3 130339    3946
4  37666      12
5 336956      53

这对 col2 不起作用,因为其中有一个元素无法拆分成同样多的列。

解决方法

# Does not work: the fourth col2 entry has only nine pieces
#b1 <- df %>% 
#   mutate(col2 = str_sub(col2,2,-2)) %>%
#   separate(col2, paste("col2",1:10,sep="."),",")

# Workaround: split col2 into a fixed-width character matrix with ten columns
# (one row per observation), then convert the matrix to numbers in place. The
# missing tenth piece of the short row ends up as NA.
b2 <- str_split_fixed(str_sub(df$col2, 2, -2), ",", n = 10)
storage.mode(b2) <- "double"
colnames(b2) <- paste("col2", 1:10, sep = ".")

# data.frame() keeps time as POSIXct; cbind() on the matrix would coerce it to
# a plain number and force a conversion back afterwards.
b2 <- data.frame(time = df$time, b2)

结果

> b2
                 time col2.1 col2.2 col2.3 col2.4 col2.5 col2.6 col2.7 col2.8
1 2014-11-07 12:21:45      0    355      0      0      0   1227      0      0
2 2014-11-07 12:21:46      0    355      0      0      0   1227      0      0
3 2014-11-07 12:21:47      0      0      0      0      0    223      0      0
4 2014-11-07 12:21:48      0     19      0      0      0    667      0      0
5 2014-11-07 12:21:49      0    355      0      0      0   1227      0      0
  col2.9 col2.10
1 382059     116
2 382059     116
3  37666      12
4 336956      NA
5 382059     116

如果向量较短,最后的元素应该是NA,所以这是正确的。

问题

有没有办法使用 separate(或其他更简单的函数)来代替上面的变通方法? 有没有办法同时将其应用于 col1 和 col2(例如,通过选择以 col 开头的列)?

谢谢!

【问题讨论】:

    标签: r dplyr stringr tidyr


    【解决方案1】:

    这仅回答了您关于separate 的问题的第一部分。在separate 中有一个extra 参数(至少在tidyr 的开发版本中),如果您将extra 设置为"merge",它将允许您做您想做的事情。

    # Drop the enclosing brackets, then split col2 on commas into
    # col2.1 ... col2.10.
    # NOTE(review): extra = "merge" is the argument of the tidyr development
    # version this answer refers to; released versions may expect
    # fill = "right" for rows with too few pieces -- confirm against the
    # installed tidyr.
    df %>%
        mutate(col2 = str_sub(col2, start = 2, end = -2)) %>%
        separate(
            col2,
            into = paste0("col2.", 1:10),
            sep = ",",
            extra = "merge"
        )
    
                     time                              col1
    1 2014-11-07 08:00:59 [0,355,0,0,0,1227,0,0,382059,116]
    2 2014-11-07 08:01:00          [0,31,0,0,0,5,0,0,925,1]
    3 2014-11-07 08:01:01   [0,1,0,0,0,471,0,0,130339,3946]
    4 2014-11-07 08:01:02      [0,0,0,0,0,223,0,0,37666,12]
    5 2014-11-07 08:01:03    [0,19,0,0,0,667,0,0,336956,53]
      col2.1 col2.2 col2.3 col2.4 col2.5 col2.6 col2.7 col2.8
    1      0    355      0      0      0   1227      0      0
    2      0    355      0      0      0   1227      0      0
    3      0      0      0      0      0    223      0      0
    4      0     19      0      0      0    667      0      0
    5      0    355      0      0      0   1227      0      0
      col2.9 col2.10
    1 382059     116
    2 382059     116
    3  37666      12
    4 336956    <NA>
    5 382059     116
    

    【讨论】:

    • 不错的解决方案。我不知道tidyr 中的extra。好东西。 +1
    【解决方案2】:

    这是使用 dplyr 和 splitstackshape 的另一种方式。如果您不需要数据框,则不需要最后的 data.frame(.),此时您得到的将是 data.table。

    # Split col1 and col2 into one column per comma-separated value and make
    # the new columns numeric.
    df %>%
        # Keep only the text between "[" and "]" in every column whose name
        # contains "col".
        mutate_each(funs(gsub("\\[(.*)\\]", "\\1", .)), contains("col")) %>%
        # Split both columns on "," (cSplit comes from splitstackshape; the
        # commented output below shows names such as col1_01 ... col2_10).
        cSplit(., c("col1", "col2"), sep = ",") %>%
        # Convert every column except time to numeric.
        mutate_each(funs(as.numeric), -time) %>%
        # Convert the result to a plain data frame; omit this step to keep
        # whatever class cSplit returned.
        data.frame(.)
    
    
    #                 time col1_01 col1_02 col1_03 col1_04 col1_05 col1_06 col1_07 col1_08 col1_09 col1_10 col2_01 col2_02 col2_03 col2_04 col2_05
    #1 2014-11-08 00:48:15       0     355       0       0       0    1227       0       0  382059     116       0     355       0       0       0
    #2 2014-11-08 00:48:16       0      31       0       0       0       5       0       0     925       1       0     355       0       0       0
    #3 2014-11-08 00:48:17       0       1       0       0       0     471       0       0  130339    3946       0       0       0       0       0
    #4 2014-11-08 00:48:18       0       0       0       0       0     223       0       0   37666      12       0      19       0       0       0
    #5 2014-11-08 00:48:19       0      19       0       0       0     667       0       0  336956      53       0     355       0       0       0
    
    #  col2_06 col2_07 col2_08 col2_09 col2_10
    #1    1227       0       0  382059     116
    #2    1227       0       0  382059     116
    #3     223       0       0   37666      12
    #4     667       0       0  336956      NA
    #5    1227       0       0  382059     116
    

    【讨论】:

      【解决方案3】:

      没有包的解决方案:

      # Sample data: a timestamp plus two columns of bracketed,
      # comma-separated integer lists (the fourth col2 entry has only nine
      # values).
      df <- data.frame(
        time = as.POSIXct(1:5, origin = Sys.time()),
        col1 = c(
          "[0,355,0,0,0,1227,0,0,382059,116]",
          "[0,31,0,0,0,5,0,0,925,1]",
          "[0,1,0,0,0,471,0,0,130339,3946]",
          "[0,0,0,0,0,223,0,0,37666,12]",
          "[0,19,0,0,0,667,0,0,336956,53]"
        ),
        col2 = c(
          "[0,355,0,0,0,1227,0,0,382059,116]",
          "[0,355,0,0,0,1227,0,0,382059,116]",
          "[0,0,0,0,0,223,0,0,37666,12]",
          "[0,19,0,0,0,667,0,0,336956]",
          "[0,355,0,0,0,1227,0,0,382059,116]"
        )
      )

      # Remove the square brackets from every column except the first (time).
      df[-1] <- lapply(df[-1], function(column) {
        gsub("\\[|\\]", "", as.character(column))
      })

      # Glue each row into a single comma-separated line and let read.csv()
      # parse the lines again: one timestamp followed by twenty numeric
      # fields. The row that is one field short gets NA in its last column.
      df <- read.csv(
        text = apply(
          as.matrix(df), 1,
          function(fields) paste(fields, collapse = ",")
        ),
        header = FALSE,
        check.names = FALSE,
        colClasses = c("POSIXct", rep("numeric", 20))
      )
      names(df) <- c("time", paste0(rep(c("col1.", "col2."), each = 10), 1:10))
      
      # time col1.1 col1.2 col1.3 col1.4 col1.5 col1.6 col1.7 col1.8
      # 1 2014-11-07 10:53:22      0    355      0      0      0   1227      0      0
      # 2 2014-11-07 10:53:23      0     31      0      0      0      5      0      0
      # 3 2014-11-07 10:53:24      0      1      0      0      0    471      0      0
      # 4 2014-11-07 10:53:25      0      0      0      0      0    223      0      0
      # 5 2014-11-07 10:53:26      0     19      0      0      0    667      0      0
      # col1.9 col1.10 col2.1 col2.2 col2.3 col2.4 col2.5 col2.6 col2.7 col2.8 col2.9
      # 1 382059     116      0    355      0      0      0   1227      0      0 382059
      # 2    925       1      0    355      0      0      0   1227      0      0 382059
      # 3 130339    3946      0      0      0      0      0    223      0      0  37666
      # 4  37666      12      0     19      0      0      0    667      0      0 336956
      # 5 336956      53      0    355      0      0      0   1227      0      0 382059
      # col2.10
      # 1     116
      # 2     116
      # 3      12
      # 4      NA
      # 5     116
      

      【讨论】:

      • 令人惊讶...非常好! +1
      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2017-07-21
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多