【问题标题】:Separate names and insert in new columns in R(分隔名称并在 R 中插入新列)
【发布时间】:2017-01-17 16:09:32
【问题描述】:

我想将数据框中的每个名称分开并将其插入相应的列中。下面是一个例子: 采用以下数据框:

    # Label vectors: each one repeats a single raw label a fixed number of times.
    N0H0  <- rep("N0H0", times = 3)
    N1H1  <- rep("N1H1", times = 5)
    N0H05 <- rep("N0H05", times = 4)
    Out20 <- rep("20_Out", times = 2)
    Out10 <- rep("10_Out", times = 4)
    In5   <- rep("5_In", times = 3)
    In15  <- rep("15_In", times = 3)

    # 12-row input: Field and Border hold the raw labels; the other four
    # columns are NA placeholders to be filled by splitting those labels.
    df <- data.frame(
      Field  = c(N0H0, N1H1, N0H05),
      Border = c(Out20, Out10, In5, In15),
      N      = NA,
      H      = NA,
      Number = NA,
      Out.In = NA
    )

并希望获得以下输出:

     # Expected result: the two raw label columns plus the four derived columns.
     output <- data.frame(
       Field  = c(N0H0, N1H1, N0H05),
       Border = c(Out20, Out10, In5, In15),
       N      = rep(c("N0", "N1", "N0"), times = c(3, 5, 4)),
       H      = rep(c("H0", "H1", "H05"), times = c(3, 5, 4)),
       Number = rep(c("20", "10", "5", "15"), times = c(2, 4, 3, 3)),
       Out.In = rep(c("Out", "In"), times = c(6, 6))
     )

【问题讨论】:

    标签: r dataframe split pattern-matching


    【解决方案1】:

    我们可以使用 tidyr 包中的 extract 和 separate 来分隔列

    library(dplyr)
    library(tidyr)
    # Keep only the two raw label columns, then derive the other four from them:
    #  - extract(): capture group 1 is everything up to and including the first
    #    run of digits (-> N); group 2 is the remainder of the string (-> H).
    #  - separate(): split Border at the underscore into Number and Out.In.
    # remove = FALSE keeps Field and Border next to the derived columns.
    # The final select() fixes the column order with the current dplyr verb
    # (select_() is deprecated) and does not require an `output` object to exist.
    df %>%
      select(Field, Border) %>%
      extract(Field, into = c("N", "H"), regex = "^([^0-9]*\\d+)(.*)",
              remove = FALSE) %>%
      separate(Border, into = c("Number", "Out.In"), sep = "_",
               remove = FALSE) %>%
      select(Field, Border, N, H, Number, Out.In)
    #    Field Border  N   H Number Out.In
    #1   N0H0 20_Out N0  H0     20    Out
    #2   N0H0 20_Out N0  H0     20    Out
    #3   N0H0 10_Out N0  H0     10    Out
    #4   N1H1 10_Out N1  H1     10    Out
    #5   N1H1 10_Out N1  H1     10    Out
    #6   N1H1 10_Out N1  H1     10    Out
    #7   N1H1   5_In N1  H1      5     In
    #8   N1H1   5_In N1  H1      5     In
    #9  N0H05   5_In N0 H05      5     In
    #10 N0H05  15_In N0 H05     15     In
    #11 N0H05  15_In N0 H05     15     In
    #12 N0H05  15_In N0 H05     15     In
    

    或者使用 base R:用 lapply 遍历前两列,用 sub 在字符串中插入分隔符,用 read.csv 读取这些字符串,再用 cbind 合并 list 中的各个元素,并将结果赋值回除前两列之外的其余列

    # For each of the first two columns, put a comma after the first run of
    # digits (dropping any underscores that follow it) and parse the result as
    # two-field CSV text. colClasses = "character" keeps every piece as text,
    # so Number is not converted to integer and leading zeros are preserved.
    # cbind() then yields four columns (N, H, Number, Out.In), which are
    # assigned to every column of df except the first two.
    df[-(1:2)] <- do.call(cbind, lapply(df[1:2], function(x) {
      read.csv(text = sub("(\\d+)_*", "\\1,", x), header = FALSE,
               colClasses = "character", stringsAsFactors = FALSE)
    }))
    df
    #   Field Border  N   H Number Out.In
    #1   N0H0 20_Out N0  H0     20    Out
    #2   N0H0 20_Out N0  H0     20    Out
    #3   N0H0 10_Out N0  H0     10    Out
    #4   N1H1 10_Out N1  H1     10    Out
    #5   N1H1 10_Out N1  H1     10    Out
    #6   N1H1 10_Out N1  H1     10    Out
    #7   N1H1   5_In N1  H1      5     In
    #8   N1H1   5_In N1  H1      5     In
    #9  N0H05   5_In N0 H05      5     In
    #10 N0H05  15_In N0 H05     15     In
    #11 N0H05  15_In N0 H05     15     In
    #12 N0H05  15_In N0 H05     15     In
    

    【讨论】:

      【解决方案2】:

      使用base R函数你可以试试:

      # Start from the input and fill the derived columns, addressed by name.
      output <- df
      # Field splits at "H" into the N part and the bare H suffix.
      output[, c("N", "H")] <- do.call(
        rbind,
        strsplit(as.character(df$Field), "H")
      )
      # Border splits at "_" into Number and Out.In.
      output[, c("Number", "Out.In")] <- do.call(
        rbind,
        strsplit(as.character(df$Border), "_")
      )
      # Splitting on "H" consumed the letter itself, so put it back.
      output[["H"]] <- paste0("H", output[["H"]])
      output
         Field Border  N   H Number Out.In
      1   N0H0 20_Out N0  H0     20    Out
      2   N0H0 20_Out N0  H0     20    Out
      3   N0H0 10_Out N0  H0     10    Out
      4   N1H1 10_Out N1  H1     10    Out
      5   N1H1 10_Out N1  H1     10    Out
      6   N1H1 10_Out N1  H1     10    Out
      7   N1H1   5_In N1  H1      5     In
      8   N1H1   5_In N1  H1      5     In
      9  N0H05   5_In N0 H05      5     In
      10 N0H05  15_In N0 H05     15     In
      11 N0H05  15_In N0 H05     15     In
      12 N0H05  15_In N0 H05     15     In
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2019-07-01
        • 1970-01-01
        相关资源
        最近更新 更多