【问题标题】:Merge columns with multiple delimiters(合并具有多个分隔符的列)
【发布时间】:2022-01-17 19:46:46
【问题描述】:

如何合并具有可变数量分隔符的列,以便获得类似于输出的内容(假设所有内容都是字符)?

 dt1
   letter
1       a
2     b+c
3       c
4 d+e+f+g
5   a+g+e

 dt2
  letter number
1      a      1
2      b      2
3      c      3
4      d      4
5      e      5
6      f      6
7      g      7

> output
   letter  number
1       a       1
2     b+c     2+3
3       c       3
4 d+e+f+g 4+5+6+7
5   a+g+e   1+7+5
# Input: each row holds one or more letters joined by "+".
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e")
)

# Lookup table: the number (stored as character) that belongs to each letter.
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# Desired result: every letter replaced by its number, "+" delimiters kept.
output <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  number = c("1", "2+3", "3", "4+5+6+7", "1+7+5")
)

【问题讨论】:

    标签: r dplyr replace merge csplit


    【解决方案1】:

    一种快速易读的library(stringi) 方法:

    library(stringi)

    # Sample input and the letter -> number lookup table.
    dt1 <- data.frame(
      letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e")
    )
    dt2 <- data.frame(
      letter = c("a", "b", "c", "d", "e", "f", "g"),
      number = c("1", "2", "3", "4", "5", "6", "7")
    )

    # With vectorize_all = FALSE every pattern/replacement pair is applied,
    # one after another, to each element of dt1$letter; the "+" delimiters
    # are never matched and therefore stay in place.
    dt1[["number"]] <- stri_replace_all_fixed(
      str = dt1[["letter"]],
      pattern = dt2[["letter"]],
      replacement = dt2[["number"]],
      vectorize_all = FALSE
    )

    dt1
    

    结果:

    > dt1
       letter  number
    1       a       1
    2     b+c     2+3
    3       c       3
    4 d+e+f+g 4+5+6+7
    5   a+g+e   1+7+5
    

    另见related answer


    编辑:当前可用答案的基准:

    Unit: microseconds
                 expr     min      lq    mean  median      uq     max neval
                Sotos  2689.6  2689.6  2689.6  2689.6  2689.6  2689.6     1
        ismirsehregal    26.4    26.4    26.4    26.4    26.4    26.4     1
                  www 42247.8 42247.8 42247.8 42247.8 42247.8 42247.8     1
     MerijnvanTilborg  1723.5  1723.5  1723.5  1723.5  1723.5  1723.5     1
        YuriySaraykin 21859.2 21859.2 21859.2 21859.2 21859.2 21859.2     1
              danlooo  4165.7  4165.7  4165.7  4165.7  4165.7  4165.7     1
    

    重现基准:

    # Benchmark script: times each posted answer once on the five-row sample
    # data. Expressions are named after the answer authors.
    library(microbenchmark)
    library(tidyverse)
    library(stringi)  
    
    dt1 <- data.frame(letter=c("a","b+c","c","d+e+f+g","a+g+e"))
    dt2 <- data.frame(letter=c("a","b","c","d","e","f","g"), number=c("1","2","3","4","5","6","7"))
    
    microbenchmark(
      # Base R: split on the literal "+", then keep the dt2$number entries whose
      # letter occurs among the pieces. The logical %in% mask follows dt2's row
      # order, so the numbers come out in dt2 order, not in the order of the pieces.
      Sotos = {
        sapply(strsplit(dt1$letter, '+', fixed = TRUE), function(i)
          paste(dt2$number[dt2$letter %in% i], collapse = '+'))
      },
      # stringi, fixed (literal) matching; returns only the replaced character vector.
      ismirsehregal = {
        stri_replace_all_fixed(
          dt1$letter,
          pattern = dt2$letter,
          replacement = dt2$number,
          vectorize_all = FALSE
        )
      },
      # tidyr/dplyr: one row per letter, join against dt2, then paste the
      # pieces back together per original row (tracked through ID).
      www = {
        dt1 %>%
          mutate(ID = 1:n()) %>%
          separate_rows(letter, sep = "\\+") %>%
          left_join(dt2, by = "letter") %>%
          group_by(ID) %>%
          summarize(across(.fns = ~ paste0(., collapse = "+"))) %>%
          ungroup() %>%
          select(-ID)
      },
      # stringi with regex matching, wrapped in mutate(); unlike ismirsehregal
      # this returns the whole data frame with an added column.
      MerijnvanTilborg = {
        dt1 %>% mutate(MerijnvanTilborg = stri_replace_all_regex(letter, dt2$letter, dt2$number, vectorize_all = F))
      },
      # NOTE(review): match() yields positions within dt2$letter, and those
      # positions (not dt2$number) are what gets pasted. The result agrees with
      # the other answers only because row i of dt2 holds the number i.
      YuriySaraykin = {
        dt1 %>%
          rowwise() %>%
          mutate(tmp = str_split(letter, pattern = "\\+")) %>%
          ungroup() %>%
          mutate(number = map_chr(tmp, ~ paste0(match(.x, dt2$letter), collapse = "+"))) %>%
          select(-tmp)
      },
      # purrr: nested map_chr() over rows and then over pieces; deframe(dt2)
      # is evaluated again for every single piece.
      danlooo = {
        dt1 %>%
          as_tibble() %>%
          mutate(number = letter %>% map_chr(
            ~ .x %>%
              str_split("[+]") %>%
              simplify() %>%
              map_chr( ~ deframe(dt2)[.x]) %>%
              paste0(collapse = "+")
          ))
      },
      # NOTE(review): a single evaluation per expression, so min, mean and max
      # coincide and the timings include any first-call overhead.
      times = 1L
    )
    

    【讨论】:

      【解决方案2】:

      无需拆分任何数据,因为您只需要将特定字母替换为特定数字。

      library(dplyr)    # provides %>% and mutate(), which the pipeline below needs
      library(stringi)

      dt1 <- data.frame(
        letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
        stringsAsFactors = FALSE
      )
      dt2 <- data.frame(
        letter = c("a", "b", "c", "d", "e", "f", "g"),
        number = c("1", "2", "3", "4", "5", "6", "7"),
        stringsAsFactors = FALSE
      )

      # vectorize_all = FALSE applies every pattern/replacement pair in turn to
      # each element of `letter`. The entries of dt2$letter are interpreted as
      # regular expressions, so keys containing regex metacharacters would need
      # escaping (or stri_replace_all_fixed()).
      dt1 %>%
        mutate(
          number = stri_replace_all_regex(
            letter, dt2$letter, dt2$number,
            vectorize_all = FALSE
          )
        )
      
         letter  number
      1       a       1
      2     b+c     2+3
      3       c       3
      4 d+e+f+g 4+5+6+7
      5   a+g+e   1+7+5
      

      另一种解决方案可能更短

      library(dplyr)    # provides %>% and mutate()
      library(stringr)  # provides str_replace_all()

      dt1 <- data.frame(
        letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
        stringsAsFactors = FALSE
      )

      # Named character vector: the names are the patterns to look for and the
      # values are their replacements.
      v <- c(a = "1", b = "2", c = "3", d = "4", e = "5", f = "6", g = "7")

      dt1 %>% mutate(number = str_replace_all(letter, v))
      

      【讨论】:

        【解决方案3】:
        library(tidyverse)

        dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))

        dt2 <- data.frame(
          letter = c("a", "b", "c", "d", "e", "f", "g"),
          number = c("1", "2", "3", "4", "5", "6", "7")
        )

        dt1 %>%
          as_tibble() %>%
          # str_split() is vectorised: it returns one character vector per row,
          # stored here as a list-column, so no rowwise() is required.
          mutate(tmp = str_split(letter, pattern = "\\+")) %>%
          # match() gives the row of dt2 for every piece, in the order of the
          # pieces; index dt2$number with it so the looked-up value (not the
          # row position) is pasted.
          mutate(
            number = map_chr(
              tmp,
              ~ paste0(dt2$number[match(.x, dt2$letter)], collapse = "+")
            )
          ) %>%
          select(-tmp)
        #> # A tibble: 5 x 2
        #>   letter  number 
        #>   <chr>   <chr>  
        #> 1 a       1      
        #> 2 b+c     2+3    
        #> 3 c       3      
        #> 4 d+e+f+g 4+5+6+7
        #> 5 a+g+e   1+7+5
        

        reprex package (v2.0.1) 于 2021 年 12 月 14 日创建

        【讨论】:

          【解决方案4】:

          使用tidyverse 的解决方案。

          library(tidyverse)

          # Expects dt1 (column `letter`) and dt2 (columns `letter`, `number`)
          # as defined in the question.
          output <- dt1 %>%
            # Remember the original row; row_number() also works for zero-row
            # input, where 1:n() would yield c(1, 0) and fail.
            mutate(ID = row_number()) %>%
            # One row per letter, then attach the matching number.
            separate_rows(letter, sep = "\\+") %>%
            left_join(dt2, by = "letter") %>%
            # Glue the pieces of each original row back together. The columns
            # are selected explicitly because across() without `.cols` is
            # deprecated; everything() skips the grouping column ID.
            group_by(ID) %>%
            summarize(across(everything(), ~ paste0(.x, collapse = "+"))) %>%
            ungroup() %>%
            select(-ID)
          output
          # # A tibble: 5 x 2
          #   letter  number 
          #   <chr>   <chr>  
          # 1 a       1      
          # 2 b+c     2+3    
          # 3 c       3      
          # 4 d+e+f+g 4+5+6+7
          # 5 a+g+e   1+7+5 
          

          【讨论】:

            【解决方案5】:

            一个基于 base R 的解决方案:

            # Split every entry on the literal "+" and look each piece up in dt2.
            # match() returns one dt2 row per piece, in the order the pieces
            # appear, so the numbers keep the order of the letters (a logical
            # `%in%` mask would instead return them in dt2's row order and drop
            # repeated letters). vapply() guarantees a character vector result.
            dt1$res <- vapply(
              strsplit(dt1$letter, "+", fixed = TRUE),
              function(i) paste(dt2$number[match(i, dt2$letter)], collapse = "+"),
              character(1)
            )

            #   letter     res
            #1       a       1
            #2     b+c     2+3
            #3       c       3
            #4 d+e+f+g 4+5+6+7
            #5   a+g+e   1+7+5
            

            【讨论】:

            • 是否有机会改进代码,使其实际上保持合并因素的顺序?我的问题实际上是一个非常复杂的数据集,一些单元格会导致,例如:字母=d+e+f+g 和 res=5+4+6+7。我相信这一定与 dt2 中事物的排序方式有关。 @Sotos
            【解决方案6】:
            library(tidyverse)
            dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
            dt2 <- data.frame(letter = c("a", "b", "c", "d", "e", "f", "g"),
              number = c("1", "2", "3", "4", "5", "6", "7"))
            
            dt1 %>%
              as_tibble() %>%
              mutate(
                number = letter %>% map_chr(~ .x %>%
                  str_split("[+]") %>%
                  simplify() %>%
                  map_chr(~ deframe(dt2)[.x]) %>%
                  paste0(collapse = "+")
                )
              )
            #> # A tibble: 5 x 2
            #>   letter  number
            #>   <chr>   <chr>  
            #> 1 a       1      
            #> 2 b+c     2+3    
            #> 3 c       3      
            #> 4 d+e+f+g 4+5+6+7
            #> 5 a+g+e   1+7+5
            

            reprex package (v2.0.1) 于 2021 年 12 月 14 日创建

            【讨论】:

              猜你喜欢
              • 2020-11-05
              • 2022-01-18
              • 1970-01-01
              • 2015-07-25
              • 1970-01-01
              • 1970-01-01
              • 2013-05-23
              • 2017-05-07
              • 1970-01-01
              相关资源
              最近更新 更多