【问题标题】:What is the most efficient way to paste strings in R?在 R 中粘贴字符串的最有效方法是什么?
【发布时间】:2019-05-02 12:42:40
【问题描述】:

我有两个非常大的向量,我需要将它们与分隔符连接起来以形成唯一的 ID。例如:

# Fixed seed so the example vectors are reproducible.
set.seed(1)

# vec1: ten million draws (with replacement) from 1..10.
vec1 <- sample(1:10, 10000000, replace = TRUE)
# vec2: ten million draws (without replacement) from 1..1e9.
vec2 <- sample(1:1000000000, 10000000)

我目前正在使用 paste0():

# Time one run of the current approach; the IDs are stored in uniq_id.
system.time(
  uniq_id <- paste0(vec1, "_", vec2)
)

但是,由于 vec1 和 vec2 的大小,这非常慢。有没有性能更好的替代方法?

【问题讨论】:

标签: r performance paste


【解决方案1】:

更有效的方法是stringi::stri_c

library(microbenchmark)
# Time both approaches on the same vec1/vec2, 10 runs each.
b <- microbenchmark(
  # Base R: the separator is passed as an argument between the two vectors.
  paste = paste0(vec1, "_", vec2),
  # stringi: the separator is supplied through `sep`.
  stringi = stringi::stri_c(vec1, vec2, sep = "_"),
  times = 10
)

结果

b
#Unit: seconds
#    expr      min       lq     mean   median       uq      max neval cld
#   paste 5.475398 5.509957 5.544477 5.542728 5.566904 5.632173    10   b
# stringi 3.862541 3.871826 3.896242 3.897264 3.914894 3.934175    10  a 

【讨论】:

    【解决方案2】:

    比较 paste、paste0(R 版本 4.1.0)、stringi::stri_c(版本 1.6.2)和 stringr::str_c(版本 1.4.0),我没有观察到性能上有多大差异,不过这可能取决于要连接的内容:输入是数值还是字符,以及字符是由数字还是字母组成,都会带来很大的差异。当输入只包含字母时,stringi 和 stringr 似乎比 paste 更快。

    # Benchmark candidates, stored unevaluated by alist() so they can be
    # run later against whatever vec1/vec2 are current at that point.
    # A trailing "S" marks the variant that passes the separator via `sep`.
    M <- alist(
      paste0   = paste0(vec1, "_", vec2),
      paste    = paste(vec1, "_", vec2, sep = ""),
      pasteS   = paste(vec1, vec2, sep = "_"),
      stringi  = stringi::stri_c(vec1, "_", vec2),
      stringiS = stringi::stri_c(vec1, vec2, sep = "_"),
      stringr  = stringr::str_c(vec1, "_", vec2),
      stringrS = stringr::str_c(vec1, vec2, sep = "_")
    )
    
    # Scenario: both inputs are integer vectors.
    set.seed(42)
    n <- 1e5
    vec1 <- sample(1:10, size = n, replace = TRUE)
    vec2 <- sample(1:1000000000, size = n, replace = TRUE)
    bench::mark(exprs = M)
    #  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
    #  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
    #1 paste0      62.8ms  63.9ms      15.6    2.29MB     2.23     7     1      447ms
    #2 paste       61.9ms    63ms      15.9    2.29MB     0        8     0      503ms
    #3 pasteS      57.5ms  58.6ms      17.1    2.29MB     2.13     8     1      468ms
    #4 stringi     57.1ms  57.6ms      17.2    2.29MB     0        9     0      524ms
    #5 stringiS    56.2ms  66.2ms      14.4    2.29MB     2.40     6     1      417ms
    #6 stringr     57.9ms  62.9ms      14.8    2.29MB     0        8     0      541ms
    #7 stringrS      55ms  61.4ms      15.3    2.29MB     0        8     0      523ms
    
    # Scenario: convert the current vec1/vec2 to character with
    # as.character() before benchmarking the same expressions again.
    vec1 <- as.character(vec1)
    vec2 <- as.character(vec2)
    bench::mark(exprs = M)
    #  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
    #  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
    #1 paste0      34.2ms  35.3ms      28.2     781KB     2.17    13     1      460ms
    #2 paste       35.1ms  35.7ms      27.9     781KB     0       14     0      502ms
    #3 pasteS        32ms  33.5ms      29.9     781KB     2.14    14     1      468ms
    #4 stringi     33.7ms  35.6ms      28.1     781KB     0       15     0      534ms
    #5 stringiS    32.6ms  33.9ms      29.6     781KB     2.12    14     1      472ms
    #6 stringr     34.6ms  34.9ms      28.5     781KB     0       15     0      526ms
    #7 stringrS    33.1ms  33.4ms      29.7     781KB     2.12    14     1      471ms
    
    # Scenario: single-digit values turned into strings with as.character().
    set.seed(42)
    n <- 1e5
    vec1 <- as.character(sample(0:9, size = n, replace = TRUE))
    vec2 <- as.character(sample(0:9, size = n, replace = TRUE))
    bench::mark(exprs = M)
    #  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
    #  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
    #1 paste0      18.9ms    19ms      52.4     781KB     2.02    26     1      496ms
    #2 paste       18.9ms    19ms      52.5     781KB     0       27     0      514ms
    #3 pasteS      15.2ms  15.3ms      65.3     781KB     2.04    32     1      490ms
    #4 stringi     15.1ms  15.1ms      65.7     781KB     0       33     0      502ms
    #5 stringiS    13.5ms  13.5ms      73.7     781KB     2.05    36     1      489ms
    #6 stringr     15.1ms  15.2ms      65.7     781KB     2.05    32     1      487ms
    #7 stringrS    13.4ms  13.5ms      73.3     781KB     0       37     0      505ms
    
    # Scenario: single-digit values turned into strings with paste()
    # rather than as.character().
    set.seed(42)
    n <- 1e5
    vec1 <- paste(sample(0:9, n, TRUE))
    vec2 <- paste(sample(0:9, n, TRUE))
    bench::mark(exprs = M)
    #  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
    #  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
    #1 paste0     18.95ms 19.18ms      52.1     781KB     0       27     0      518ms
    #2 paste      18.78ms 18.98ms      52.6     781KB     2.02    26     1      494ms
    #3 pasteS     14.29ms 14.49ms      69.0     781KB     0       35     0      508ms
    #4 stringi      9.6ms  9.83ms     101.      781KB     2.02    50     1      495ms
    #5 stringiS    7.55ms  7.73ms     127.      781KB     2.01    63     1      496ms
    #6 stringr     9.58ms  9.75ms     101.      781KB     2.03    50     1      493ms
    #7 stringrS    7.54ms  7.77ms     127.      781KB     2.02    63     1      496ms
    
    # Scenario: inputs are single letters (lower case vs. upper case).
    set.seed(42)
    n <- 1e5
    vec1 <- sample(letters, size = n, replace = TRUE)
    vec2 <- sample(LETTERS, size = n, replace = TRUE)
    bench::mark(exprs = M)
    #  expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
    #  <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
    #1 paste0     15.98ms 16.02ms      61.5     781KB     2.05    30     1      488ms
    #2 paste      16.02ms 16.09ms      62.1     781KB     2.07    30     1      483ms
    #3 pasteS     11.96ms 12.03ms      83.0     781KB     2.02    41     1      494ms
    #4 stringi     7.97ms  8.07ms     123.      781KB     4.18    59     2      478ms
    #5 stringiS    6.37ms  6.43ms     154.      781KB     4.12    75     2      486ms
    #6 stringr     7.97ms  8.02ms     124.      781KB     2.04    61     1      491ms
    #7 stringrS    6.43ms  6.49ms     153.      781KB     4.09    75     2      489ms
    

    差异取决于 character 向量的内部存储方式:CHARSXP、REALSXP 或 INTSXP。

    # as.character() on an integer vector: the output below reports a
    # deferred string conversion that still wraps the original INTSXP.
    x <- as.character(1:2)
    .Internal(inspect(x))
    #@55d9df5270d8 16 STRSXP g0c0 [REF(1)]   <deferred string conversion>
    #  @55d9df527180 13 INTSXP g0c0 [REF(65535)]  1 : 2 (compact)
    
    # as.character() on a double vector: also a deferred conversion, this
    # time wrapping a REALSXP.
    x <- as.character(c(0,2))
    .Internal(inspect(x))
    #@55d9df5430a0 16 STRSXP g0c0 [REF(1)]   <deferred string conversion>
    #  @55d9df6720a8 14 REALSXP g0c2 [REF(65535)] (len=2, tl=0) 0,2
    
    # paste() on the same integers: the STRSXP holds CHARSXP elements.
    x <- paste(1:2)
    .Internal(inspect(x))
    #@55d9df610d08 16 STRSXP g0c2 [REF(1)] (len=2, tl=0)
    #  @55d9d2e30458 09 CHARSXP g1c1 [MARK,REF(40995),gp=0x61] [ASCII] [cached] "1"
    #  @55d9d2e58b00 09 CHARSXP g1c1 [MARK,REF(40555),gp=0x60] [ASCII] [cached] "2"
    
    # Subsetting the built-in letters: again a STRSXP of CHARSXP elements.
    x <- letters[1:2]
    .Internal(inspect(x))
    #@55d9df672168 16 STRSXP g0c2 [REF(1)] (len=2, tl=0)
    #  @55d9d2c80518 09 CHARSXP g1c1 [MARK,REF(541),gp=0x61] [ASCII] [cached] "a"
    #  @55d9d2fb7d58 09 CHARSXP g1c1 [MARK,REF(44),gp=0x61] [ASCII] [cached] "b"
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2016-07-28
      • 2015-07-14
      • 2013-05-17
      • 1970-01-01
      • 1970-01-01
      • 2010-11-21
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多