【问题标题】:How to efficiently apply the rbinom function to each row in a data frame?如何有效地将 rbinom 函数应用于数据框中的每一行?
【发布时间】:2026-01-16 18:15:02
【问题描述】:

给定一个包含不同变量的计数(count)和比率(rate)的数据表,如何按给定的比率对每个变量的计数进行抽样?例如,对于下面的数据表,我可以逐行循环,并使用 sample 或 rbinom 函数得到所需的输出。但是,我实际要处理的数据集非常大。有什么方法可以提高性能吗?

library(data.table)
set.seed(1)

# Toy input: 100 rows, each holding a number of trials (`count`) and a
# success probability (`rate`, between 0.001 and 0.020).
dt <- data.table(
  count = sample(10000:20000, 100),
  rate = sample(1:20, 100, replace = TRUE) / 1000
)

# Baseline 1: for every row, draw `count` outcomes (1 with probability `rate`,
# otherwise 0) and sum them to obtain the number of successes.
# seq_len() is used so that an empty table yields zero iterations.
system.time(
  for (i in seq_len(nrow(dt))) {
    dt$sample_n[i] <- sum(sample(1:0,
                                 dt$count[i],
                                 prob = c(dt$rate[i], 1 - dt$rate[i]),
                                 replace = TRUE))
  }
)

# Baseline 2: the same quantity obtained with one binomial draw per row.
system.time(
  for (i in seq_len(nrow(dt))) {
    dt$sample_n2[i] <- rbinom(size = dt$count[i], n = 1, prob = dt$rate[i])
  }
)

【问题讨论】:

    标签: r performance data.table vectorization sample


    【解决方案1】:

    所有的采样函数通常都是向量化的,也就是说你可以直接做:

    # One rbinom() call draws a value for every row: `size` and `prob` are
    # matched element by element across the `count` and `rate` columns.
    dt$sample_n2 <- with(dt, rbinom(n = nrow(dt), size = count, prob = rate))
    

    【讨论】:

      【解决方案2】:

      使用 := 按引用赋值,并且只调用一次 rbinom()(无需循环)。

      设置

      library(data.table)
      # Show each column's class beneath the header when a data.table is printed.
      options(datatable.print.class = TRUE)

      # Number of rows in the benchmark table.
      sample_size <- 5e4

      # `count`: values drawn without replacement from 10000..(10000 + sample_size).
      # `rate`: one of 0.001, 0.002, ..., 0.020, drawn with replacement.
      dt <- data.table(
        count = sample(seq(from = 10000, to = 10000 + sample_size), size = sample_size),
        rate = sample(seq_len(20L), size = sample_size, replace = TRUE) / 1000
      )
      

      解决方案

      # Add `sample_n` by reference with a single vectorised rbinom() call: one
      # draw per row (.N rows), pairing each row's `count` with its `rate`.
      # Both columns are referenced by bare name inside `j`, rather than
      # reaching back out to the global `dt`.
      dt[, sample_n := rbinom(n = .N, size = count, prob = rate)]
      dt
      #>        count  rate sample_n
      #>        <int> <num>    <int>
      #>     1: 26100 0.016      431
      #>     2: 15145 0.008      114
      #>     3: 24952 0.001       23
      #>     4: 31437 0.020      621
      #>     5: 58358 0.008      468
      #>    ---                     
      #> 49996: 30517 0.002       56
      #> 49997: 59047 0.009      500
      #> 49998: 48737 0.018      896
      #> 49999: 29686 0.005      152
      #> 50000: 52429 0.011      580
      

      性能对比

      # Timings are collected in this list, one element per approach.
      results <- vector("list", 0L)

      # Fixed seed so the benchmark table is reproducible.
      set.seed(1)
      dt <- data.table(
        count = sample(seq(from = 10000, to = 10000 + sample_size), size = sample_size),
        rate = sample(seq_len(20L), size = sample_size, replace = TRUE) / 1000
      )
      

      原始方法的计时

      # Baseline 1 (sample() + sum()), computed row by row; the result is
      # discarded, so nothing is written back to `dt`.
      results$original1_no_modify <- system.time( # not modifying `dt`
        for (i in seq_len(nrow(dt))) {
          sum(
            sample(1:0, dt$count[i], prob = c(dt$rate[i], 1 - dt$rate[i]), replace = TRUE)
          )
        }
      )

      # Baseline 1 again, this time storing each row's result in `sample_n`.
      set.seed(1)
      results$original1_modify <- system.time( # modifying `dt`
        for (i in seq_len(nrow(dt))) {
          dt$sample_n[i] <- sum(
            sample(1:0, dt$count[i], prob = c(dt$rate[i], 1 - dt$rate[i]), replace = TRUE)
          )
        }
      )


      # Baseline 2 (one rbinom() call per row), result discarded.
      results$original2_no_modify <- system.time( # not modifying `dt`
        for (i in seq_len(nrow(dt))) {
          rbinom(size = dt$count[i], n = 1L, prob = dt$rate[i])
        }
      )

      # Baseline 2 again, storing each row's draw in `sample_n2`.
      set.seed(1)
      results$original2_modify <- system.time( # modifying `dt`
        for (i in seq_len(nrow(dt))) {
          dt$sample_n2[i] <- rbinom(size = dt$count[i], n = 1L, prob = dt$rate[i])
        }
      )
      

      := + mapply() + rbinom()(更快,但仍然是 R 级迭代)

      # Row-wise draws via mapply(): one rbinom() call per (count, rate) pair,
      # with the result discarded.
      results$mapply_no_modify <- system.time( # not modifying `dt`
        mapply(
          FUN = function(trials, p_success) rbinom(n = 1L, size = trials, prob = p_success),
          dt$count,
          dt$rate
        )
      )

      # The same row-wise draws, stored by reference in `sample_n3`.
      set.seed(1)
      results$mapply_modify <- system.time( # modifying `dt`
        dt[, sample_n3 := mapply(
          FUN = function(trials, p_success) rbinom(n = 1L, size = trials, prob = p_success),
          count,
          rate
        )]
      )
      

      解决方案

      # Vectorised solution: a single rbinom() call covering every row, with the
      # result discarded.
      results$solution_no_modify <- system.time( # not modifying `dt`
        rbinom(n = nrow(dt), size = dt$count, prob = dt$rate)
      )

      # The same single call, stored by reference in `sample_n4`. Columns are
      # referenced by bare name inside `j` instead of through the global `dt`.
      set.seed(1)
      results$solution_modify <- system.time( # modifying `dt`
        dt[, sample_n4 := rbinom(n = .N, size = count, prob = rate)]
      )
      

      最终数据表

      # Print the table, which now carries the sample_n .. sample_n4 columns.
      dt[]
      #>        count  rate sample_n sample_n2 sample_n3 sample_n4
      #>        <int> <num>    <int>     <int>     <int>     <int>
      #>     1: 34387 0.009      295       310       310       310
      #>     2: 53306 0.019     1076      1004      1004      1004
      #>     3: 14049 0.019      268       247       247       247
      #>     4: 21570 0.002       45        55        55        55
      #>     5: 35172 0.009      313       346       346       346
      #>    ---                                                   
      #> 49996: 37432 0.020      724       722       722       722
      #> 49997: 14985 0.006       82        76        76        76
      #> 49998: 16007 0.007      107       106       106       106
      #> 49999: 49298 0.003      145       140       140       140
      #> 50000: 41427 0.001       49        40        40        40
      

      健全性检查

      # The three rbinom()-based columns were each generated right after
      # set.seed(1), so they must match exactly. Passing the comparisons as
      # separate conditions lets stopifnot() report which one failed.
      stopifnot(
        identical(dt$sample_n2, dt$sample_n3),
        identical(dt$sample_n3, dt$sample_n4)
      )
      

      结果

      # Print the raw system.time() measurements collected for each approach.
      results
      #> $original1_no_modify
      #>    user  system elapsed 
      #>  18.713   0.568  19.288 
      #> 
      #> $original1_modify
      #>    user  system elapsed 
      #>  28.217   0.020  28.237 
      #> 
      #> $original2_no_modify
      #>    user  system elapsed 
      #>   0.155   0.000   0.155 
      #> 
      #> $original2_modify
      #>    user  system elapsed 
      #>   9.085   0.152   9.237 
      #> 
      #> $mapply_no_modify
      #>    user  system elapsed 
      #>   0.139   0.000   0.139 
      #> 
      #> $mapply_modify
      #>    user  system elapsed 
      #>   0.132   0.000   0.131 
      #> 
      #> $solution_no_modify
      #>    user  system elapsed 
      #>   0.004   0.000   0.004 
      #> 
      #> $solution_modify
      #>    user  system elapsed 
      #>   0.004   0.000   0.004
      # Stack the timings into a single table, one row per approach; the list
      # names become the `approach` column.
      rbindlist(lapply(results, as.list), idcol = "approach")
      #>               approach user.self sys.self elapsed user.child sys.child
      #>                 <char>     <num>    <num>   <num>      <num>     <num>
      #> 1: original1_no_modify    18.713    0.568  19.288          0         0
      #> 2:    original1_modify    28.217    0.020  28.237          0         0
      #> 3: original2_no_modify     0.155    0.000   0.155          0         0
      #> 4:    original2_modify     9.085    0.152   9.237          0         0
      #> 5:    mapply_no_modify     0.139    0.000   0.139          0         0
      #> 6:       mapply_modify     0.132    0.000   0.131          0         0
      #> 7:  solution_no_modify     0.004    0.000   0.004          0         0
      #> 8:     solution_modify     0.004    0.000   0.004          0         0
      

      【讨论】: