更新:
Efraimidis & Spirakis 算法的Rcpp 实现(感谢@Hemmo、@Dinrem、@krlmlr 和@rtlgrmpf):
library(inline)
library(Rcpp)
src <-
'
int num = as<int>(size), x = as<int>(n);
Rcpp::NumericVector vx = Rcpp::clone<Rcpp::NumericVector>(x);
Rcpp::NumericVector pr = Rcpp::clone<Rcpp::NumericVector>(prob);
Rcpp::NumericVector rnd = rexp(x) / pr;
for(int i= 0; i<vx.size(); ++i) vx[i] = i;
std::partial_sort(vx.begin(), vx.begin() + num, vx.end(), Comp(rnd));
vx = vx[seq(0, num - 1)] + 1;
return vx;
'
incl <-
'
struct Comp{
Comp(const Rcpp::NumericVector& v ) : _v(v) {}
bool operator ()(int a, int b) { return _v[a] < _v[b]; }
const Rcpp::NumericVector& _v;
};
'
funFast <- cxxfunction(signature(n = "Numeric", size = "integer", prob = "numeric"),
src, plugin = "Rcpp", include = incl)
# See the bottom of the answer for comparison
p <- c(995/1000, rep(1/1000, 5))
n <- 100000
system.time(print(table(replicate(funFast(6, 3, p), n = n)) / n))
1 2 3 4 5 6
1.00000 0.39996 0.39969 0.39973 0.40180 0.39882
user system elapsed
3.93 0.00 3.96
# In case of:
# Rcpp::IntegerVector vx = Rcpp::clone<Rcpp::IntegerVector>(x);
# i.e. instead of NumericVector
1 2 3 4 5 6
1.00000 0.40150 0.39888 0.39925 0.40057 0.39980
user system elapsed
1.93 0.00 2.03
旧版本:
让我们尝试几种可能的方法:
带替换的简单拒绝抽样。这是一个比@krlmlr 提供的sample.int.rej 简单得多的功能,即样本大小始终等于n。正如我们将看到的,假设权重均匀分布,它仍然非常快,但在另一种情况下非常慢。
fastSampleReject <- function(all, n, w){
out <- numeric(0)
while(length(out) < n)
out <- unique(c(out, sample(all, n, replace = TRUE, prob = w)))
out[1:n]
}
Wong 和 Easton (1980) 的算法。这是this Python 版本的实现。它很稳定,我可能会遗漏一些东西,但与其他功能相比它要慢得多。
fastSample1980 <- function(all, n, w){
tws <- w
for(i in (length(tws) - 1):0)
tws[1 + i] <- sum(tws[1 + i], tws[1 + 2 * i + 1],
tws[1 + 2 * i + 2], na.rm = TRUE)
out <- numeric(n)
for(i in 1:n){
gas <- tws[1] * runif(1)
k <- 0
while(gas > w[1 + k]){
gas <- gas - w[1 + k]
k <- 2 * k + 1
if(gas > tws[1 + k]){
gas <- gas - tws[1 + k]
k <- k + 1
}
}
wgh <- w[1 + k]
out[i] <- all[1 + k]
w[1 + k] <- 0
while(1 + k >= 1){
tws[1 + k] <- tws[1 + k] - wgh
k <- floor((k - 1) / 2)
}
}
out
}
Wong 和 Easton 对算法的 Rcpp 实现。可能它可以进一步优化,因为这是我第一个可用的 Rcpp 函数,但无论如何它运行良好。
library(inline)
library(Rcpp)
src <-
'
Rcpp::NumericVector weights = Rcpp::clone<Rcpp::NumericVector>(w);
Rcpp::NumericVector tws = Rcpp::clone<Rcpp::NumericVector>(w);
Rcpp::NumericVector x = Rcpp::NumericVector(all);
int k, num = as<int>(n);
Rcpp::NumericVector out(num);
double gas, wgh;
if((weights.size() - 1) % 2 == 0){
tws[((weights.size()-1)/2)] += tws[weights.size()-1] + tws[weights.size()-2];
}
else
{
tws[floor((weights.size() - 1)/2)] += tws[weights.size() - 1];
}
for (int i = (floor((weights.size() - 1)/2) - 1); i >= 0; i--){
tws[i] += (tws[2 * i + 1]) + (tws[2 * i + 2]);
}
for(int i = 0; i < num; i++){
gas = as<double>(runif(1)) * tws[0];
k = 0;
while(gas > weights[k]){
gas -= weights[k];
k = 2 * k + 1;
if(gas > tws[k]){
gas -= tws[k];
k += 1;
}
}
wgh = weights[k];
out[i] = x[k];
weights[k] = 0;
while(k > 0){
tws[k] -= wgh;
k = floor((k - 1) / 2);
}
tws[0] -= wgh;
}
return out;
'
fun <- cxxfunction(signature(all = "numeric", n = "integer", w = "numeric"),
src, plugin = "Rcpp")
现在有一些结果:
times1 <- ldply(
1:6,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n) # Uniform distribution
p <- p/sum(p)
data.frame(
n=n,
user=c(system.time(sample.int.test(n, p), gcFirst=T)['user.self'],
system.time(weighted_Random_Sample(1:(2*n), p, n), gcFirst=T)['user.self'],
system.time(fun(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(sample.int.rej(2*n, n, p), gcFirst=T)['user.self'],
system.time(fastSampleReject(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(fastSample1980(1:(2*n), n, p), gcFirst=T)['user.self']),
id=c("Base", "Reservoir", "Rcpp", "Rejection", "Rejection simple", "1980"))
},
.progress='text'
)
times2 <- ldply(
1:6,
function(i) {
n <- 1024 * (2 ** i)
p <- runif(2 * n - 1)
p <- p/sum(p)
p <- c(0.999, 0.001 * p) # Special case
data.frame(
n=n,
user=c(system.time(sample.int.test(n, p), gcFirst=T)['user.self'],
system.time(weighted_Random_Sample(1:(2*n), p, n), gcFirst=T)['user.self'],
system.time(fun(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(sample.int.rej(2*n, n, p), gcFirst=T)['user.self'],
system.time(fastSampleReject(1:(2*n), n, p), gcFirst=T)['user.self'],
system.time(fastSample1980(1:(2*n), n, p), gcFirst=T)['user.self']),
id=c("Base", "Reservoir", "Rcpp", "Rejection", "Rejection simple", "1980"))
},
.progress='text'
)
arrange(times1, id)
n user id
1 2048 0.53 1980
2 4096 0.94 1980
3 8192 2.00 1980
4 16384 4.32 1980
5 32768 9.10 1980
6 65536 21.32 1980
7 2048 0.02 Base
8 4096 0.05 Base
9 8192 0.18 Base
10 16384 0.75 Base
11 32768 2.99 Base
12 65536 12.23 Base
13 2048 0.00 Rcpp
14 4096 0.01 Rcpp
15 8192 0.03 Rcpp
16 16384 0.07 Rcpp
17 32768 0.14 Rcpp
18 65536 0.31 Rcpp
19 2048 0.00 Rejection
20 4096 0.00 Rejection
21 8192 0.00 Rejection
22 16384 0.02 Rejection
23 32768 0.02 Rejection
24 65536 0.03 Rejection
25 2048 0.00 Rejection simple
26 4096 0.01 Rejection simple
27 8192 0.00 Rejection simple
28 16384 0.01 Rejection simple
29 32768 0.00 Rejection simple
30 65536 0.05 Rejection simple
31 2048 0.00 Reservoir
32 4096 0.00 Reservoir
33 8192 0.00 Reservoir
34 16384 0.02 Reservoir
35 32768 0.03 Reservoir
36 65536 0.05 Reservoir
arrange(times2, id)
n user id
1 2048 0.43 1980
2 4096 0.93 1980
3 8192 2.00 1980
4 16384 4.36 1980
5 32768 9.08 1980
6 65536 19.34 1980
7 2048 0.01 Base
8 4096 0.04 Base
9 8192 0.18 Base
10 16384 0.75 Base
11 32768 3.11 Base
12 65536 12.04 Base
13 2048 0.01 Rcpp
14 4096 0.02 Rcpp
15 8192 0.03 Rcpp
16 16384 0.08 Rcpp
17 32768 0.15 Rcpp
18 65536 0.33 Rcpp
19 2048 0.00 Rejection
20 4096 0.00 Rejection
21 8192 0.02 Rejection
22 16384 0.02 Rejection
23 32768 0.05 Rejection
24 65536 0.08 Rejection
25 2048 1.43 Rejection simple
26 4096 2.87 Rejection simple
27 8192 6.17 Rejection simple
28 16384 13.68 Rejection simple
29 32768 29.74 Rejection simple
30 65536 73.32 Rejection simple
31 2048 0.00 Reservoir
32 4096 0.00 Reservoir
33 8192 0.02 Reservoir
34 16384 0.02 Reservoir
35 32768 0.02 Reservoir
36 65536 0.04 Reservoir
显然我们可以拒绝函数1980,因为在这两种情况下它都比Base 慢。 Rejection simple 在第二种情况下只有一个概率为 0.999 时也会遇到麻烦。
所以还有Rejection、Rcpp、Reservoir。最后一步是检查值本身是否正确。为确定它们,我们将使用sample 作为基准(同时消除由于无替换抽样而不必与p 重合的概率的混淆)。
p <- c(995/1000, rep(1/1000, 5))
n <- 100000
system.time(print(table(replicate(sample(1:6, 3, repl = FALSE, prob = p), n = n))/n))
1 2 3 4 5 6
1.00000 0.39992 0.39886 0.40088 0.39711 0.40323 # Benchmark
user system elapsed
1.90 0.00 2.03
system.time(print(table(replicate(sample.int.rej(2*3, 3, p), n = n))/n))
1 2 3 4 5 6
1.00000 0.40007 0.40099 0.39962 0.40153 0.39779
user system elapsed
76.02 0.03 77.49 # Slow
system.time(print(table(replicate(weighted_Random_Sample(1:6, p, 3), n = n))/n))
1 2 3 4 5 6
1.00000 0.49535 0.41484 0.36432 0.36338 0.36211 # Incorrect
user system elapsed
3.64 0.01 3.67
system.time(print(table(replicate(fun(1:6, 3, p), n = n))/n))
1 2 3 4 5 6
1.00000 0.39876 0.40031 0.40219 0.40039 0.39835
user system elapsed
4.41 0.02 4.47
请注意这里的一些事情。出于某种原因,weighted_Random_Sample 返回了不正确的值(我根本没有研究过它,但假设分布均匀,它可以正常工作)。 sample.int.rej 重复采样很慢。
总之,Rcpp 似乎是重复采样情况下的最佳选择,而sample.int.rej 在其他情况下速度更快,也更易于使用。