考虑到您的数字,一种好方法是简单地使用替换抽签并丢弃重复抽签。
以下是从不同大小的数组中抽取 5000 个 10% 或 10 个样本的时间,以较小者为准。 pp_overdraw 抽取过多的样本并丢弃,pp_fillin 抽取到准确的数量,重新绘制坏样本直到没有剩余。 pythran 是一个编译的解决方案。由于 OP 要求纯 numpy,因此此处仅供参考。
代码:
import pyflip
import numpy as np
from simple_benchmark import BenchmarkBuilder, MultiArgument
B = BenchmarkBuilder()
@B.add_function()
def pythran(A,k,n):
assert len(A) >= k
if A.dtype not in (float,int) or A.ndim > 1:
return A[pyflip.without_repl(np.arange(len(A)),k,n)]
else:
return pyflip.without_repl(A,k,n)
from scipy import stats
@B.add_function()
def pp_overdraw(A,k,n):
N = len(A)
p = np.linspace(1-(k-1)/N,1-1/N,k-1).prod()
nn = int(n/p + 4*np.sqrt(n-n*p)) + 1
out = np.random.randint(0,N,(nn,k))
os = np.sort(out,1)
valid = (os[:,:-1] != os[:,1:]).all(1)
validx, = valid.nonzero()
while len(validx)<n: # very unlikely
replace = np.random.randint(0,N,(nn-len(validx),k))
rs = np.sort(replace,1)
rvalid = (rs[:,:-1] != rs[:,1:]).all(1)
out[~valid,:] = replace
valid[~valid] = rvalid
validx, = valid.nonzero()
return A[out[validx[:n]]]
@B.add_function()
def pp_fillin(A,k,n):
N = len(A)
out = np.random.randint(0,N,(n,k))
os = np.sort(out,1)
valid = (os[:,:-1] != os[:,1:]).all(1)
validx, = valid.nonzero()
while len(validx)<n:
replace = np.random.randint(0,N,(n-len(validx),k))
rs = np.sort(replace,1)
rvalid = (rs[:,:-1] != rs[:,1:]).all(1)
out[~valid,:] = replace
valid[~valid] = rvalid
validx, = valid.nonzero()
return A[out]
@B.add_function()
def OP(A,k,n):
subsets = np.zeros(n).reshape((n,1))
subsets = np.apply_along_axis(lambda x: np.random.choice(A, k, replace=False),1, subsets)
@B.add_function()
def Sam_Mason(A,k,n):
assert k <= len(A)
idx = np.random.randint(len(A) - np.arange(k), size=[n, k])
for i in range(k-1, 0, -1):
idx[:,i:] += idx[:,i:] >= idx[:,i-1,None]
return np.array(A)[idx]
@B.add_arguments('array size')
def argument_provider():
for exp in range(4, 15):
sz = 2**exp
A = np.arange(sz)
yield sz, MultiArgument([A,min(10,sz//10+1),5000])
r = B.run()
r.plot()
import pylab
pylab.savefig('norepl.png')
pythran 模块。原则上,它在 Python 中运行,但这会非常缓慢。更好地编译
pythran -O3 pyflip.py
文件<pyflip.py>
import numpy as np
#pythran export without_repl(int[:],int,int)
#pythran export without_repl(float[:],int,int)
def without_repl(A,k,n):
N = A.size
out = np.empty((n,k),A.dtype)
A = A.copy()
for i in range(n):
for j in range(k):
l = np.random.randint(j,N)
out[i,j] = A[l]
A[l] = A[j]
A[j] = out[i,j]
return out