【问题标题】：Optimizing speed of nearest search function in R [closed]优化R中最近搜索函数的速度[关闭]
【发布时间】：2016-12-30 00:28:48
【问题描述】：

我正在尝试使此功能更快（最好使用 RcppAmadillo 或其他替代方法）。 myfun 采用一个矩阵 mat，它可以变得非常大，但总是两列。 myfun 为矩阵中的每一行找到与每一行的绝对值相差 +1 或 -1 的最近行

如下面的例子，垫子的第一行是3,3。因此，myfun 将输出一个列表，其中 2 和 3 行最接近第 1 行，而不是第 5 行，距离 +2。

library(microbenchmark)
dim(mat)
[1] 1000    2
head(mat)
      x y
[1,]  3 3
[2,]  3 4
[3,]  3 2
[4,]  7 3
[5,]  4 4
[6,] 10 1

output
[[1]]
[1] 2 3

[[2]]
[1] 1

[[3]]
[1] 1

[[4]]
integer(0)

[[5]]
integer(0)

[[6]]
integer(0)


microbenchmark( myfun(mat), times =  100) #mat of 1000 rows
#    Unit: milliseconds
#             expr      min       lq     mean   median       uq      max neval
#    myfun(mat) 89.30126 90.28618 95.50418 90.91281 91.50875 180.1505   100

microbenchmark( myfun(mat), times =  100) #mat of 10,000 rows
#    Unit: seconds
#             expr      min       lq     mean   median       uq      max neval
#    myfun(layout.old) 5.912633 5.912633 5.912633 5.912633 5.912633 5.912633     1

这就是 myfun 的样子

myfun = function(x){
    doo <- function(j) {
        j.mat <- matrix(rep(j, length = length(x)), ncol = ncol(x), byrow = TRUE)
        j.abs <- abs(j.mat - x)
        return(which(rowSums(j.abs) == 1))
    }
    return(apply(x, 1, doo))
}

【问题讨论】：

很少有关于更快/更高效的distance 计算的帖子可能会有所帮助...stackoverflow.com/questions/16190214/…、stackoverflow.com/questions/13668675/…、stackoverflow.com/questions/17138179/…、r.789695.n4.nabble.com/…。使用哪个取决于您的数据的未说明大小
不应该output[[2]] = 1, 5 和output[[5]] = 2？
酷，您列出了dist 函数的版本！但是，没有尝试将其翻译成 C++。因此，我们无法真正帮助您，因为 StackOverflow 不是免费的代码翻译服务。
删除 rcpp 和 armadillo 标签。这个问题与任何一个都无关，除了 OP 梦想有人为他重写他的代码，正如@coatless 指出的那样。不太可能……

标签： r list matrix

【解决方案1】：

下面，我有一个base R 解决方案，它比OP 提供的myfun 快得多。

myDistOne <- function(m) {
    v1 <- m[,1L]; v2 <- m[,2L]
    rs <- rowSums(m)
    lapply(seq_along(rs), function(x) {
        t1 <- which(abs(rs[x] - rs) == 1)
        t2 <- t1[which(abs(v1[x] - v1[t1]) <= 1)]
        t2[which(abs(v2[x] - v2[t2]) <= 1)]
    })
}

以下是一些基准：

library(microbenchmark)
set.seed(9711)
m1 <- matrix(sample(50, 2000, replace = TRUE), ncol = 2)   ## 1,000 rows
microbenchmark(myfun(m1), myDistOne(m1))
Unit: milliseconds
         expr      min       lq     mean   median       uq      max neval cld
    myfun(m1) 78.61637 78.61637 80.47931 80.47931 82.34225 82.34225     2   b
myDistOne(m1) 27.34810 27.34810 28.18758 28.18758 29.02707 29.02707     2  a 

identical(myfun(m1), myDistOne(m1))
[1] TRUE

m2 <- matrix(sample(200, 20000, replace = TRUE), ncol = 2)  ## 10,000 rows
microbenchmark(myfun(m2), myDistOne(m2))
Unit: seconds
         expr      min       lq     mean   median       uq      max neval cld
    myfun(m2) 5.219318 5.533835 5.758671 5.714263 5.914672 7.290701   100   b
myDistOne(m2) 1.230721 1.366208 1.433403 1.419413 1.473783 1.879530   100  a 

identical(myfun(m2), myDistOne(m2))
[1] TRUE

这是一个非常大的例子：

m3 <- matrix(sample(1000, 100000, replace = TRUE), ncol = 2) ## 50,000 rows
system.time(testJoe <- myDistOne(m3))
   user  system elapsed 
 26.963  10.988  37.973 

system.time(testUser <- myfun(m3))
   user  system elapsed 
148.444  33.297 182.639 

identical(testJoe, testUser)
[1] TRUE

我确信有更快的解决方案。也许通过预先对 rowSums 进行排序并从那里开始工作可以看到改进（它也可能变得非常混乱）。

更新

正如我预测的那样，使用已排序的 rowSums 会更快（而且更丑！）

myDistOneFast <- function(m) {
    v1 <- m[,1L]; v2 <- m[,2L]
    origrs <- rowSums(m)
    mySort <- order(origrs)
    rs <- origrs[mySort]
    myDiff <- c(0L, diff(rs))
    brks <- which(myDiff > 0L)
    lenB <- length(brks)
    n <- nrow(m)
    myL <- vector("list", length = n)

    findRows <- function(v, s, r, u1, u2) {
        lapply(v, function(x) {
            sx <- s[x]
            tv1 <- s[r]
            tv2 <- tv1[which(abs(u1[sx] - u1[tv1]) <= 1)]
            tv2[which(abs(u2[sx] - u2[tv2]) <= 1)]
        })
    }

    t1 <- brks[1L]; t2 <- brks[2L]
    ## setting first index in myL
    myL[mySort[1L:(t1-1L)]] <- findRows(1L:(t1-1L), mySort, t1:(t2-1L), v1, v2)
    k <- t0 <- 1L

    while (k < (lenB-1L)) {
        t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- brks[k+2L]
        vec <- t1:(t2-1L)
        if (myDiff[t1] == 1L) {
            if (myDiff[t2] == 1L) {
                myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
            } else {
                myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
            }
        } else if (myDiff[t2] == 1L) {
            myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
        }
        if (myDiff[t2] > 1L) {
            if (myDiff[t3] > 1L) {
                k <- k+2L; t0 <- t2
            } else {
                k <- k+1L; t0 <- t1
            }
        } else {k <- k+1L; t0 <- t1}
    }

    ## setting second to last index in myL
    if (k == lenB-1L) {
        t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- n+1L; vec <- t1:(t2-1L)
        if (myDiff[t1] == 1L) {
            if (myDiff[t2] == 1L) {
                myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
            } else {
                myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
            }
        } else if (myDiff[t2] == 1L) {
            myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
        }
        k <- k+1L; t0 <- t1
    }

    t1 <- brks[k]; vec <- t1:n
    if (myDiff[t1] == 1L) {
        myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
    }

    myL
}

结果甚至不接近。 myDistOneFast 在非常大的矩阵上比 OP 的原始 myfun 快 100 倍以上，并且可以很好地扩展。以下是一些基准：

microbenchmark(OP = myfun(m1), Joe = myDistOne(m1), JoeFast = myDistOneFast(m1))
Unit: milliseconds
   expr      min       lq     mean   median       uq       max neval
     OP 57.60683 59.51508 62.91059 60.63064 61.87141 109.39386   100
    Joe 22.00127 23.11457 24.35363 23.87073 24.87484  58.98532   100
JoeFast 11.27834 11.99201 12.59896 12.43352 13.08253  15.35676   100

microbenchmark(OP = myfun(m2), Joe = myDistOne(m2), JoeFast = myDistOneFast(m2))
Unit: milliseconds
   expr       min        lq      mean    median        uq       max neval
     OP 4461.8201 4527.5780 4592.0409 4573.8673 4633.9278 4867.5244   100
    Joe 1287.0222 1316.5586 1339.3653 1331.2534 1352.3134 1524.2521   100
JoeFast  128.4243  134.0409  138.7518  136.3929  141.3046  172.2499   100

system.time(testJoeFast <- myDistOneFast(m3))
user  system elapsed 
0.68    0.00    0.69   ### myfun took over 100s!!!

为了测试相等性，我们必须对每个索引向量进行排序。我们也不能使用identical 进行比较，因为myL 被初始化为一个空列表，因此一些索引包含NULL 值（这些对应于myfun 和myDistOne 的结果中的integer(0) )。

testJoeFast <- lapply(testJoeFast, sort)
all(sapply(1:50000, function(x) all(testJoe[[x]]==testJoeFast[[x]])))
[1] TRUE

unlist(testJoe[which(sapply(testJoeFast, is.null))])
integer(0)

这是一个有 500,000 行的示例：

set.seed(42)
m4 <- matrix(sample(2000, 1000000, replace = TRUE), ncol = 2)
system.time(myDistOneFast(m4))
   user  system elapsed 
  10.84    0.06   10.94

以下是算法工作原理的概述：

计算行和
对 rowSums 排序（即从排序后的向量的原始向量返回索引）
致电diff
标记每个非零实例
确定小范围内的哪些索引满足 OP 的要求
使用2中计算的有序向量确定原始索引

这比每次将一个 rowSum 与所有 rowSum 进行比较要快得多。

【讨论】：