【问题标题】：pixel distance on trees树上的像素距离
【发布时间】：2015-04-02 04:42:13
【问题描述】：

这是我的树。第一列是分支的标识符，其中0 是主干，L 是左侧的第一个分支，R 是右侧的第一个分支。 LL 是第二次分叉后最左边的分支，等等。变量length 包含每个分支的长度。

> tree
  branch length
1      0     20
2      L     12
3     LL     19
4      R     19
5     RL     12
6    RLL     10
7    RLR     12
8     RR     17

这是这棵树的图画

这是这棵树上的两个位置

pos1 = tree[3,]; pos1$length = 12
pos2 = tree[6,]; pos2$length = 3

我构建了这个算法来计算树上任意两点之间沿着树枝的最短距离。

distance = function(tree, pos1, pos2){
    if (identical(pos1$branch, pos2$branch)){Dist=pos1$length-pos2$length;return(Dist)}
    pos1path = strsplit(pos1$branch, "")[[1]]
    if (pos1path[1]!="0") {pos1path = c("0", pos1path)}
    pos2path = strsplit(pos2$branch, "")[[1]]
    if (pos2path[1]!="0") {pos2path = c("0", pos1path)}
    CommonTrace="included"; for (i in 1:min(length(pos1path), length(pos2path))) {if (pos1path[i] != pos2path[i]) {CommonTrace = i-1; break}}

    if(CommonTrace=="included"){
        CommonTrace = min(length(pos1path), length(pos2path))
        if (length(pos1path) > length(pos2path)) {longerpos = pos1; shorterpos = pos2; longerpospath = pos1path} else {longerpos = pos2; shorterpos = pos1; longerpospath = pos2path}
        distToNode = 0
        if ((CommonTrace+1) != length(longerpospath)){
            for (i in (CommonTrace+1):(length(longerpospath)-1)){ distToNode = distToNode + tree$length[tree$branch == paste(longerpospath[2:i], collapse='')]} 
        }
        Dist = distToNode + longerpos$length + (tree[tree$branch == shorterpos$branch,]$length-shorterpos$length)
        if (identical(shorterpos, pos1)){Dist=-Dist}
        return(Dist)
    } else { # if they are sisterbranch
        Dist=0 
        if((CommonTrace+1) != length(pos1path)){
            for (i in (CommonTrace+1):(length(pos1path)-1)){ Dist = Dist + tree$length[tree$branch == paste(pos1path[2:i], collapse='')]}   
        }
        if((CommonTrace+1) != length(pos2path)){
            for (i in (CommonTrace+1):(length(pos2path)-1)){ Dist = Dist + tree$length[tree$branch == paste(pos2path[2:i], collapse='')]}
        }
        Dist = Dist + pos1$length + pos2$length # signdistance does not apply!
        return(Dist)
    }
}

我认为该算法运行良好。然后，我将遍历所有感兴趣的位置。

for (i in allpositions){
   for (j in allpositions){
      mat[i,j] = distance(tree, i, j)
   }
}

问题是我有大约 50000 个位置的非常大的树，我想计算任意两个位置之间的距离，即我要计算数倍于 50000^2 的距离。它需要永远！你能帮我改进我的代码吗？

【问题讨论】：

你能展示你到目前为止所做的事情吗？
你可能做过，但我需要问一下......你确定你已经探索了可能有助于做到这一点的包的所有可能性吗？我在想也许是 igraph 或类似的东西？

标签： r performance tree

【解决方案1】：

这是一个临时答案，旨在帮助 OP 识别算法中的问题。

我在每个循环之后添加了cats；运行代码并查看新创建的tree_cat.txt 文件，它会提示您可能出现的问题。 m 矩阵中的单个单元格（例如m[1, 1]）被多次写入和写入。所以要检查索引。

好消息是矩阵单元中有 121*121 = 14641 次写入操作。所以问题实际上在于分配新矩阵值时使用的索引。

tree <- read.table(text="branch length
1      0     20
2      L     12
3     LL     19
4      R     19
5     RL     12
6    RLL     10
7    RLR     12
8     RR     17", header=TRUE)

m = matrix(0, ncol=sum(tree$length), nrow=sum(tree$length))
catn <- function(...) cat(..., "\n")
capture.output(
for (originbranch in 1:nrow(tree)) {
    catn("originbranch = ", originbranch)
    for (originpatch in 1:tree$length[originbranch]) {
        catn("  originpatch = ", originpatch)
        for (destinationbranch in 1:nrow(tree)) {
            catn("    destinationbranch = ", destinationbranch)
            for (destinationpatch in 1:tree$length[destinationbranch]){
                catn("      destinationpatch = ", destinationpatch)
                split_dest = unlist(strsplit(tree$branch[destinationbranch], ""))
                split_orig =  unlist(strsplit(tree$branch[originbranch], ""))
                depth = 0
                for (i in 1:min(c(length(split_orig), length(split_dest)))) {
                    catn("        i = ", i)
                    if (split_dest[i] == split_orig[i]){
                        depth = depth + 1
                    } else {
                        break
                    }
                }
                distdest = 0
                distorig = 0
                for (upperbranch in depth:length(split_orig)){
                    catn("        upperbranch_orig = ", upperbranch)
                    distorig = distorig + tree$length[tree$branch == paste(split_orig[1:upperbranch], collapse="")]
                }
                for (upperbranch in depth:length(split_dest)){
                    catn("        upperbranch_dest = ", upperbranch)
                    distdest = distdest + tree$length[tree$branch == paste(split_dest[1:upperbranch], collapse="")]
                }
                distorig = distorig + destinationpatch - tree$length[originbranch]
                distdest = distdest + destinationpatch - tree$length[destinationbranch]
                dist = distorig + distdest
                m[originpatch, destinationpatch] = dist ## PROBLEMATIC INDEXING!!
                catn(sprintf("        ----->   Matrix element written: m[%d, %d] = %d", originpatch, destinationpatch, dist))
            }
        }
    }
}, file = "tree_cat.txt")

【讨论】：

【解决方案2】：

我对你的像素距离的概念不是很清楚，但根据我的理解，下面的代码提供了一个函数 pixel_dist，它计算沿树枝指定的两个像素点之间的像素距离。

我使用 igraph 将您的树映射到一个图，其中分支是图边，图顶点是分支交点，并使用图函数进行基本的顶点距离计算。

library(igraph)
#  Assign vertex name to tree branch intersections
temp <- gsub("R","1", gsub("L","0",tree$branch))
temp <- strsplit(temp,split=character(0))
tree$upper_vert <- sapply(temp, function(x) {n <- length(x);  2^n + 2^((n-1):0)%*%as.numeric(x) }  )
tree$lower_vert <- as.integer(tree$upper_vert/2)
tree$branch[tree$branch=="0"] <- "trunk"
tree[tree$branch=="trunk",c("lower_vert","upper_vert")] <- c(0,1)

#  Create graph of tree
tree_graph <- graph.data.frame(tree[,c("lower_vert","upper_vert")], directed=TRUE)    # CORRECTED
E(tree_graph)$label <- paste(tree$branch, tree$length,sep="-")
E(tree_graph)$branch <- tree$branch
E(tree_graph)$length <- tree$length
E(tree_graph)$weight <- tree$length
#
#  assign x & y positions for plotting
#
V(tree_graph)$y <- as.integer(as.numeric(V(tree_graph)$name)^.5) + 1
V(tree_graph)["0"]$y <- 0
V(tree_graph)["1"]$y <- 1
V(tree_graph)$x <- as.numeric(V(tree_graph)$name) - 3*(2^(V(tree_graph)$y-2)) + .5
V(tree_graph)["0"]$x <- 0
V(tree_graph)["1"]$x <- 0
plot(tree_graph)
#
#  calculate distances between vertices
#
vert_dist <- shortest.paths(tree_graph, weights=V(tree_graph)$length, mode="all")  # distances between vertices
vert_dist_dir <- shortest.paths(tree_graph, weights=V(tree_graph)$length, mode="in")  # distances between vertices along directed edges ADDED
#
# Calculate distances from end vertex of each edge (branch)
#
edge_node <- get.edges(tree_graph, E(tree_graph))    #  list of vertices for each edge
brnch_dist <- sapply(edge_node[,2], function(x) vert_dist[x, edge_node[,2]])  # distance between end vertex of each edge
colnames(brnch_dist) <- E(tree_graph)$branch   
rownames(brnch_dist) <- E(tree_graph)$branch

brnch_dist_dir <- sapply(edge_node[,2], function(x) vert_dist_dir[x, edge_node[,2]])  # directed distance between end vertex of each edge - ADDED
colnames(brnch_dist_dir) <- E(tree_graph)$branch   
rownames(brnch_dist_dir) <- E(tree_graph)$branch
#
# calcuates total pixel distance given branches and pixel distances along branch  # CORRECTED
#
pixel_dist <- function(b1, pix1, b2, pix2, brnch_dist, brnch_dist_dir) { 
    if(!is.infinite(brnch_dist_dir[b1,b2]) )     #  directed edges same from b1 to b2
      pixel_dist <- brnch_dist[b1,b2] - E(tree_graph)[branch== b2]$length + E(tree_graph)[branch== b1]$length + pix2 - pix1 
    else {
      if(!is.infinite(brnch_dist_dir[b2,b1]) )   # directed edges same from b2 to b1
         pixel_dist <- brnch_dist[b1,b2] + E(tree_graph)[branch== b2]$length - E(tree_graph)[branch== b1]$length + pix2 - pix1 
    else                                         # opposing directed edges
        pixel_dist <- brnch_dist[b1,b2] - E(tree_graph)[branch== b2]$length - E(tree_graph)[branch== b1]$length + pix2 + pix1
    }
    return(pixel_dist)
}

pixel_dist(b1="L",pix1=3, b2="R", pix2=5, brnch_dist=brnch_dist, brnch_dist_dir=brnch_dist_dir)

带有分支名称、长度和方向的树形图

我不清楚您打算如何将像素距离放置在矩阵中，但您可以使用 pixel_dist 函数或类似的函数以及之前的代码来计算矩阵值。

编辑

上面的代码已被修改，以在计算像素距离时正确考虑边缘方向。

【讨论】：

如果我的问题不清楚，我很抱歉。我会尝试对其进行编辑以改进它。我还没有查看代码，但我希望命令 pixel_dist(b1="L",pix1=3, b2="R", pix2=5, brnch_dist=brnch_dist) 返回 8 而不是 22。它应该返回 8，因为从 pixel2 到分叉点的距离为 5，从 pixel1 到分叉点的距离为 3。谢谢
您的评论和示例有助于阐明我对像素距离的理解。我尝试更正代码以正确考虑边缘方向，并检查了包括您的示例在内的许多案例。它似乎适用于仅向上、仅向下和向上和向下方向遍历树枝的情况。