在R中的两个数据表之间按行计算匹配元素答案

【问题标题】：Count matching elements by row between two data tables in R在R中的两个数据表之间按行计算匹配元素
【发布时间】：2023-03-17 00:33:01
【问题描述】：

我在 R 中有两个数据框，我需要逐行计算元素匹配项，最后得到一列，其中包含两个表的笛卡尔积的长度和两行的 ID。此外，表格很大，行数不同，但列数相同。

我有以下代码，但是多次运行时速度很慢。

library(data.table)

table_1<-data.table(matrix(c(1:24),nrow = 4))
table_2<-data.table(matrix(c(11:34),nrow = 4))

names(table_1)<-c("s1", "s2","s3","s4","s5","s6")
names(table_2)<-c("a1","a2","a3","a4","a5","a6")

table_1$ID<-seq.int(nrow(table_1))
table_2$ID_ap<-seq.int(nrow(table_2))

setcolorder(table_1, c("ID", "s1", "s2","s3","s4","s5","s6"))
setcolorder(table_2, c("ID_ap","a1","a2","a3","a4","a5","a6"))

CJ.table<-function(X,Y) setkey(X[,c(k=1,.SD)],k)[Y[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL] 

join<-CJ.table(table_1,table_2)

R<-subset(join, select=c("ID_ap","ID"))

R$Ac<- (join$s1 == join$a1) + (join$s1 ==join$a2) + (join$s1 ==join$a3) + (join$s1 ==join$a4) + (join$s1 ==join$a5) + (join$s1 ==join$a6)+ 
(join$s2 == join$a1) + (join$s2 ==join$a2) + (join$s2 ==join$a3) + (join$s2 ==join$a4) + (join$s2 ==join$a5) + (join$s2 ==join$a6)+ 
(join$s3 == join$a1) + (join$s3 ==join$a2) + (join$s3 ==join$a3) + (join$s3 ==join$a4) + (join$s3 ==join$a5) + (join$s3 ==join$a6)+ 
(join$s4 == join$a1) + (join$s4 ==join$a2) + (join$s4 ==join$a3) + (join$s4 ==join$a4) + (join$s4 ==join$a5) + (join$s4 ==join$a6)+ 
(join$s5 == join$a1) + (join$s5 ==join$a2) + (join$s5 ==join$a3) + (join$s5 ==join$a4) + (join$s5 ==join$a5) + (join$s5 ==join$a6)+ 
(join$s6 == join$a1) + (join$s6 ==join$a2) + (join$s6 ==join$a3) + (join$s6 ==join$a4) + (join$s6 ==join$a5) + (join$s6 ==join$a6)

给了

   R
   ID_ap ID Ac
 1:     1  1  0
 2:     1  2  0
 3:     1  3  4
 4:     1  4  0
 5:     2  1  0
 6:     2  2  0
 7:     2  3  0
 8:     2  4  4
 9:     3  1  3
10:     3  2  0
11:     3  3  0
12:     3  4  0
13:     4  1  0
14:     4  2  3
15:     4  3  0
16:     4  4  0

【问题讨论】：

你的“data.frame”的维度是什么？它们包含什么值？
大约有 10k 行和 100 行矩阵，填充了小的非零正整数。
在一行中，值总是不同的？
是的，行中的值总是不同的，而矩阵中的行总是不同的。但是在两个矩阵之间可能有相等的行@Frank
关于您的 CJ.table，您可能对这个问题感兴趣：stackoverflow.com/q/25888706

标签： r performance optimization data.table match

【解决方案1】：

将数据以长格式放置，因为列顺序无关紧要：

setnames(table_2, "ID_ap", "ID")
tabs = rbind(
  melt(table_1, id="ID")[, variable := NULL],
  melt(table_2, id="ID")[, variable := NULL],
  idcol = TRUE)

(1) 对于每个值，识别相关对；和

(2) 对于对，计数值：

tabs[, 
  if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L])
, by=value][,
   .N
, by=.(ID1, ID2)]


   ID1 ID2 N
1:   3   1 4
2:   4   2 4
3:   1   3 3
4:   2   4 3

我认为，所有其他 (ID1, ID2) 组合都是零，不需要明确列举。

如果值在每个表中不同，就像在 OP 的示例中一样，那么我们可以简化：

tabs[, if (.N==2L) .(ID1 = ID[1L], ID2 = ID[2L]), by=value][, .N, by=.(ID1, ID2)]

【讨论】：

如果 OP 需要，一个简单的连接可以重建零
很好，这击败了（已经有点快的）OP 的代码——我查看了我的答案中的时间安排（其中包括一个缓慢的基本 R 解决方案）：stackoverflow.com/a/36558985/4598520

【解决方案2】：

假设两个表的行数和唯一值个数的乘积都不大：

x1 = unlist(table_1, FALSE, FALSE)
x2 = unlist(table_2, FALSE, FALSE)

具有共同的唯一值：

lvs = union(x1, x2)

以及tabulate每个表的每一行中每个唯一值的出现：

tab1 = matrix(tabulate(seq_len(nrow(table_1)) + (match(x1, lvs) - 1L) * nrow(table_1), 
                       nrow(table_1) * length(lvs)), 
              nrow(table_1), length(lvs))
tab2 = matrix(tabulate(seq_len(nrow(table_2)) + (match(x2, lvs) - 1L) * nrow(table_2), 
                       nrow(table_2) * length(lvs)), 
              nrow(table_2), length(lvs))

终于：

tcrossprod(tab1, tab2) #or 'tcrossprod(tab1 > 0L, tab2 > 0L)' to not count duplicate matches
#     [,1] [,2] [,3] [,4]
#[1,]    0    0    3    0
#[2,]    0    0    0    3
#[3,]    4    0    0    0
#[4,]    0    4    0    0

#and to change format (among different ways):
ans = tcrossprod(tab1, tab2)
cbind(c(row(ans)), c(col(ans)), c(ans))

如果tab1 和tab2 非常大，可以将它们构建为稀疏矩阵，一种方法可以是：

library(Matrix)
stab1 = xtabs(rep_len(1L, length(x1)) ~ 
                    rep_len(seq_len(nrow(table_1)), length(x1)) 
                    + factor(match(x1, lvs), lvs), 
              sparse = TRUE)
stab2 = xtabs(rep_len(1L, length(x2)) ~ 
                    rep_len(seq_len(nrow(table_2)), length(x2)) 
                    + factor(match(x2, lvs), lvs), 
              sparse = TRUE)
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
#  1 2 3 4
#1 . . 3 .
#2 . . . 3
#3 4 . . .
#4 . 4 . .

编辑

每行有 (1) 个小的正整数值和 (2) 个不同的值，使用 match/unique/unique/union 创建查找并可以避免制表：

x1 = unlist(table_1, FALSE, FALSE)
x2 = unlist(table_2, FALSE, FALSE)
nlvs = max(max(x1), max(x2))
stab1 = sparseMatrix(i = rep_len(seq_len(nrow(table_1)), length(x1)), 
                     j = x1, 
                     x = 1L, 
                     dims = c(nrow(table_1), nlvs))
stab2 = sparseMatrix(i = rep_len(seq_len(nrow(table_2)), length(x2)), 
                     j = x2, 
                     x = 1L, 
                     dims = c(nrow(table_2), nlvs))
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
#            
#[1,] . . 3 .
#[2,] . . . 3
#[3,] 4 . . .
#[4,] . 4 . .

summary(tcrossprod(stab1, stab2))
#4 x 4 sparse Matrix of class "dgCMatrix", with 4 entries 
#  i j x
#1 3 1 4
#2 4 2 4
#3 1 3 3
#4 2 4 3

【讨论】：

【解决方案3】：

怎么样：

colSums(apply(join[, !c("ID", "ID_ap"), with = F], 1, duplicated))
#[1] 0 0 4 0 0 0 0 4 3 0 0 0 0 3 0 0

或者，从头开始：

setkey(table_1, ID)
setkey(table_2, ID_ap)

ids = CJ(ID1 = table_1$ID, ID2 = table_2$ID_ap)
ids[, sum(duplicated(c(table_1[.(ID1), !'ID', with = F],
                       table_2[.(ID2), !'ID_ap', with = F])))
    , by = .(ID1, ID2)]
#    ID1 ID2 V1
# 1:   1   1  0
# 2:   1   2  0
# 3:   1   3  3
# 4:   1   4  0
# 5:   2   1  0
# 6:   2   2  0
# 7:   2   3  0
# 8:   2   4  3
# 9:   3   1  4
#10:   3   2  0
#11:   3   3  0
#12:   3   4  0
#13:   4   1  0
#14:   4   2  4
#15:   4   3  0
#16:   4   4  0

【讨论】：

【解决方案4】：

帖子中没有明确说明性能要求。但是，我已经创建了一个更大版本的可重现示例（如下），并且问题中的代码已经非常快了。

下面是如何在基础 R 中做到这一点，这是很好的衡量标准：

t1 <- as.data.frame(table_1)
t2 <- as.data.frame(table_2)

system.time({
  ## compute all combinations of indices
  indices <- merge(t1[1], t2[1])

  ## create a mega df including all rows, cbinded together
  rows <- cbind(t1[indices[ ,"ID"], -1], t2[indices[ , "ID_ap"], -1])

  t1_cols <- names(rows) %in% names(t1)
  t2_cols <- names(rows) %in% names(t2)

  ## compute the counts; this step takes most of the time
  ## ~ 14 of the 18 second in this example
  counts <- apply(rows, 1, function(r) sum(r[t1_cols] %in% r[t2_cols]))
})
out <- data.frame(indices, Ac=counts)

例如，对于来自下面的大型可重现问题 (dim(out) == c(1e6, 3))，上面的代码运行时间不到 20 秒。

   user  system elapsed
 17.879   0.348  18.245

编辑大型可重现问题：

library(data.table)
NROW <- 1e4
NROW2 <- 1e2
table_1<-data.table(matrix(c(1:24),nrow = NROW, ncol=6))
table_2<-data.table(matrix(c(11:34),nrow = NROW2, ncol=6))

names(table_1)<-c("s1", "s2","s3","s4","s5","s6")
names(table_2)<-c("a1","a2","a3","a4","a5","a6")

table_1$ID<-seq.int(nrow(table_1))
table_2$ID_ap<-seq.int(nrow(table_2))

setcolorder(table_1, c("ID", "s1", "s2","s3","s4","s5","s6"))
setcolorder(table_2, c("ID_ap","a1","a2","a3","a4","a5","a6"))

OP 的解决方案运行速度比这个答案快得多

CJ.table<-function(X,Y) setkey(X[,c(k=1,.SD)],k)[Y[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL] 

join<-CJ.table(table_1,table_2)

R<-subset(join, select=c("ID_ap","ID"))

system.time({
   R$Ac<- (join$s1 == join$a1) + (join$s1 ==join$a2) + (join$s1 ==join$a3) + (join$s1 ==join$a4) + (join$s1 ==join$a5) + (join$s1 ==join$a6)+
  (join$s2 == join$a1) + (join$s2 ==join$a2) + (join$s2 ==join$a3) + (join$s2 ==join$a4) + (join$s2 ==join$a5) + (join$s2 ==join$a6)+
  (join$s3 == join$a1) + (join$s3 ==join$a2) + (join$s3 ==join$a3) + (join$s3 ==join$a4) + (join$s3 ==join$a5) + (join$s3 ==join$a6)+
  (join$s4 == join$a1) + (join$s4 ==join$a2) + (join$s4 ==join$a3) + (join$s4 ==join$a4) + (join$s4 ==join$a5) + (join$s4 ==join$a6)+
  (join$s5 == join$a1) + (join$s5 ==join$a2) + (join$s5 ==join$a3) + (join$s5 ==join$a4) + (join$s5 ==join$a5) + (join$s5 ==join$a6)+
  (join$s6 == join$a1) + (join$s6 ==join$a2) + (join$s6 ==join$a3) + (join$s6 ==join$a4) + (join$s6 ==join$a5) + (join$s6 ==join$a6)
})
#    user  system elapsed
# 0.295   0.044   0.339

但弗兰克的答案中的解决方案仍然更快

setnames(table_2, "ID_ap", "ID")
tabs = rbind(
               melt(table_1, id="ID")[, variable := NULL],
                 melt(table_2, id="ID")[, variable := NULL],
                 idcol = TRUE)

system.time({out3 <- tabs[,
        if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L])
        , by=value][,
           .N
        , by=.(ID1, ID2)]})
#   user  system elapsed
#  0.109   0.013   0.122

【讨论】：

感谢您发布基准。我希望有人会。我很想知道其他人是否也表现良好。很难知道在system.time() 调用中包含/排除什么。我会争辩说我的tabs 创建属于外部（就像你在这里拥有的那样），因为数据应该以这种方式开始存储。但是 OP 对 join 和 R 的创建属于调用内部，因为它们是仅用于此计算的过程的一部分（但它们在此处的调用之外）。不过，这很主观。
谢谢。你的回答给我留下了深刻的印象，现在想学习一些data.table。我也同意主观性。
如果我们将 OP 的 join 和 R 放在 system.time 中，那么用户时间会增加到 0.590。我也试过@eddi 的colSums 行，它几乎和我的一样慢（user = 10.581`）。

【解决方案5】：

这是一种可能性：

> t1<-data.frame(matrix(c(1:24),nrow = 4))
> t2<-data.frame(matrix(c(11:34),nrow = 4))
> ret<-expand.grid(r1=1:nrow(t1),r2=1:nrow(t2))
> ret$matches<-apply(ret,1,function(a)sum(t1[a[1],] %in% t2[a[2],]))
> ret
   r1 r2 matches
1   1  1       0
2   2  1       0
3   3  1       4
4   4  1       0
5   1  2       0
6   2  2       0
7   3  2       0
8   4  2       4
9   1  3       3
10  2  3       0
11  3  3       0
12  4  3       0
13  1  4       0
14  2  4       3
15  3  4       0
16  4  4       0

【讨论】：

感谢@mrip，在小矩阵上工作得很好，但是当使用像 10k 行这样的大矩阵时，它比原始代码花费的时间要长得多，我会尝试更深入地了解它。
如果你有一个大数据集并且需要速度我会建议（1）不要使用data.frame或data.table，而只使用矩阵和（2）使用Rcpp写一个更高效匹配器。