【问题标题】:Does Column of Sentences Contain Word in Another Column of Sentences?句子列是否包含另一列句子中的单词?
【发布时间】:2021-07-16 17:12:48
【问题描述】:

我有两个大表,每个表都包含一个“句子”列和一串单词。我很好奇哪些记录(真/假输出)的单词在任一列的任何句子中都可以找到。我的表非常大,下面的代码可能需要很长时间。有没有更快的方法来做到这一点?

谢谢!

# Determine if any "words" in either column of sentences match.

# Packages
library(tidyverse)

# Help functions
# Helper: TRUE if at least one element of `b` also occurs in `a`.
# match() with nomatch = 0L is the textbook expansion of %in%.
helper_in_2 <- function(b, a) {
  any(match(b, a, nomatch = 0L) > 0L)
}
# For each word vector in `b`, report whether it shares a word with `a`.
# Returns a list of logicals, one entry per element of `b`.
helper_in <- function(a, b) {
  lapply(b, function(words) helper_in_2(words, a))
}

# Sample columns of sentences (small stand-ins for the two large tables).
sentence_col_a <- c("This is an example sentence.", "Here is another sample sentence?", "One more sentence that is not complicated.", "Last sentence to show an example!")
sentence_col_b <- c("Short string A.", "Another longer string.", "Final string example!")

# Tokenize: lower-case each sentence, then extract runs of letters,
# giving one character vector of words per sentence.
list_col_a <- str_to_lower(sentence_col_a) %>%
  str_extract_all("[:alpha:]+")
list_col_b <- str_to_lower(sentence_col_b) %>%
  str_extract_all("[:alpha:]+")

# Check for matches: rows = sentences of column a, columns = sentences of
# column b; a cell is 1 when the pair shares at least one word.
# (Everything after the first line is cosmetic labelling of the result;
# the real work is the sapply/lapply call.)
sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric) %>%
  t() %>%
  as.data.frame() %>%
  rename_at(vars(names(.)), function(x) sentence_col_b) %>%
  mutate(rownames = sentence_col_a) %>%
  tibble::column_to_rownames(var = "rownames")

输出:

Sentences Short string A. Another longer string. Final string example!
This is an example sentence. 0 0 1
Here is another sample sentence? 0 1 0
One more sentence that is not complicated. 0 0 0
Last sentence to show an example! 0 0 1

Ronak 回答后更新

library(microbenchmark)
# Compare the original lapply approach against Ronak's one-regex-per-
# sentence approach on the toy data.
# NOTE(review): grepl() coerces the list `list_col_b` to character by
# deparsing each element - confirm the pattern really runs against the
# intended sentence text before trusting this comparison.
microbenchmark("Original method:" = sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric),
               "Ronak's method:" = sapply(list_col_a, function(x) as.integer(grepl(sprintf('\\b(%s)\\b', paste0(x, collapse = '|')), list_col_b))))
#Unit: microseconds
#            expr   min     lq    mean median    uq    max neval
#Original method:  72.9  76.65  88.082  82.35  86.1  173.9   100
# Ronak's method: 262.1 277.40 354.741 286.40 348.6 3724.3   100

【问题讨论】:

    标签: r string performance


    【解决方案1】:

    这里我可以提供几个选项,但是嵌套的for-loop 方法可能是迄今为止最有效的方法:

    • outer
    # Cross every sentence of a with every sentence of b; unary `+` turns
    # outer()'s logical matrix into 0/1 integers. Reads the globals
    # list_col_a / list_col_b like the other candidates.
    TIC1 <- function() {
      shares_word <- function(x, y) any(x %in% y)
      +outer(list_col_a, list_col_b, FUN = Vectorize(shares_word))
    }
    
    • 嵌套sapply
    # Nested sapply over the globals list_col_a / list_col_b; returns a
    # 0/1 integer matrix (rows = a-sentences, cols = b-sentences).
    TIC2 <- function() {
      sapply(
        list_col_b,
        function(x) {
          sapply(
            list_col_a,
            # Presence flag: +any() yields 0 or 1. The original sum()
            # counted every shared word, so sentence pairs sharing several
            # words reported values > 1, unlike TIC1/TIC3.
            function(y) +any(y %in% x)
          )
        }
      )
    }
    
    • 嵌套for循环
    # Nested for-loop over a preallocated matrix; the b-side word vector
    # is hoisted out of the inner loop. `+res` converts logical to 0/1.
    TIC3 <- function() {
      n_a <- length(list_col_a)
      n_b <- length(list_col_b)
      res <- matrix(nrow = n_a, ncol = n_b)
      for (j in seq_len(n_b)) {
        words_b <- list_col_b[[j]]
        for (i in seq_len(n_a)) {
          res[i, j] <- any(list_col_a[[i]] %in% words_b)
        }
      }
      +res
    }
    

    基准测试

    # Original solution (tidyverse): builds the 0/1 matrix, transposes it
    # so rows follow list_col_a, then labels rows/columns with the
    # original sentence strings.
    original <- function() {
      sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric) %>%
        t() %>%
        as.data.frame() %>%
        rename_at(vars(names(.)), function(x) sentence_col_b) %>%
        mutate(rownames = sentence_col_a) %>%
        tibble::column_to_rownames(var = "rownames")
    }
    
    # Waldi's data.table solution: unnest both word lists to long format
    # (one row per word, keyed by sentence id), join on the word column,
    # and keep the distinct matching (idxa, idxb) id pairs. Note: returns
    # id pairs of matches, not the full 0/1 matrix.
    Waldi <- function() {
      la <- data.table(id = 1:length(list_col_a), list_col_a)
      lb <- data.table(id = 1:length(list_col_b), list_col_b)

      la_long <- la[, .(words = unlist(list_col_a)), by = id]
      lb_long <- lb[, .(words = unlist(list_col_b)), by = id]
      # NA id = a b-word with no counterpart in a; drop those rows.
      unique(la_long[lb_long, on = .(words = words)][!is.na(id), .(idxa = id, idxb = i.id)])
    }
    
    # Nested-lapply solution. Outer index runs over `cols`, inner over
    # `rows`, so the flattened values are in column-major order.
    mustafaakben1 <- function(rows = list_col_a, cols = list_col_b) {
      # Reshape the nested list into a numeric matrix.
      to_matrix <- function(X_t) {
        matrix(unlist(X_t),
          nrow = length(rows),  # was length(list_col_a): ignored the argument
          ncol = length(cols),  # was length(list_col_b): ignored the argument
          # Data is column-major (inner loop = rows); byrow = TRUE
          # row-filled it and scrambled the result.
          byrow = FALSE
        )
      }
      to_matrix(lapply(
        seq_along(cols),
        FUN = function(X) {
          lapply(
            X = seq_along(rows),
            FUN = function(Y) {
              # Count of shared words (can exceed 1); callers display it
              # as a 0/1 flag on this sample data.
              sum(rows[[Y]] %in% cols[[X]])
            }
          )
        }
      ))
    }
    
    library(fastmatch)
    # Pruned nested loop: only words occurring on both sides can ever
    # produce a match, so restrict the loops to sentences containing at
    # least one such shared word; all other cells stay 0.
    mustafaakben2 <- function() {
      words_a <- unlist(list_col_a)
      words_b <- unlist(list_col_b)
      shared <- words_b[words_b %in% words_a]
      a_hits <- which(vapply(list_col_a, function(w) any(w %in% shared), logical(1)))
      b_hits <- which(vapply(list_col_b, function(w) any(w %in% shared), logical(1)))
      res <- matrix(0,
        nrow = length(list_col_a),
        ncol = length(list_col_b)
      )
      for (a in a_hits) {
        for (b in b_hits) {
          # %fin% is fastmatch's hash-table backed drop-in for %in%.
          res[a, b] <- any(list_col_a[[a]] %fin% list_col_b[[b]])
        }
      }
      +res
    }
    
    
    # ThomasIsCoding's outer solution
    TIC1 <- function() {
      # outer() over the two lists of word vectors; Vectorize() lets the
      # scalar predicate run pairwise. Unary + maps TRUE/FALSE to 1/0.
      is_shared <- Vectorize(function(x, y) any(x %in% y))
      +outer(list_col_a, list_col_b, FUN = is_shared)
    }
    
    # Nested sapply; returns a 0/1 integer matrix (rows = a-sentences,
    # cols = b-sentences).
    TIC2 <- function() {
      sapply(
        list_col_b,
        function(x) {
          sapply(
            list_col_a,
            # Presence flag: +any() yields 0 or 1. The original sum()
            # counted every shared word, reporting > 1 for pairs sharing
            # several words, unlike TIC1/TIC3.
            function(y) +any(y %in% x)
          )
        }
      )
    }
    
    # Same contract as the loop version: 0/1 integer matrix with rows
    # following list_col_a and columns following list_col_b.
    TIC3 <- function() {
      res <- vapply(
        list_col_b,
        function(wb) {
          vapply(list_col_a, function(wa) any(wa %in% wb), logical(1))
        },
        logical(length(list_col_a))
      )
      +res
    }
    
    # Relative timings of every candidate (unit = "relative" scales each
    # row by the fastest method, so 1.000 marks the winner).
    microbenchmark::microbenchmark(
      original(),
      Waldi(),
      mustafaakben1(),
      mustafaakben2(),
      TIC1(),
      TIC2(),
      TIC3(),
      unit = "relative"
    )
    

    你会看到

    Unit: relative
                expr        min         lq      mean     median         uq      max
          original() 172.895884 149.346066 49.841448 142.676077 134.111459 3.130206
             Waldi() 107.441122  92.004380 30.290680  88.474026  83.690267 1.971249
     mustafaakben1()   1.596981   1.551978  1.646884   1.610160   1.553021 1.683034
     mustafaakben2()   1.635812   1.731991  2.186106   1.912535   1.831179 2.332797
              TIC1()   3.854043   3.845866  1.977066   3.943445   3.707308 1.041416
              TIC2()   2.888118   2.627955  1.607401   2.719427   2.538536 1.142211
              TIC3()   1.000000   1.000000  1.000000   1.000000   1.000000 1.000000
     neval
       100
       100
       100
       100
       100
       100
       100
    

    【讨论】:

      【解决方案2】:

      尝试嵌套 lapply

      
      # Reshape a nested list (outer index = result columns, inner index =
      # result rows, i.e. column-major order) into a numeric matrix sized
      # by the global word lists.
      # Fixes vs. original: the parentheses were unbalanced (a stray `)`
      # after `ncol = ...` made the snippet unparseable), and byrow = TRUE
      # row-filled column-major data, which scrambles the result shown in
      # the expected output below.
      to_matrix <- function(X_t){
        matrix(unlist(X_t),
                 nrow = length(list_col_a),
                 ncol = length(list_col_b),
                 byrow = FALSE)
      }
      
      
      
      # Nested lapply: outer index walks `cols`, inner index walks `rows`,
      # producing per-column lists of shared-word counts which to_matrix()
      # reshapes into the result matrix.
      nested_lappy <- function(rows = list_col_a, cols = list_col_b) {
        per_col <- lapply(seq_along(cols), function(j) {
          lapply(seq_along(rows), function(i) {
            sum(rows[[i]] %in% cols[[j]])
          })
        })
        to_matrix(per_col)
      }
      
      
      > nested_lappy()
           [,1] [,2] [,3]
      [1,]    0    0    1
      [2,]    0    1    0
      [3,]    0    0    0
      [4,]    0    0    1
      
      

      您的矩阵可能也会很大。最好使用稀疏矩阵。您可以尝试使用Matrix 包。它可以帮助您以更节省内存的方式进行分析。

      这是基准

      # Relative timings (1.000 = fastest).
      # NOTE(review): `TIC()` here does not match any of the TIC1/TIC2/TIC3
      # definitions elsewhere on this page - confirm which one was timed.
      microbenchmark::microbenchmark(
        original(),
        Waldi(),
        TIC(),
        nested_lappy(),
        unit = "relative"
      )
      
      Unit: relative
                 expr      min        lq      mean   median        uq        max neval
           original() 99.97881 89.869163 83.011249 67.88434 69.883301 260.704657   100
              Waldi() 56.55076 51.185905 45.436361 39.35327 42.730942  46.438114   100
                TIC()  2.27000  2.249311  1.986625  1.84108  1.837013   3.974149   100
       nested_lappy()  1.00000  1.000000  1.000000  1.00000  1.000000   1.000000   100
      
      
      

      编辑

      我会在这里作弊,因为@ThomasIsCoding 是一个了不起的编码器。我需要作弊:)

      因此,由于您有一个巨大的表格,您需要专注于一种有效的方式来搜索您的关键字空间。您可能会注意到,并非所有关键字都有交集并且在句子中统一共享。因此,即使在开始搜索之前,您也可以消除搜索空间中的那些句子。通过这样做,我们可以只关注列维度和行维度共享的单词。

      # Pre-filter: only words present in BOTH columns can ever produce a
      # match, so collect those shared words first.
      search_keywords<- unlist(list_col_b)[unlist(list_col_b) %in% unlist(list_col_a)]
      # Indices of b-sentences / a-sentences containing at least one shared
      # word; mustafaakben2() below loops over these globals only.
      b_col_filter <- which(unlist(lapply(list_col_b, function(X) any(X %in% search_keywords))))
      a_row_filter <- which(unlist(lapply(list_col_a, function(X) any(X %in% search_keywords))))
      

      然后,使用fastmatch 包使%in% 更快。

      
      library(fastmatch)
      
      mustafaakben2 <- function() {
        # 0/1 match matrix; relies on the globals a_row_filter /
        # b_col_filter (computed above) so pruned rows/columns stay 0
        # without any comparison work.
        res <- matrix(0,
                      nrow = length(list_col_a),
                      ncol = length(list_col_b))
        for (a in a_row_filter) {
          for (b in b_col_filter) {
            # %fin% is fastmatch's hash-table backed, faster %in%.
            res[a, b] <- any(list_col_a[[a]] %fin% list_col_b[[b]])
            
          }
        }
        # Unary + keeps the result a plain numeric 0/1 matrix.
        +res
      }
      
      > mustafaakben2()
           [,1] [,2] [,3]
      [1,]    0    0    1
      [2,]    0    1    0
      [3,]    0    0    0
      [4,]    0    0    1
      
      
      

      基准测试结果

      # Relative timings (1.000 = fastest).
      # NOTE(review): `mustafaakben()` is not defined on this page (only
      # mustafaakben1 / mustafaakben2) - confirm which function was timed.
      microbenchmark::microbenchmark(
        original(),
        Waldi(),
        TIC1(),
        TIC2(),
        TIC3(),
        mustafaakben(),
        mustafaakben2(),
        unit = "relative"
      )
      
      Unit: relative
                  expr        min         lq       mean     median         uq         max neval cld
            original() 288.620155 254.429012 193.446439 190.457965 171.914286 115.0415822   100   c
               Waldi() 182.751938 153.864198 115.182908 115.778761 103.518095  36.9411765   100  b 
                TIC1()   6.581395   6.277778   5.074523   5.066372   4.685714   2.3732252   100 a  
                TIC2()   4.705426   4.385802   3.503269   3.466814   3.281905   1.5811359   100 a  
                TIC3()   1.767442   1.685185   1.360847   1.338496   1.249524   0.7728195   100 a  
        mustafaakben()   2.589147   2.330247   1.944260   2.017699   1.864762   0.7322515   100 a  
       mustafaakben2()   1.000000   1.000000   1.000000   1.000000   1.000000   1.0000000   100 a  
      
      

      【讨论】:

      • 不错的“作弊”。缩小关键字的搜索空间是个好主意。还有一件事。我认为您应该将构建 b_col_filter 和 a_row_filter 的步骤放在您的函数 mustafaakben2 中并查看性能，这可以在具有相同输入参数的不同选项之间提供公平的基准测试。
      • 也许你可以看到我更新的基准测试结果。
      • 是的,你是对的!好工作!真的很有趣。
      【解决方案3】:

      您可以使用data.table 连接来获取匹配句子的ID。

      library(data.table)
      
      # Original solution (tidyverse pipeline), reproduced as the baseline
      # for the data.table comparison below.
      original <- function(){
      
      sapply(lapply(list_col_a, helper_in, list_col_b), as.numeric) %>%
        t() %>%
        as.data.frame() %>%
        rename_at(vars(names(.)), function(x) sentence_col_b) %>%
        mutate(rownames = sentence_col_a) %>%
        tibble::column_to_rownames(var = "rownames")
      
      }
      
      # data.table solution: melt both lists into long (id, word) tables,
      # join on the word column, and return the distinct matching id pairs
      # instead of a full 0/1 matrix.
      new <- function(){
      
      la <- data.table(id = 1:length(list_col_a),list_col_a)
      lb <- data.table(id = 1:length(list_col_b),list_col_b)
      
      la_long <- la[,.(words=unlist(list_col_a)),by= id]
      lb_long <- lb[,.(words=unlist(list_col_b)),by= id]
      # NA id = b-word with no counterpart in a; drop, keep (idxa, idxb).
      unique(la_long[lb_long, on=.(words=words)][!is.na(id),.(idxa=id, idxb = i.id)])
      
      }
      
      
      new()
         idxa idxb
      1:    2    2
      2:    1    3
      3:    4    3
      
      microbenchmark::microbenchmark(original(),new())
      
      Unit: milliseconds
             expr    min     lq     mean median      uq     max neval cld
       original() 4.1623 5.1190 5.857155 5.5528 6.18345 23.5442   100   b
            new() 2.2492 2.7993 3.255741 3.1298 3.68645  5.1872   100  a 
      

      由于data.table 允许索引,这对于更多数量的句子/单词可能会更有效:在更大的数据集上进行测试。

      【讨论】:

        【解决方案4】:

        借助正则表达式,您可以通过一个sapply 调用来完成此操作。我们用list_col_a 中的每个值创建一个模式,并检查list_col_b 中是否存在任何值。

        # One regex per a-sentence: \b(word1|word2|...)\b, tested against
        # every element of list_col_b with a single grepl() call.
        # NOTE(review): grepl() coerces the list `list_col_b` to character
        # by deparsing each element (e.g. 'c("short", "string", "a")'), and
        # the result is transposed relative to the OP's matrix (columns =
        # a-sentences) - confirm both are acceptable before reuse.
        sapply(list_col_a, function(x) as.integer(grepl(sprintf('\\b(%s)\\b', 
               paste0(x, collapse = '|')), list_col_b)))
        
        #     [,1] [,2] [,3] [,4]
        #[1,]    0    0    0    0
        #[2,]    0    1    0    0
        #[3,]    1    0    0    1
        

        您可以按原样包含剩余代码以获取匹配项。

        【讨论】:

        • 您好 Ronak,感谢您的回复。您的代码当然更简洁,但使用微基准测试时需要更长的时间,这就是我正在寻找不同方法的原因。请参考我的编辑以查看时间。