【问题标题】:Finding every pronounciation permutation of a word查找单词的每个发音排列
【发布时间】:2017-03-16 21:30:38
【问题描述】:

我正在尝试编写一个函数,该函数使用发音词典生成单词的所有可能发音排列。

# Dictionary
sounddef <- t(matrix(strsplit('c,k,c,s,ou,uh,n,n,t,t,r,r,y,ee,w,w,o,oh,o,uh,th,th,s,s,m,m',",")[[1]], nrow = 2))

# The first column is the written letter and the second is a possible pronunciation.

match_rec <- function(x, sounddef) {
  if (!nzchar(x)) return("")
  returner <- NULL
  for (i in 1:nrow(sounddef)) {
    v <- sounddef[i,]
    char <- paste0("^",v[1])

    if (grepl(char, x)) 
      returner  <- c(returner, paste0(v[1],'->',v[2], ",", 
                                      match_rec(gsub(char, "", x), sounddef), collapse=""))
  }
  returner
}

# Unfortunately this does not return the right values
match_rec("country", sounddef)
[1] "c->k,ou->uh,n->n,t->t,r->r,y->ee,c->k,o->oh,c->k,o->uh,"
[2] "c->s,ou->uh,n->n,t->t,r->r,y->ee,c->s,o->oh,c->s,o->uh,"

它应该返回的值是:

[1] "c->k,ou->uh,n->n,t->t,r->r,y->ee"
[2] "c->s,ou->uh,n->n,t->t,r->r,y->ee"

因为在字典中有两种可能的 c 发音方式。

【问题讨论】:

    标签: r permutation


    【解决方案1】:

    我以稍微不同的方式解决了这个问题,并添加了一些对极端情况的支持,例如相同的字符多次出现,以及需要在多个匹配项之间进行选择(通过取最长的匹配项)。请注意,我使用了 stringr 和 purrr 包中的一些函数。我确信该功能可以优化,但可能会让您开始...

    library(stringr)
    library(purrr)
    
    match_rec <- function(x, sound_dict) {
      if (!nzchar(x)) return("")
    
      # Helper variables
      key_matches <- c() # This can be optimized if number of possible matches is known
      char_keys   <- sound_dict[,1]
      unique_keys <- unique(char_keys)
    
      while(nzchar(x)) {
        # Find matches to beginning of string
        matches <- str_detect(x, paste0("^", unique_keys))
        if (any(matches)) {
          # Take the longest match
          char_match <- max(unique_keys[matches])
          key_matches <- c(key_matches, char_match)
          x <- str_sub(x, 1 + nchar(char_match))
        } else {
          x <- str_sub(x, 2)
        }
      }
    
      # Return all pronunciation permutations
      expand.grid(
        map(key_matches, ~ paste(., sound_dict[. == char_keys, 2], sep = "->"))
      )
    }
    

    一些输出示例...

    sounddef <- t(matrix(strsplit('c,k,c,s,ou,uh,n,n,t,t,r,r,y,ee,w,w,o,oh,o,uh,th,th,s,s,m,m',",")[[1]], nrow = 2))
    
    match_rec("country", sounddef)
    #>   Var1   Var2 Var3 Var4 Var5  Var6
    #> 1 c->k ou->uh n->n t->t r->r y->ee
    #> 2 c->s ou->uh n->n t->t r->r y->ee
    
    match_rec("counro", sounddef)
    #>   Var1   Var2 Var3 Var4  Var5
    #> 1 c->k ou->uh n->n r->r o->oh
    #> 2 c->s ou->uh n->n r->r o->oh
    #> 3 c->k ou->uh n->n r->r o->uh
    #> 4 c->s ou->uh n->n r->r o->uh
    
    match_rec("ccwouo", sounddef)
    #>   Var1 Var2 Var3   Var4  Var5
    #> 1 c->k c->k w->w ou->uh o->oh
    #> 2 c->s c->k w->w ou->uh o->oh
    #> 3 c->k c->s w->w ou->uh o->oh
    #> 4 c->s c->s w->w ou->uh o->oh
    #> 5 c->k c->k w->w ou->uh o->uh
    #> 6 c->s c->k w->w ou->uh o->uh
    #> 7 c->k c->s w->w ou->uh o->uh
    #> 8 c->s c->s w->w ou->uh o->uh
    
    match_rec("", sounddef)
    #> [1] ""
    

    【讨论】:

      【解决方案2】:

      我最终也尝试了一些不同的东西。这是一个比你想出的解决方案效率低的解决方案,所以我只会发布这个以防其他人想要参考。

      match_rec2 <- function(x, sounddef) {
        # Reduce sound dictionary to only possibly used sounds
        sr <- sounddef %>% subset(sapply(sounddef[,1], function(x) x %>% grepl(x)))
      
        # Loop through each character then each row in the dictionary
        for (i in 1:nchar(myword)) for (ii in 1:nrow(sr))
          x <- unique(c(x, str_replace(x, sr[ii,1], toupper(paste0(",", sr[ii,1],'->',sr[ii,2])))))
      
        tolower(substr(x[x==toupper(x)], 2, 100)) %>% 
          sapply(function(x) x %>% strsplit(',') %>% unlist) %>% t
      }
      
                                       [,1]   [,2]     [,3]   [,4]   [,5]   [,6]   
      c->k,ou->uh,n->n,t->t,r->r,y->ee "c->k" "ou->uh" "n->n" "t->t" "r->r" "y->ee"
      c->s,ou->uh,n->n,t->t,r->r,y->ee "c->s" "ou->uh" "n->n" "t->t" "r->r" "y->ee"
      
      # match_rec("country", sounddef)
      rbind(microbenchmark::microbenchmark(match_rec("country", sounddef)),
            microbenchmark::microbenchmark(match_rec2("country", sounddef)))  
      #Unit: microseconds
                                 expr       min        lq      mean    median        uq       max neval
       match_rec("country", sounddef)   994.215  1020.542  1167.747  1043.746  1440.897  1609.574   100
      match_rec2("country", sounddef) 41038.107 44909.427 52217.281 49015.023 54858.039 86680.030   100
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2022-07-03
        • 2011-07-02
        • 1970-01-01
        • 1970-01-01
        • 2022-01-15
        • 2016-04-22
        • 2013-04-28
        相关资源
        最近更新 更多