查找单词的每个发音排列答案

【问题标题】：Finding every pronounciation permutation of a word查找单词的每个发音排列
【发布时间】：2017-03-16 21:30:38
【问题描述】：

我正在尝试编写一个函数，该函数使用发音词典生成单词的所有可能发音排列。

# Dictionary
sounddef <- t(matrix(strsplit('c,k,c,s,ou,uh,n,n,t,t,r,r,y,ee,w,w,o,oh,o,uh,th,th,s,s,m,m',",")[[1]], nrow = 2))

# The first column is the written letter and the second is a possible pronunciation.

match_rec <- function(x, sounddef) {
  if (!nzchar(x)) return("")
  returner <- NULL
  for (i in 1:nrow(sounddef)) {
    v <- sounddef[i,]
    char <- paste0("^",v[1])

    if (grepl(char, x)) 
      returner  <- c(returner, paste0(v[1],'->',v[2], ",", 
                                      match_rec(gsub(char, "", x), sounddef), collapse=""))
  }
  returner
}

# Unfortunately this does not return the right values
match_rec("country", sounddef)
[1] "c->k,ou->uh,n->n,t->t,r->r,y->ee,c->k,o->oh,c->k,o->uh,"
[2] "c->s,ou->uh,n->n,t->t,r->r,y->ee,c->s,o->oh,c->s,o->uh,"

它应该返回的值是：

[1] "c->k,ou->uh,n->n,t->t,r->r,y->ee"
[2] "c->s,ou->uh,n->n,t->t,r->r,y->ee"

因为在字典中有两种可能的 c 发音方式。

【问题讨论】：

标签： r permutation

【解决方案1】：

我以稍微不同的方式解决了这个问题，并添加了一些对极端情况的支持，例如相同的字符多次出现，以及需要在多个匹配项之间进行选择（通过取最长的匹配项）。请注意，我使用了 stringr 和 purrr 包中的一些函数。我确信该功能可以优化，但可能会让您开始...

library(stringr)
library(purrr)

match_rec <- function(x, sound_dict) {
  if (!nzchar(x)) return("")

  # Helper variables
  key_matches <- c() # This can be optimized if number of possible matches is known
  char_keys   <- sound_dict[,1]
  unique_keys <- unique(char_keys)

  while(nzchar(x)) {
    # Find matches to beginning of string
    matches <- str_detect(x, paste0("^", unique_keys))
    if (any(matches)) {
      # Take the longest match
      char_match <- max(unique_keys[matches])
      key_matches <- c(key_matches, char_match)
      x <- str_sub(x, 1 + nchar(char_match))
    } else {
      x <- str_sub(x, 2)
    }
  }

  # Return all pronunciation permutations
  expand.grid(
    map(key_matches, ~ paste(., sound_dict[. == char_keys, 2], sep = "->"))
  )
}

一些输出示例...

sounddef <- t(matrix(strsplit('c,k,c,s,ou,uh,n,n,t,t,r,r,y,ee,w,w,o,oh,o,uh,th,th,s,s,m,m',",")[[1]], nrow = 2))

match_rec("country", sounddef)
#>   Var1   Var2 Var3 Var4 Var5  Var6
#> 1 c->k ou->uh n->n t->t r->r y->ee
#> 2 c->s ou->uh n->n t->t r->r y->ee

match_rec("counro", sounddef)
#>   Var1   Var2 Var3 Var4  Var5
#> 1 c->k ou->uh n->n r->r o->oh
#> 2 c->s ou->uh n->n r->r o->oh
#> 3 c->k ou->uh n->n r->r o->uh
#> 4 c->s ou->uh n->n r->r o->uh

match_rec("ccwouo", sounddef)
#>   Var1 Var2 Var3   Var4  Var5
#> 1 c->k c->k w->w ou->uh o->oh
#> 2 c->s c->k w->w ou->uh o->oh
#> 3 c->k c->s w->w ou->uh o->oh
#> 4 c->s c->s w->w ou->uh o->oh
#> 5 c->k c->k w->w ou->uh o->uh
#> 6 c->s c->k w->w ou->uh o->uh
#> 7 c->k c->s w->w ou->uh o->uh
#> 8 c->s c->s w->w ou->uh o->uh

match_rec("", sounddef)
#> [1] ""

【讨论】：

【解决方案2】：

我最终也尝试了一些不同的东西。这是一个比你想出的解决方案效率低的解决方案，所以我只会发布这个以防其他人想要参考。

match_rec2 <- function(x, sounddef) {
  # Reduce sound dictionary to only possibly used sounds
  sr <- sounddef %>% subset(sapply(sounddef[,1], function(x) x %>% grepl(x)))

  # Loop through each character then each row in the dictionary
  for (i in 1:nchar(myword)) for (ii in 1:nrow(sr))
    x <- unique(c(x, str_replace(x, sr[ii,1], toupper(paste0(",", sr[ii,1],'->',sr[ii,2])))))

  tolower(substr(x[x==toupper(x)], 2, 100)) %>% 
    sapply(function(x) x %>% strsplit(',') %>% unlist) %>% t
}

                                 [,1]   [,2]     [,3]   [,4]   [,5]   [,6]   
c->k,ou->uh,n->n,t->t,r->r,y->ee "c->k" "ou->uh" "n->n" "t->t" "r->r" "y->ee"
c->s,ou->uh,n->n,t->t,r->r,y->ee "c->s" "ou->uh" "n->n" "t->t" "r->r" "y->ee"

# match_rec("country", sounddef)
rbind(microbenchmark::microbenchmark(match_rec("country", sounddef)),
      microbenchmark::microbenchmark(match_rec2("country", sounddef)))  
#Unit: microseconds
                           expr       min        lq      mean    median        uq       max neval
 match_rec("country", sounddef)   994.215  1020.542  1167.747  1043.746  1440.897  1609.574   100
match_rec2("country", sounddef) 41038.107 44909.427 52217.281 49015.023 54858.039 86680.030   100

【讨论】：