【问题标题】：Count each occurrence of each element in a string（计算字符串中每个元素的出现次数）
【发布时间】：2019-10-18 11:39:12
【问题描述】：

我想计算字符串中每个字符的出现次数并将它们输出到数据框。

这是我的尝试：

# Asker's attempt: a one-row data frame holding the string to analyse.
q.data<-data.frame(number=1,string=c("COUNTTHESECHARACTERS"))
# NOTE: string.counter() is not defined anywhere in this snippet.
q.data[,3]<-string.counter(strings=q.data$string, pattern="A")
# NOTE: this assigns to column 3 again, replacing the "A" result with the "B" result.
q.data[,3]<-string.counter(strings=q.data$string, pattern="B")

我想得到类似这样的输出：

# Desired layout: the original string followed by one count column per character.
x <- c("string","C","O","U","...")
# NOTE: `df` is not created in this snippet, so these lines are illustrative only.
colnames(df) <- x
df[1,] <- c("COUNTTHESECHARACTERS","3","1","1","...")
df

【问题讨论】：

标签： r string count find-occurrences

【解决方案1】：

我在您的示例中添加了另一行以获得更多变化。这应该非常有效：

library(tibble)
library(purrr)
library(dplyr)
library(stringr)

# Two sample rows (upper- and lower-case) to show that the counting ignores case.
q_data <- tibble(number = 1:2, string = c("COUNTTHESECHARACTERS", "countthesecharacters"))

# Build one row of counts per input string, with one column per letter A-Z.
# map_df() already row-binds the per-string tibbles, so no extra bind_rows()
# step is needed afterwards.
tmp_data <- map_df(q_data$string, function(s) {
  # One count per entry of LETTERS; t() reshapes them into a single-row matrix.
  tmp <- t(str_count(s, fixed(LETTERS, ignore_case = TRUE)))
  tmp <- as_tibble(tmp, .name_repair = "minimal")
  colnames(tmp) <- LETTERS
  tmp
})

# Attach the count columns to the original columns.
q_data_new <- cbind(q_data, tmp_data)
q_data_new
#>   number               string A B C D E F G H I J K L M N O P Q R S T U V
#> 1      1 COUNTTHESECHARACTERS 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
#> 2      2 countthesecharacters 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
#>   W X Y Z
#> 1 0 0 0 0
#> 2 0 0 0 0

由 reprex package (v0.3.0) 于 2019 年 10 月 18 日创建

如果您查看 stringr 的 ?str_count 帮助文档，会看到更多可能对您有用的选项。

更新

我仅从另一个答案中意识到，您要做的是计算字符串的所有元素，而不仅仅是字母。在这种情况下，您基本上是在寻找一个文档特征矩阵：

library(quanteda)
# Tokenise every string into single characters, then build a document-feature
# matrix: one row per string, one column per distinct character.
tmp <- q_data$string %>%
  tokens(what = "character", remove_separators = FALSE) %>%
  dfm() %>%
  convert(to = "data.frame") %>%
  # Drop the document-id column so only the count columns remain.
  # NOTE(review): it is named `document` in the quanteda version used here;
  # newer releases appear to name it `doc_id` - both are handled, please confirm.
  select(-matches("^(document|doc_id)$")) %>%
  select(order(colnames(.))) %>% # this is just for ordering alphabetically
  as_tibble() # just for better comparison to other results
q_data_new <- cbind(q_data, tmp)
q_data_new

这比答案中已经给出的两个选项还要快得多。基准测试：

# Benchmark input: 2000 rows, each holding a random string of length 20.
q_data <- tibble(
  number = 1:2000,
  string = stringi::stri_rand_strings(n = 2000, length = 20)
)


#' Count fixed patterns in each string with stringr (benchmark candidate).
#'
#' @param q_data Data frame with a character column `string`.
#' @param pattern Patterns to count, matched literally and case-insensitively;
#'   also used as the names of the resulting count columns.
#' @return `q_data` with one numeric count column per pattern appended.
stringr <- function(q_data, pattern = c(0:9, letters)) {

  # Build a one-row tibble of counts for a single string. The counts are
  # converted to double here, once, instead of post-processing every column
  # of the combined result afterwards.
  count_one <- function(s) {
    counts <- as.numeric(str_count(s, fixed(pattern, ignore_case = TRUE)))
    counts_row <- as_tibble(t(counts), .name_repair = "minimal")
    colnames(counts_row) <- pattern
    counts_row
  }

  # map_df() already row-binds the per-string tibbles, so no bind_rows() call
  # is needed on its result.
  tmp_data <- map_df(q_data$string, count_one)

  bind_cols(q_data, tmp_data)
}

#' Count characters per string with tidytext (benchmark candidate).
#'
#' Uses unnest_tokens() from tidytext and complete()/spread() from tidyr.
#'
#' @param q_data Data frame with columns `number` and `string`.
#' @return An ungrouped wide table: `number`, `string`, then one count column
#'   per character, with 0 where a character does not occur.
tidytext <- function(q_data) {

  # Keep each input row as its own group while it is split into characters.
  by_row <- group_by(q_data, number, string)

  # One output row per character; drop = FALSE keeps the `string` column.
  chars <- unnest_tokens(
    by_row, character, string,
    token = "characters", drop = FALSE
  )

  # Tally the characters, then add the letters that never occurred.
  tallied <- count(chars, number, character)
  all_letters <- complete(tallied, character = letters)

  # Long -> wide, filling absent combinations with 0.
  wide <- spread(all_letters, character, n, fill = 0)
  ungroup(wide)

}

#' Count characters per string via a quanteda document-feature matrix
#' (benchmark candidate).
#'
#' @param q_data Data frame with a character column `string`.
#' @return `q_data` column-bound (via cbind()) with one count column per
#'   distinct character, ordered by column name.
quanteda <- function(q_data) {
  tmp <- q_data$string %>%
    tokens(what = "character", remove_separators = FALSE) %>%
    dfm() %>%
    convert(to = "data.frame") %>%
    # Drop the document-id column so only the count columns remain.
    # NOTE(review): it is named `document` in the quanteda version used here;
    # newer releases appear to name it `doc_id` - both are handled, please confirm.
    select(-matches("^(document|doc_id)$")) %>%
    # Plain integer positions are enough for select(); no noquote() wrapper needed.
    select(order(colnames(.))) %>%
    as_tibble()

  cbind(q_data, tmp)
}

结果

# Run each implementation on the same `q_data` and collect its timing and
# memory statistics; the names on the left label the rows of the result.
res <- bench::mark(
  stringr = stringr(q_data),
  tidytext = tidytext(q_data),
  quanteda = quanteda(q_data)
)
res
#> # A tibble: 3 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 stringr       1.82s    1.82s     0.549   17.05MB     3.84
#> 2 tidytext      6.06s    6.06s     0.165   35.17MB     2.31
#> 3 quanteda     56.4ms  70.74ms    13.9      8.75MB     5.95

【讨论】：

感谢基准测试。我知道 tidytext 不是最快的，但它易于使用和理解，所以我倾向于在较小的设置中使用它。但有趣的是 quanteda 的速度有多快。
这里也一样。 tidytext 很棒而且很容易学习。基准测试主要是为了满足我自己的好奇心。

【解决方案2】：

基本上，你想做的是把字符串按字符进行分词（tokenize）。之后只需再做一些整理，就能得到你想要的结果。

library(dplyr)
library(tidyr)
library(tidytext)

# Example input: two strings, kept as character (not factor) columns.
q.data <- data.frame(number=c(1, 2),string=c("COUNTTHESECHARACTERS", "COUNTTHISTOO"), stringsAsFactors = FALSE)

# Count how often each letter occurs in each string, one column per letter.
q.data %>%
  # Treat every input row as its own group.
  group_by(number, string) %>%
  # One row per character in a new `character` column; drop = FALSE keeps
  # the original `string` column alongside it.
  unnest_tokens(character, string, token = "characters", drop = FALSE) %>%
  # Tally the occurrences of each character into column `n`.
  count(number, character) %>%
  # Add a row for every letter a-z that did not occur in the group.
  complete(character = letters) %>%
  # Long -> wide: one column per character, 0 where there was no count.
  # The result is still grouped by number and string.
  spread(character, n, fill = 0)

# A tibble: 2 x 28
# Groups:   number, string [2]
  number string        a     b     c     d     e     f     g     h     i     j     k
   <dbl> <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1      1 COUNTTHE~     2     0     3     0     3     0     0     2     0     0     0
2      2 COUNTTHI~     0     0     1     0     0     0     0     1     1     0     0
# ... with 15 more variables: l <dbl>, m <dbl>, n <dbl>, o <dbl>, p <dbl>, q <dbl>,
#   r <dbl>, s <dbl>, t <dbl>, u <dbl>, v <dbl>, w <dbl>, x <dbl>, y <dbl>, z <dbl>

如果您想保留原生大小写的所有内容（即不转换为小写），则可以将to_lower = FALSE 添加到unnest_tokens()。

【讨论】：

tidytext 的好用处！