【问题标题】:Count each occurrence of each element in a string(计算字符串中每个元素的出现次数)
【发布时间】:2019-10-18 11:39:12
【问题描述】:

我想计算字符串中每个字符的出现次数并将它们输出到数据框。

这是我的尝试:

# Asker's attempt: build a one-row data frame and try to store per-letter counts in column 3.
# NOTE(review): string.counter() is not defined anywhere in this snippet — presumably a helper from elsewhere; confirm.
q.data<-data.frame(number=1,string=c("COUNTTHESECHARACTERS"))
q.data[,3]<-string.counter(strings=q.data$string, pattern="A")
# Writes to column 3 again, so the "A" count assigned on the previous line is overwritten.
q.data[,3]<-string.counter(strings=q.data$string, pattern="B")

我想得到类似这样的输出:

# Sketch of the desired result: one column holding the string, then one count column per character.
x <- c("string","C","O","U","...")
# NOTE(review): `df` is never created in this snippet; it would have to exist as a data frame
# with length(x) columns before the assignments below can work.
colnames(df) <- x
df[1,] <- c("COUNTTHESECHARACTERS","3","1","1","...")
df

【问题讨论】:

    标签: r string count find-occurrences


    【解决方案1】:

    我在您的示例中添加了另一行,以便数据更有变化。这种方法应该相当高效:

    library(tibble)
    library(purrr)
    library(dplyr)
    library(stringr)
    
    # Two sample rows that differ only in case; counting below is case-insensitive.
    q_data <- tibble(number = 1:2, string = c("COUNTTHESECHARACTERS", "countthesecharacters"))
    
    # For each string, build a one-row tibble with one count column per letter A-Z.
    # map_df() already row-binds the per-string tibbles, so no extra bind_rows() is needed.
    tmp_data <- map_df(q_data$string, function(s) {
      # str_count() is vectorised over the 26 patterns; t() turns the counts into a 1 x 26 matrix.
      tmp <- t(str_count(s, fixed(LETTERS, ignore_case = TRUE)))
      tmp <- as_tibble(tmp, .name_repair = "minimal")
      colnames(tmp) <- LETTERS
      tmp
    })
    
    # Attach the count columns to the original data.
    q_data_new <- cbind(q_data, tmp_data)
    q_data_new
    #>   number               string A B C D E F G H I J K L M N O P Q R S T U V
    #> 1      1 COUNTTHESECHARACTERS 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
    #> 2      2 countthesecharacters 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
    #>   W X Y Z
    #> 1 0 0 0 0
    #> 2 0 0 0 0
    

    由 reprex package (v0.3.0) 创建于 2019-10-18

    如果您查看 stringr 的帮助文档 ?str_count,会看到更多可能对您有用的选项。

    更新

    我是看了另一个答案才意识到,您要做的是统计字符串中的所有字符,而不仅仅是字母。在这种情况下,您需要的基本上就是一个文档-特征矩阵(document-feature matrix):

    library(quanteda)
    # Tokenise every string into single characters and tabulate them as a
    # document-feature matrix, then append the counts to the original data.
    tmp <- q_data$string %>% 
        tokens("character", remove_separators = FALSE) %>% 
        dfm() %>% 
        convert("data.frame") %>% 
        # NOTE(review): drops the document-id column; newer quanteda releases may name it differently — confirm.
        select(-document) %>% 
        select(noquote(order(colnames(.)))) %>% # this is just for ordering alphabetically
        as_tibble() # just for better comparison to other results
      q_data_new <- cbind(q_data, tmp)
      q_data_new
    

    这比答案中已经给出的两个选项还要快得多。基准测试:

    # Benchmark input: 2000 rows, each holding a random 20-character string.
    # NOTE(review): stri_rand_strings() presumably draws from [A-Za-z0-9] by default, which would
    # explain the c(0:9, letters) default pattern used by stringr() below — confirm.
    q_data <- tibble(number = 1:2000, string = stringi::stri_rand_strings(2000, 20))
    
    
    # Benchmark candidate: count every `pattern` element in each string with
    # stringr::str_count().
    #
    # q_data:  data frame with a character column `string`.
    # pattern: elements to count (case-insensitively); defaults to the digits
    #          0-9 and the letters a-z. One output column is created per element.
    # Returns q_data with the count columns appended (counts stored as doubles).
    stringr <- function(q_data, pattern = c(0:9, letters)) {
    
      # map_df() already row-binds the one-row tibbles it collects, so the
      # result can be piped straight into mutate_if() without bind_rows().
      tmp_data <- map_df(q_data$string, function(s) {
        # str_count() is vectorised over `pattern`; t() makes the counts one row.
        tmp <- t(str_count(s, fixed(pattern, ignore_case = TRUE)))
        tmp <- as_tibble(tmp, .name_repair = "minimal")
        colnames(tmp) <- pattern
        tmp
      }) %>% 
        # Store counts as doubles (presumably so the result compares equal to
        # the other benchmark candidates — confirm).
        mutate_if(is.integer, as.numeric)
    
      bind_cols(q_data, tmp_data)
    }
    
    # Benchmark candidate: per-string character counts via tidytext tokenisation.
    # Takes a data frame with `number` and `string` columns and returns an
    # ungrouped wide table with one count column per character (0 where absent).
    tidytext <- function(q_data) {
    
      # One row per (number, string, character); drop = FALSE keeps `string`.
      chars_long <- unnest_tokens(
        group_by(q_data, number, string),
        character, string,
        token = "characters", drop = FALSE
      )
    
      # Tally each character within its string.
      char_counts <- count(chars_long, number, character)
    
      # Make sure every letter a-z has a row in each group.
      padded <- complete(char_counts, character = letters)
    
      # Reshape to wide form, fill missing counts with 0, then drop the grouping.
      ungroup(spread(padded, character, n, fill = 0))
    
    }
    
    # Benchmark candidate: per-string character counts via a quanteda
    # document-feature matrix. Returns q_data with the count columns appended.
    quanteda <- function(q_data) {
      tmp <- q_data$string %>% 
        tokens("character", remove_separators = FALSE) %>% # split into single characters
        dfm() %>% 
        convert("data.frame") %>% 
        select(-document) %>% # drop the `document` column so only counts remain
        select(noquote(order(colnames(.)))) %>% # order the count columns by name
        as_tibble()
      q_data_new <- cbind(q_data, tmp)
      q_data_new
    }
    

    结果

    # Time the three implementations on the same benchmark input.
    # NOTE(review): bench::mark() presumably also verifies that all expressions
    # return equal results unless check = FALSE is passed — confirm.
    res <- bench::mark(
      stringr = stringr(q_data),
      tidytext = tidytext(q_data),
      quanteda = quanteda(q_data)
    )
    res
    #> # A tibble: 3 x 6
    #>   expression      min   median `itr/sec` mem_alloc `gc/sec`
    #>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
    #> 1 stringr       1.82s    1.82s     0.549   17.05MB     3.84
    #> 2 tidytext      6.06s    6.06s     0.165   35.17MB     2.31
    #> 3 quanteda     56.4ms  70.74ms    13.9      8.75MB     5.95
    

    【讨论】:

    • 感谢基准测试。我知道 tidytext 不是最快的,但它易于使用和理解,所以我倾向于在较小的设置中使用它。但有趣的是 quanteda 的速度有多快。
    • 这里也一样。 tidytext 很棒而且很容易学习。基准测试主要是为了满足我自己的好奇心。
    【解决方案2】:

    基本上,你想做的是按字符对字符串进行分词(tokenize)。之后只需再做一些整理,就能得到你想要的结果。

    library(dplyr)
    library(tidyr)
    library(tidytext)
    
    # Two example rows; stringsAsFactors = FALSE keeps `string` as a character column.
    q.data <- data.frame(number=c(1, 2),string=c("COUNTTHESECHARACTERS", "COUNTTHISTOO"), stringsAsFactors = FALSE)
    
    q.data %>%
      group_by(number, string) %>%
      # One row per character; drop = FALSE keeps the original `string` column.
      unnest_tokens(character, string, token = "characters", drop = FALSE) %>%
      # Tally each character within its string.
      count(number, character) %>%
      # Add a row for every letter a-z that does not occur in a string.
      complete(character = letters) %>%
      # One column per character; absent characters get a count of 0.
      spread(character, n, fill = 0)
    
    # A tibble: 2 x 28
    # Groups:   number, string [2]
      number string        a     b     c     d     e     f     g     h     i     j     k
       <dbl> <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
    1      1 COUNTTHE~     2     0     3     0     3     0     0     2     0     0     0
    2      2 COUNTTHI~     0     0     1     0     0     0     0     1     1     0     0
    # ... with 15 more variables: l <dbl>, m <dbl>, n <dbl>, o <dbl>, p <dbl>, q <dbl>,
    #   r <dbl>, s <dbl>, t <dbl>, u <dbl>, v <dbl>, w <dbl>, x <dbl>, y <dbl>, z <dbl>
    

    如果您想保留原始大小写(即不转换为小写),可以在 unnest_tokens() 中添加 to_lower = FALSE。

    【讨论】:

    • tidytext 的好用处!
    猜你喜欢
    • 2022-10-15
    • 1970-01-01
    • 2016-01-23
    • 1970-01-01
    • 1970-01-01
    • 2020-07-23
    • 2021-12-19
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多