【问题标题】:How to make a frequency table from a data frame in R(如何从 R 中的数据框制作频率表)
【发布时间】:2020-01-24 02:23:12
【问题描述】:

数据框是这样的(原帖此处为一张图片,其内容如下):

header: system
Row 1:  00000000000000000503_0
Row 2:  00000000000000000503_1
Row 3:  00000000000000000503_2
Row 4:  00000000000000000503_3
Row 5:  000000000000000004e7_0
Row 6:  000000000000000004e7_1
Row 7:  00000000000000000681_0
Row 8:  00000000000000000681_1
Row 9:  00000000000000000681_2

我想生成一个频率表,其中包含字符串“_”之前的代码数量,这样:

“00000000000000000503”出现4次,“000000000000000004e7”出现2次,以此类推。

我如何在 R 中做到这一点?

【问题讨论】:

  • 尝试table(sapply(strsplit(df$system, "_"),"[[", 1)),但需要dput的可重现数据
  • 感谢您的所有回答!真的很感谢大家

标签: r dataframe frequency


【解决方案1】:

使用 str_remove 和 group_by 的一种方案

library(stringr)
library(dplyr)
# Strip the trailing "_<digits>" suffix from V3, then count rows per code.
df %>%
    mutate(V3 = str_remove(V3, "_\\d+$")) %>%
    group_by(V3) %>%
    summarise(n = n())
# A tibble: 3 x 2
#  V3                       n
#  <chr>                <int>
#1 000000000000000004e7     2
#2 00000000000000000503     4
#3 00000000000000000681     3

或者在 base R 中使用 table 和 trimws

# Treat "_<digits>" as the "whitespace" pattern so trimws() strips it.
# which = "right" limits the trimming to the suffix; the default ("both")
# would also remove a leading "_<digits>" from a value.
table(trimws(df$V3, which = "right", whitespace = "_[0-9]+"))

数据

# Example data: nine rows holding a constant label (V1), a row tag (V2)
# and the "<code>_<index>" strings to be tabulated (V3).
df <- data.frame(
  V1 = rep("Row", 9),
  V2 = paste0(1:9, ":"),
  V3 = c(
    "00000000000000000503_0", "00000000000000000503_1",
    "00000000000000000503_2", "00000000000000000503_3",
    "000000000000000004e7_0", "000000000000000004e7_1",
    "00000000000000000681_0", "00000000000000000681_1",
    "00000000000000000681_2"
  ),
  stringsAsFactors = FALSE
)

【讨论】:

    【解决方案2】:

    一个 tidyverse 风格(tidy)的答案是

    # Convert factor columns to character, drop the last "_<suffix>" from each
    # value of `system`, then count how many rows share each remaining code.
    # across(where(...)) replaces the superseded mutate_if().
    my_data <- mydata %>%
      mutate(across(where(is.factor), as.character)) %>%
      mutate(system = gsub('_[^_]*$', '', system)) %>%
      group_by(system) %>%
      count() %>%
      ungroup()
    my_data
    

    【讨论】:

      【解决方案3】:

      使用包含在tidyverse 中的stringr 库的另一个选项

      > library(tidyverse)
      > mydata <- data.frame(system = c("00000000000000000503_0",
                            "00000000000000000503_1",
                            "00000000000000000503_2",
                            "00000000000000000503_3",
                            "000000000000000004e7_0",
                            "000000000000000004e7_1",
                            "00000000000000000681_0",
                            "00000000000000000681_1",
                            "00000000000000000681_2"))
      > mydata
                        system
      1 00000000000000000503_0
      2 00000000000000000503_1
      3 00000000000000000503_2
      4 00000000000000000503_3
      5 000000000000000004e7_0
      6 000000000000000004e7_1
      7 00000000000000000681_0
      8 00000000000000000681_1
      9 00000000000000000681_2
      > # Split data using str_split
      > mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) #split string by the "_" and take first piece
      > mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) #split string by the "_" and take second piece
      > 
      > mydata
                        system             leftside rightside
      1 00000000000000000503_0 00000000000000000503         0
      2 00000000000000000503_1 00000000000000000503         1
      3 00000000000000000503_2 00000000000000000503         2
      4 00000000000000000503_3 00000000000000000503         3
      5 000000000000000004e7_0 000000000000000004e7         0
      6 000000000000000004e7_1 000000000000000004e7         1
      7 00000000000000000681_0 00000000000000000681         0
      8 00000000000000000681_1 00000000000000000681         1
      9 00000000000000000681_2 00000000000000000681         2
      
      > # An alternative tabulation function to base::table(); it can provide nicer options.
      > xtabs(data = mydata, formula = ~leftside)
      leftside
      000000000000000004e7 00000000000000000503 00000000000000000681 
                         2                    4                    3 
      

      【讨论】:

        【解决方案4】:

        dplyr-tidyr 替代方案:

        # Split V3 into two columns with separate()'s default separator
        # (the code goes to "target"), then tally the rows per code.
        df %>%
          tidyr::separate(col = V3, into = c("target", "non_target")) %>%
          count(target)
        # A tibble: 3 x 2
          target                   n
          <chr>                <int>
        1 000000000000000004e7     2
        2 00000000000000000503     4
        3 00000000000000000681     3
        

        base:

        # Split each V3 value at "_" and keep the first piece, then tabulate.
        # The codes are in column V3 of this answer's `df` (it has no `system`
        # column); vapply() guarantees a character result.
        table(vapply(strsplit(df$V3, "_"), "[[", character(1), 1))
        

        数据:

        # Example data: nine rows holding a constant label (V1), a row tag (V2)
        # and the "<code>_<index>" strings to be tabulated (V3).
        df <- data.frame(
          V1 = rep("Row", 9),
          V2 = paste0(1:9, ":"),
          V3 = c(
            "00000000000000000503_0", "00000000000000000503_1",
            "00000000000000000503_2", "00000000000000000503_3",
            "000000000000000004e7_0", "000000000000000004e7_1",
            "00000000000000000681_0", "00000000000000000681_1",
            "00000000000000000681_2"
          ),
          stringsAsFactors = FALSE
        )
        

        【讨论】:

          【解决方案5】:

          删除下划线后的所有内容并使用table计算频率

          # Drop everything from the first underscore onward, then tabulate.
          table(sub(pattern = "_.*", replacement = "", x = data$col1))
          # Alternative with a capture group (keeps the text before the last
          # underscore):
          #table(sub("(.*)_.*", "\\1", data$col1))
          
          #000000000000000004e7 00000000000000000503 00000000000000000681 
          #                   2                    4                    3 
          

          如果最终输出需要是数据框,请使用stack

          # Same tabulation as above, reshaped by stack() into a two-column
          # data frame (counts in `values`, codes in `ind`).
          stack(table(sub(pattern = "_.*", replacement = "", x = data$col1)))
          
          #  values                  ind
          #1      2 000000000000000004e7
          #2      4 00000000000000000503
          #3      3 00000000000000000681
          

          数据

          # Example data: a single factor column `col1` holding the
          # "<code>_<index>" strings; the levels are listed explicitly so the
          # integer codes do not depend on locale-specific sorting.
          data <- data.frame(
            col1 = factor(
              c(
                "00000000000000000503_0", "00000000000000000503_1",
                "00000000000000000503_2", "00000000000000000503_3",
                "000000000000000004e7_0", "000000000000000004e7_1",
                "00000000000000000681_0", "00000000000000000681_1",
                "00000000000000000681_2"
              ),
              levels = c(
                "000000000000000004e7_0", "000000000000000004e7_1",
                "00000000000000000503_0", "00000000000000000503_1",
                "00000000000000000503_2", "00000000000000000503_3",
                "00000000000000000681_0", "00000000000000000681_1",
                "00000000000000000681_2"
              )
            )
          )
          

          【讨论】:

            猜你喜欢
            • 2016-11-26
            • 2023-04-10
            • 1970-01-01
            • 2012-11-17
            • 1970-01-01
            • 1970-01-01
            • 1970-01-01
            • 1970-01-01
            • 1970-01-01
            相关资源
            最近更新 更多