如何从R中的数据框制作频率表答案

【问题标题】：How to make a frequency table from a data frame in R（如何从 R 中的数据框制作频率表）
【发布时间】：2020-01-24 02:23:12
【问题描述】：

数据框是这样的（原帖此处为一张截图，内容如下）：

header: system
Row 1:  00000000000000000503_0
Row 2:  00000000000000000503_1
Row 3:  00000000000000000503_2
Row 4:  00000000000000000503_3
Row 5:  000000000000000004e7_0
Row 6:  000000000000000004e7_1
Row 7:  00000000000000000681_0
Row 8:  00000000000000000681_1
Row 9:  00000000000000000681_2

我想生成一个频率表，统计字符“_”之前的每个代码各出现了多少次，例如：

“00000000000000000503”出现4次，“000000000000000004e7”出现2次，以此类推。

我如何在 R 中做到这一点？

【问题讨论】：

可以尝试 table(sapply(strsplit(df$system, "_"),"[[", 1))，不过最好用 dput 提供可重现的数据
感谢您的所有回答！真的很感谢大家

标签： r dataframe frequency

【解决方案1】：

str_remove 和 group_by 的选项

library(stringr)
library(dplyr)
# Strip the trailing "_<digits>" suffix from V3, then count the rows per code.
df %>%
    mutate(V3 = str_remove(V3, "_\\d+$")) %>%
    group_by(V3) %>%
    summarise(n = n())
# A tibble: 3 x 2
#  V3                       n
#  <chr>                <int>
#1 000000000000000004e7     2
#2 00000000000000000503     4
#3 00000000000000000681     3

或者在 base R 中使用 table 和 trimws

# trimws() removes matches of the `whitespace` regex from the ends of each
# string; with "_[0-9]+" that drops the trailing "_<digits>" suffix, and
# table() then tallies the remaining codes.
table(trimws(df$V3, which = "both", whitespace = "_[0-9]+"))

数据

# Example data: nine rows; V3 holds the "<code>_<index>" strings to tabulate.
df <- data.frame(
  V1 = rep("Row", 9),
  V2 = paste0(1:9, ":"),
  V3 = c(
    "00000000000000000503_0", "00000000000000000503_1",
    "00000000000000000503_2", "00000000000000000503_3",
    "000000000000000004e7_0", "000000000000000004e7_1",
    "00000000000000000681_0", "00000000000000000681_1",
    "00000000000000000681_2"
  ),
  stringsAsFactors = FALSE
)

【讨论】：

【解决方案2】：

一个 tidyverse 风格的答案是

# Tally the rows per code, where the code is `system` without its final
# "_..." suffix.
my_data <- mydata %>%
  # Convert any factor columns to character.
  mutate_if(is.factor, as.character) %>%
  # The pattern is anchored at the end of the string, so a single
  # substitution per value is enough.
  mutate(system = sub("_[^_]*$", "", system)) %>%
  group_by(system) %>%
  tally() %>%
  ungroup()
my_data

【讨论】：

【解决方案3】：

使用包含在tidyverse 中的stringr 库的另一个选项

> library(tidyverse)
> mydata <- data.frame(system = c("00000000000000000503_0",
                      "00000000000000000503_1",
                      "00000000000000000503_2",
                      "00000000000000000503_3",
                      "000000000000000004e7_0",
                      "000000000000000004e7_1",
                      "00000000000000000681_0",
                      "00000000000000000681_1",
                      "00000000000000000681_2")) # example data: one column, `system`, of "<code>_<index>" strings
> mydata
                  system
1 00000000000000000503_0
2 00000000000000000503_1
3 00000000000000000503_2
4 00000000000000000503_3
5 000000000000000004e7_0
6 000000000000000004e7_1
7 00000000000000000681_0
8 00000000000000000681_1
9 00000000000000000681_2
> # Split each value of `system` on "_" with str_split() and store the pieces as new columns
> mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) # keep the first piece (the code before "_")
> mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) # keep the second piece (the index after "_")
> 
> mydata
                  system             leftside rightside
1 00000000000000000503_0 00000000000000000503         0
2 00000000000000000503_1 00000000000000000503         1
3 00000000000000000503_2 00000000000000000503         2
4 00000000000000000503_3 00000000000000000503         3
5 000000000000000004e7_0 000000000000000004e7         0
6 000000000000000004e7_1 000000000000000004e7         1
7 00000000000000000681_0 00000000000000000681         0
8 00000000000000000681_1 00000000000000000681         1
9 00000000000000000681_2 00000000000000000681         2

> # xtabs() is an alternative tabulation function to base::table(); it takes a formula and a data frame and can offer nicer options.
> xtabs(data = mydata, formula = ~leftside)
leftside
000000000000000004e7 00000000000000000503 00000000000000000681 
                   2                    4                    3

【讨论】：

【解决方案4】：

dplyr-tidyr 替代方案：

# Split V3 into the code ("target") and the trailing index ("non_target"),
# then count the rows per code.
df %>%
  tidyr::separate(col = V3, into = c("target", "non_target")) %>%
  count(target)
# A tibble: 3 x 2
  target                   n
  <chr>                <int>
1 000000000000000004e7     2
2 00000000000000000503     4
3 00000000000000000681     3

base:

# Base R: split each V3 value on "_", keep the first piece, and tabulate it.
# The example data for this answer stores the strings in column `V3`; it has
# no `system` column, so `df$system` would be NULL and strsplit() would fail.
# vapply() with character(1) guarantees one character value per row.
table(vapply(strsplit(df$V3, "_"), "[[", character(1), 1))

数据：

# Example data: three columns of nine rows each; V3 carries the
# "<code>_<index>" strings that the snippets above tabulate.
df <- data.frame(
  V1 = rep("Row", 9),
  V2 = paste0(1:9, ":"),
  V3 = c(
    "00000000000000000503_0",
    "00000000000000000503_1",
    "00000000000000000503_2",
    "00000000000000000503_3",
    "000000000000000004e7_0",
    "000000000000000004e7_1",
    "00000000000000000681_0",
    "00000000000000000681_1",
    "00000000000000000681_2"
  ),
  stringsAsFactors = FALSE
)

【讨论】：

【解决方案5】：

删除下划线后的所有内容并使用table计算频率

# Delete the first "_" and everything after it, then tabulate what is left.
table(sub(pattern = "_.*", replacement = "", x = data$col1))
# Alternative: keep only the captured text that precedes the last "_".
#table(sub("(.*)_.*", "\\1", data$col1))

#000000000000000004e7 00000000000000000503 00000000000000000681 
#                   2                    4                    3

如果最终输出需要是数据框，请使用stack

# stack() turns the frequency table into a data frame
# (values = counts, ind = codes).
stack(table(sub(pattern = "_.*", replacement = "", x = data$col1)))

#  values                  ind
#1      2 000000000000000004e7
#2      4 00000000000000000503
#3      3 00000000000000000681

数据

# Example data: a single factor column `col1` of "<code>_<index>" strings.
# factor() sorts the levels, which reproduces the level order of the
# original dput() output.
data <- data.frame(
  col1 = factor(c(
    "00000000000000000503_0",
    "00000000000000000503_1",
    "00000000000000000503_2",
    "00000000000000000503_3",
    "000000000000000004e7_0",
    "000000000000000004e7_1",
    "00000000000000000681_0",
    "00000000000000000681_1",
    "00000000000000000681_2"
  ))
)

【讨论】：