将表列表更改为 R 中的 data.frame答案

【问题标题】：Changing a list of tables to a data.frame in R将表列表更改为 R 中的 data.frame
【发布时间】：2020-03-12 16:22:58
【问题描述】：

下面，我先找出变量 X 和 Y 中出现次数少于 4 次的值，并把这些值及其出现次数保存在 low 中。

我想知道，使用 BASE R，我如何将 low（tables 的列表）转换为 我想要的输出，如下所示？

注意：下面的数据只是示例数据（toy data），希望得到通用的、可写成函数的答案。

# Toy data: a grouping column `id` plus three numeric variables.
data <- data.frame(
  id = rep(c("AA", "BB", "CC"), times = c(4, 2, 2)),
  X = rep(c(1, 3), times = c(6, 2)),
  Y = c(rep(9, 3), 7, rep(6, 4)),
  Z = seq_len(8)
)

# Variables whose value frequencies are inspected.
mods <- c("X", "Y")

# One frequency table per variable in `mods`; iterating over a self-named
# vector makes `lapply()` return a list named by variable.
A <- lapply(setNames(nm = mods), function(m) table(data[[m]], dnn = NULL))

# For every variable keep only the values that occur fewer than 4 times.
low <- lapply(A, function(tab) tab[which(tab < 4)])

期望的输出：

# Hand-written target for the toy data: each row gives an id, a value, the
# variable the value belongs to (`var.name`) and how often it occurs (`occur`).
data.frame(id = c("CC", "AA", "AA"), value = c(3, 7, 9), var.name = c("X", "Y", "Y"), occur = c(2, 1, 3))

#   id value var.name occur     # `value` comes from `names(low[[i]])`, i = 1, 2
# 1 CC     3        X     2     # `occur` comes from `as.numeric(low[[i]])`
# 2 AA     7        Y     1
# 3 AA     9        Y     3

【问题讨论】：

标签： r list function dataframe frequency

【解决方案1】：

我们先按 'id' 拆分 'data' 中选出的列（X 和 Y），再用 lapply 遍历得到的 list：对每个 id，把各列的频数表用 stack 转成 data.frame，并用 merge 与对应的、同样经过 stack 的 'low' 表做内连接；然后用 Filter 去掉行数为 0 或长度为 0 的元素，得到 'lst1'。接着对 'lst1' 使用 Map 和 rbind，根据内层和外层的 names 添加额外的列（var.name 和 id），并把所有元素合并成一个 data.frame。

# Step 1: per id, keep only the (value, count) pairs that also appear in `low`
# (`data` and `low` are defined outside this snippet).
#   * split(data[c('X', 'Y')], data$id) yields one data.frame of X and Y per id.
#   * stack(table(x)) turns a column's frequency table into a data.frame whose
#     `values` column holds the counts and whose `ind` column holds the
#     tabulated values (compare the printed result below).
#   * merge() is called without `by`, so it inner-joins on the columns the two
#     stacked tables share; a row survives only when both the value and its
#     per-id count match an entry of `low`.
#   * Filter(nrow, ...) drops variables left with zero rows, and
#     Filter(length, ...) then drops ids left with no variable at all.
lst1 <- Filter(length, lapply(split(data[c('X', 'Y')], data$id), 
     function(dat) Filter(nrow, Map(merge, lapply(dat, 
        function(x) stack(table(x))), lapply(low, stack)))))

# Step 2: flatten the nested list into a single data.frame.
# The inner Map(cbind, x, var.name = names(x)) tags every merged table with its
# variable name before the tables of one id are row-bound; the outer
# Map(cbind, id = names(lst1), ...) prepends the id, and the outer
# do.call(rbind, ...) stacks all ids. make.row.names = FALSE keeps the list
# names out of the row names, giving the plain 1..n numbering shown below.
do.call(rbind, c(Map(cbind, id = names(lst1), lapply(lst1, 
   function(x) do.call(rbind, c(Map(cbind, x, var.name = names(x)),
          make.row.names = FALSE)))), make.row.names = FALSE))
# Printed result: `values` is the occurrence count, `ind` is the value itself.
#  id values ind var.name
#1 AA      1   7        Y
#2 AA      3   9        Y
#3 CC      2   3        X

【讨论】：

@rnorouzian 我添加了一些解释。希望对你有帮助
@rnorouzian 原因是它在“data”中不存在：`subset(do.call(rbind, lapply(split(data[mods], data$study.name), function(dat) lapply(dat, function(x) stack(table(x)))$cf.type)), values == 2 & ind == 15)` 的结果是 `# [1] values ind <0 rows> (or 0-length row.names)`。我使用了来自 github 链接的数据
@rnorouzian。它会在那里，但我的意思是values ==2 和ind == 15。在这里，我们用'id'分割，如果你检查'id'，它是26和35，所以频率只有1
@rnorouzian 原因是频率subset(data, cf.type==99)$id [1] 32 322 相同的id，和low$cf.type3 15 99 2 2 匹配
@rnorouzian 是不是表示你不想匹配'low'中的那些？

【解决方案2】：

# Toy data: a grouping column `id` plus three numeric variables.
data <- data.frame(
  id = rep(c("AA", "BB", "CC"), times = c(4, 2, 2)),
  X = rep(c(1, 3), times = c(6, 2)),
  Y = c(rep(9, 3), 7, rep(6, 4)),
  Z = seq_len(8)
)

# Every column except the grouping column is checked.
to_check <- names(data)[names(data) != "id"]
# One (initially NULL) slot per checked column.
results <- vector("list", length(to_check))

# Tabulate the values of `x` that occur at most `max_occurrences` times.
#
# The input is sorted so that equal values become adjacent and is then
# run-length encoded: every run is one distinct value and the run length is
# that value's frequency. `table()` is an option, but its output is a lot
# harder to work with.
#
# Args:
#   x: an atomic vector (`sort()` drops NA values before counting).
#   max_occurrences: largest frequency that is still reported (inclusive).
#
# Returns:
#   A data.frame with columns `value` and `occur`, one row per qualifying
#   value in ascending order. When no value qualifies, a single all-NA
#   placeholder row is returned; its NAs have the same types as a regular
#   result, so the column types do not depend on which case occurred.
count_occurrences <- function(x, max_occurrences = 3L) {
  runs <- rle(sort(x))
  keep <- which(runs$lengths <= max_occurrences)
  if (length(keep) == 0L) {
    # Indexing with NA_integer_ yields exactly one NA of the source's type
    # (instead of a logical NA), which keeps `value`/`occur` type-stable.
    keep <- NA_integer_
  }
  data.frame(value = runs$values[keep], occur = runs$lengths[keep])
}
# For every checked column, list the values that are rare within each id, then
# stack the per-column tables into one data.frame.
for (k in seq_along(results)) {
  # `tapply()` splits the column by the values of `data$id` and calls
  # `count_occurrences()` on every piece, giving one data.frame per id.
  counts <- tapply(data[[to_check[k]]],
                   INDEX = data$id,
                   FUN = count_occurrences,
                   max_occurrences = 3)
  # Number of rows each id contributes; used to repeat the id labels.
  rows_per_id <- vapply(counts, nrow, integer(1L))
  # Gather the columns with `lapply()` + `unlist()`. `sapply()` is avoided on
  # purpose: when every id returns the same number (> 1) of rows it simplifies
  # to a matrix, `unlist()` leaves a matrix untouched, and `data.frame()` then
  # stops with "arguments imply differing number of rows".
  results[[k]] <- data.frame(
    id = rep(names(counts), times = rows_per_id),
    value = unlist(lapply(counts, `[[`, "value"), use.names = FALSE),
    occur = unlist(lapply(counts, `[[`, "occur"), use.names = FALSE)
  )
  results[[k]]["var.name"] <- to_check[k]
}
# Bind all per-column tables in a single call.
desired_result <- do.call(rbind, results)
desired_result
#    id value occur var.name
# 1  AA    NA    NA        X
# 2  BB     1     2        X
# 3  CC     3     2        X
# 4  AA     7     1        Y
# 5  AA     9     3        Y
# 6  BB     6     2        Y
# 7  CC     6     2        Y
# 8  AA     1     1        Z
# 9  AA     2     1        Z
# 10 AA     3     1        Z
# 11 AA     4     1        Z
# 12 BB     5     1        Z
# 13 BB     6     1        Z
# 14 CC     7     1        Z
# 15 CC     8     1        Z
# Subset with `desired_result[!is.na(desired_result$value), ]` to drop the
# placeholder rows of ids that have no value occurring at most
# `max_occurrences` times.

【讨论】：