对同一数据框中的多列执行卡方检验答案

【问题标题】：Perform Chi Square Tests on Multiple Columns from the Same Data Frame对同一数据框中的多列执行卡方检验
【发布时间】：2019-09-17 17:26:41
【问题描述】：

我正在尝试编写一个函数，对现有数据框中的列进行分组，对每一列生成的列联表执行卡方检验，并报告每个检验的 p 值。我一直在模仿 here（原帖中的链接）所用的方法，但发现我的 M 值没有按预期被格式化为矩阵。我不确定是因为我的列比链接示例中的多，还是我遗漏了什么；下面是我目前数据结构的示例：

require(lubridate)

structure(list(ResponseID = c("R_2fpKxLYlxAoplxP", "R_enci4Hwwee9XLSp", 
"R_332X6CmsgY6RE5s", "R_3GAI7CSx4a74LVp", "R_2QXRRBh4UCFoHDl", 
"R_3gSKU8piHOKWf9E"), region = structure(c(1L, 1L, 1L, 1L, 1L, 
1L), .Label = c("Lakeland", "Macon SE", "Other"), class = "factor"), 
    InCalls_Qrtl = structure(c(7L, 7L, 7L, 7L, 7L, 7L), .Label = c("NA", 
    "No EDGE Calls", "Bottom Quartile", "Second Quartile", "Third Quartile", 
    "Top Quartile", "Missing"), class = "factor"), InAHT_Qrtl = structure(c(7L, 
    7L, 7L, 7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile", 
    "Second Quartile", "Third Quartile", "Top Quartile", "Missing"
    ), class = "factor"), InHold_Qrtl = structure(c(7L, 7L, 7L, 
    7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile", 
    "Second Quartile", "Third Quartile", "Top Quartile", "Missing"
    ), class = "factor"), Overall_Tenure_Period = new("Period", 
        .Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), Recent_Tenure_Period = new("Period", 
        .Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_, 
        NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), VOA_Overall_Tenure_Orig = structure(c(9L, 
    9L, 9L, 9L, 9L, 9L), .Label = c("< 1 Year", "1 Year - <2 Years", 
    "2-5 Years", "6-10 Years", "11-15 Years", "16-20 Years", 
    "21-25 Years", "26 Years or Longer", "Missing"), class = "factor"), 
    VOA_Overall_Tenure_Mod = structure(c(5L, 5L, 5L, 5L, 5L, 
    5L), .Label = c("<2 Years", "2-5 Years", "6-10 Years", ">10 Years", 
    "Missing"), class = "factor"), VOA_Recent_Tenure_Orig = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), VOA_Recent_Tenure_Mod = c(NA_character_, 
    NA_character_, NA_character_, NA_character_, NA_character_, 
    NA_character_), Region = c("Lakeland", "Lakeland", "Lakeland", 
    "Lakeland", "Lakeland", "Lakeland"), Tenure_Code_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Most = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Split = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Least = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Question = c("Overall, how satisfied are you with using EDGE?", 
    "Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?", 
    "Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?", 
    "Overall, how satisfied are you with using EDGE?"), Answer = c("Slightly Satisfied", 
    "Slightly Satisfied", "Slightly Satisfied", "Dissatisfied", 
    "Completely Dissatisfied", "Slightly Dissatisfied"), Answer_TopBox = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Answer_Top2 = c("Rest", 
    "Rest", "Rest", "Rest", "Rest", "Rest"), Answer_GenSat = c("GenSat", 
    "GenSat", "GenSat", "Rest", "Rest", "Rest"), Answer_Bottom = c("Rest", 
    "Rest", "Rest", "Rest", "Bottom", "Rest"), Answer_Bottom2 = c("Rest", 
    "Rest", "Rest", "Bottom2", "Rest", "Rest"), Answer_GenDissat = c("Rest", 
    "Rest", "Rest", "GenDissat", "GenDissat", "GenDissat")), row.names = c(NA, 
6L), class = "data.frame")

然后我尝试通过执行以下操作重新创建示例：

# Count rows for every combination of Region, the segment flags, Question and
# Answer_TopBox; then, for each combination of segment flags and Question,
# reshape the counts into a matrix stored in the list-column `M`.
top_score_tests_agent <- as.data.frame(
  agent_data_clean_coded %>%
    group_by(
      Region, Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
      InCalls_Least, InCalls_Top, InCalls_Top2,
      InAHT_Least, InAHT_Top, InAHT_Top2,
      InHold_Least, InHold_Top, InHold_Top2,
      Question, Answer_TopBox
    ) %>%
    summarise(freq = n())
) %>%
  group_by(
    Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
    InCalls_Least, InCalls_Top, InCalls_Top2,
    InAHT_Least, InAHT_Top, InAHT_Top2,
    InHold_Least, InHold_Top, InHold_Top2,
    Question
  ) %>%
  nest() %>%
  mutate(M = map(data, function(cell_counts) {
    # One column per Region value, filled with the `freq` counts.
    wide_counts <- spread(cell_counts, Region, freq)
    # Everything except the first column becomes the matrix body.
    count_matrix <- as.matrix(wide_counts[, -1])
    row.names(count_matrix) <- wide_counts$Answer_TopBox
    count_matrix
  }))

但我发现，如果我尝试通过检查 top_score_tests_agent$M[[1]] 结果来检查矩阵创建，我会得到以下输出：

structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box"))

我只是想知道是否有人对我做错了什么会阻止我的矩阵创建有任何见解，或者是否有人有任何其他他们曾经这样做过的方法？

编辑

我能够使用@Wietze314 编写的大部分代码，但对于任何有兴趣查看最终代码的未来用户：

# Chi-square test of Region against each Answer* column, run separately for
# every Question, reporting the p-value and a significance label.
result2 <- df %>%
  select(Region, Question, starts_with("Answer")) %>%
  # Long format: the Answer* column name goes to `segment`, its value to
  # `answer`.
  gather(segment, answer, -Region, -Question) %>%
  group_by(Question, segment) %>%
  nest() %>%
  # Drop the grouping so the steps below run once over the whole table and
  # the result is returned ungrouped.
  ungroup() %>%
  mutate(test = map(data, ~ chisq.test(.x$Region, .x$answer, correct = FALSE))) %>%
  mutate(
    p = map_dbl(test, pluck, "p.value"),
    # Branches are tried in order, so each one only needs its upper bound.
    # A missing p-value stays NA, as it did with the nested ifelse().
    Status = case_when(
      is.na(p) ~ NA_character_,
      p <= 0.01 ~ "99% Sig Difference",
      p <= 0.05 ~ "95% Sig Difference",
      p <= 0.1 ~ "90% Sig Difference",
      TRUE ~ "Not Significant"
    )
  ) %>%
  select(-data, -test)

这给了我一个看起来像这样的输出：

structure(list(Question = c("I feel comfortable 'trusting the system' with EDGE", 
"EDGE allows me to be more efficient", "The E-learning training (GU Courses)", 
"When I have questions about EDGE, I feel confident they will be answered", 
"Overall, the training I received prepared me to use EDGE", "Overall, how satisfied are you with using EDGE?", 
"The in-person, instructor-led training", "The formal training you received in EDGE", 
"EDGE allows me to be more efficient", "I feel comfortable 'trusting the system' with EDGE"
), Segment = c("Answer_GenDissat", "Answer_Bottom", "Answer_GenSat", 
"Answer_TopBox", "Answer_GenDissat", "Answer_TopBox", "Answer_Top2", 
"Answer_Top2", "Answer_Bottom2", "Answer_GenSat"), pvalue = c(0.231403084430793, 
0.299890413606335, 0.00108798852510237, 0.487810952072342, 0.131641662666334, 
0.31818165042123, 0.501077891603077, 0.634730681199174, 0.389259022098406, 
0.274277276570632), Status = c("Not Significant", "Not Significant", 
"99% Sig Difference", "Not Significant", "Not Significant", "Not Significant", 
"Not Significant", "Not Significant", "Not Significant", "Not Significant"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-10L))

【问题讨论】：

从您的数据和问题来看，尚不清楚您想用 chisq.test 比较哪些变量，以及针对数据的哪些子集。您链接的示例使用长数据格式，并针对每个 ecotype 和 contigID 组合比较变量 allele 和 sex。
我不明白这个问题。该结构对象是一个矩阵。 is.matrix( structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box")) ) #[1] TRUE
此外，将更大的结构分配给对象名称的努力失败了：Error in getClass(Class, where = topenv(parent.frame())) : “Period” is not a defined class
抱歉，我要比较的变量是 Region/Tenure/InCall/InAHT/InHold 变量与不同的 Answer 变量。例如，应将 Region 字段分组为 Answer_TopBox、Answer_Top2、Answer_GenSat 等的 2x2 矩阵，这样我最终将得到总共 6 个矩阵，可用于进行 chi sq 测试。
@42- Period 类需要 lubridate 包

标签： r chi-squared

【解决方案1】：

我想我明白目标是什么。我复制了数据集，因为Region 中只有一个值。

require(tidyverse)

# Stack the original rows on top of a copy whose Region is set to "other",
# so that Region takes more than one value.
df <- bind_rows(
  agent_data_clean_coded,
  mutate(agent_data_clean_coded, Region = "other")
)

# For every Answer* column, build an answer-by-Region matrix of counts and
# store it in the list-column `M`.
result <- df %>%
  select(Region, starts_with("Answer")) %>%
  # Long format: the Answer* column name goes to `question`, its value to
  # `answer`.
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(M = map(data, function(dat) {
    dat2 <- dat %>%
      count(Region, answer) %>%
      # fill = 0L: a Region/answer combination with no rows is a count of
      # zero, not a missing value; NA cells would break chisq.test(M).
      spread(Region, n, fill = 0L)
    # The first column holds the answer labels; the rest are the counts.
    M <- as.matrix(dat2[, -1])
    row.names(M) <- dat2$answer
    M
  }))

我习惯用另一种方式解决这个问题。对于这种做法，我还排除了两个 Answer_Top 变量，因为它们也只包含一个级别，否则 chisq.test 会报错。在这种情况下，我直接把原始数据传给 chisq.test，而不是传入列联表。

# Chi-square test of Region against each remaining Answer* column, using the
# raw vectors rather than a contingency table. Columns whose names contain
# "Top" are left out.
result2 <- df %>%
  select(Region, starts_with("Answer"), -contains("Top")) %>%
  gather(question, answer, -Region) %>%
  group_by(question) %>%
  nest() %>%
  mutate(
    test = map(data, function(.x) chisq.test(.x$Region, .x$answer))
  ) %>%
  mutate(
    p = map_dbl(test, function(fit) fit$p.value)
  )

【讨论】：

谢谢 @Wietze314！这让我完成了大约 90% 的目标；我会把我的最终代码片段/输出添加到原始问题中。也感谢您向我展示第二种方式；它看起来确实比我最初尝试的方式更简洁。