【发布时间】:2019-09-17 17:26:41
【问题描述】:
我正在尝试编写一个函数,该函数对现有数据框中的列进行分组,对每一列对应的关联矩阵/列联表进行卡方检验,然后报告每个检验的 p 值。我一直在尝试模仿此处(here)链接中使用的方法,但我发现 M 列中的值并没有像我预期的那样被构造成矩阵。我不确定这是因为我的列比上述链接示例中的多,还是我遗漏了什么。以下是我目前数据结构的示例:
require(lubridate)
structure(list(ResponseID = c("R_2fpKxLYlxAoplxP", "R_enci4Hwwee9XLSp",
"R_332X6CmsgY6RE5s", "R_3GAI7CSx4a74LVp", "R_2QXRRBh4UCFoHDl",
"R_3gSKU8piHOKWf9E"), region = structure(c(1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Lakeland", "Macon SE", "Other"), class = "factor"),
InCalls_Qrtl = structure(c(7L, 7L, 7L, 7L, 7L, 7L), .Label = c("NA",
"No EDGE Calls", "Bottom Quartile", "Second Quartile", "Third Quartile",
"Top Quartile", "Missing"), class = "factor"), InAHT_Qrtl = structure(c(7L,
7L, 7L, 7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), InHold_Qrtl = structure(c(7L, 7L, 7L,
7L, 7L, 7L), .Label = c("NA", "No EDGE Calls", "Bottom Quartile",
"Second Quartile", "Third Quartile", "Top Quartile", "Missing"
), class = "factor"), Overall_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), Recent_Tenure_Period = new("Period",
.Data = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), year = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), month = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), day = c(NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_), hour = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), minute = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)), VOA_Overall_Tenure_Orig = structure(c(9L,
9L, 9L, 9L, 9L, 9L), .Label = c("< 1 Year", "1 Year - <2 Years",
"2-5 Years", "6-10 Years", "11-15 Years", "16-20 Years",
"21-25 Years", "26 Years or Longer", "Missing"), class = "factor"),
VOA_Overall_Tenure_Mod = structure(c(5L, 5L, 5L, 5L, 5L,
5L), .Label = c("<2 Years", "2-5 Years", "6-10 Years", ">10 Years",
"Missing"), class = "factor"), VOA_Recent_Tenure_Orig = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), VOA_Recent_Tenure_Mod = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_), Region = c("Lakeland", "Lakeland", "Lakeland",
"Lakeland", "Lakeland", "Lakeland"), Tenure_Code_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Most = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Tenure_Code_Split = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InCalls_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InAHT_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Least = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), InHold_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Question = c("Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?", "Overall, how satisfied are you with using EDGE?",
"Overall, how satisfied are you with using EDGE?"), Answer = c("Slightly Satisfied",
"Slightly Satisfied", "Slightly Satisfied", "Dissatisfied",
"Completely Dissatisfied", "Slightly Dissatisfied"), Answer_TopBox = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_Top2 = c("Rest",
"Rest", "Rest", "Rest", "Rest", "Rest"), Answer_GenSat = c("GenSat",
"GenSat", "GenSat", "Rest", "Rest", "Rest"), Answer_Bottom = c("Rest",
"Rest", "Rest", "Rest", "Bottom", "Rest"), Answer_Bottom2 = c("Rest",
"Rest", "Rest", "Bottom2", "Rest", "Rest"), Answer_GenDissat = c("Rest",
"Rest", "Rest", "GenDissat", "GenDissat", "GenDissat")), row.names = c(NA,
6L), class = "data.frame")
然后我尝试通过执行以下操作重新创建示例:
# Build one Answer_TopBox x Region count matrix per segment/question
# combination and store it in the list-column `M`.
#   1. Count rows for every combination of Region, the segment flags,
#      Question and Answer_TopBox.
#   2. Regroup by everything except Region and Answer_TopBox and nest, so each
#      row of the result carries a small table of Region, Answer_TopBox, freq.
#   3. Spread Region into columns and turn each nested table into a matrix
#      whose row names are the Answer_TopBox labels.
# NOTE(review): grouping on every segment flag at once makes each nested table
# very small (the question reports a 1 x 1 matrix) -- confirm that this
# grouping is really what is wanted before testing on these matrices.
top_score_tests_agent <- as.data.frame(
  agent_data_clean_coded %>%
    group_by(
      Region, Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
      InCalls_Least, InCalls_Top, InCalls_Top2,
      InAHT_Least, InAHT_Top, InAHT_Top2,
      InHold_Least, InHold_Top, InHold_Top2,
      Question, Answer_TopBox
    ) %>%
    summarise(freq = n())
) %>%
  group_by(
    Tenure_Code_Least, Tenure_Code_Most, Tenure_Code_Split,
    InCalls_Least, InCalls_Top, InCalls_Top2,
    InAHT_Least, InAHT_Top, InAHT_Top2,
    InHold_Least, InHold_Top, InHold_Top2,
    Question
  ) %>%
  nest() %>%
  mutate(M = map(data, function(dat) {
    # A Region/answer pair that never occurs has a count of zero. Without
    # `fill`, spread() leaves NA in that cell, and chisq.test() requires
    # finite, non-negative entries.
    dat2 <- dat %>% spread(Region, freq, fill = 0L)
    # Drop the label column by name instead of by position; `drop = FALSE`
    # keeps a one-region table two-dimensional.
    count_cols <- setdiff(names(dat2), "Answer_TopBox")
    M <- as.matrix(dat2[, count_cols, drop = FALSE])
    row.names(M) <- dat2$Answer_TopBox
    M
  }))
但我发现,如果我尝试通过检查 top_score_tests_agent$M[[1]] 结果来检查矩阵创建,我会得到以下输出:
structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box"))
我想知道是否有人能看出我哪里做错了,导致矩阵没有被正确创建;或者是否有人用过其他方法来完成同样的事情?
编辑
我最终采用了 @Wietze314 编写的大部分代码。为方便以后有兴趣的用户参考,最终代码如下:
# For every Question and every column whose name starts with "Answer", run a
# chi-square test of Region against that column's values (no continuity
# correction) and label the resulting p-value with a significance band.
result2 <- df %>%
  select(Region, Question, starts_with("Answer")) %>%
  # Stack the selected columns into segment (column name) / answer (value).
  gather(segment, answer, -Region, -Question) %>%
  group_by(Question, segment) %>%
  nest() %>%
  mutate(
    test = map(data, function(d) {
      chisq.test(d$Region, d$answer, correct = FALSE)
    }),
    p = map_dbl(test, function(res) pluck(res, "p.value")),
    # Each nested ifelse() is only reached when the previous threshold was
    # exceeded, so only the upper bound needs checking at each level.
    Status = ifelse(
      p <= 0.01, "99% Sig Difference",
      ifelse(
        p <= 0.05, "95% Sig Difference",
        ifelse(p <= 0.1, "90% Sig Difference", "Not Significant")
      )
    )
  ) %>%
  # Keep only the identifiers, p-value and label.
  select(-data, -test)
这给了我一个看起来像这样的输出:
structure(list(Question = c("I feel comfortable 'trusting the system' with EDGE",
"EDGE allows me to be more efficient", "The E-learning training (GU Courses)",
"When I have questions about EDGE, I feel confident they will be answered",
"Overall, the training I received prepared me to use EDGE", "Overall, how satisfied are you with using EDGE?",
"The in-person, instructor-led training", "The formal training you received in EDGE",
"EDGE allows me to be more efficient", "I feel comfortable 'trusting the system' with EDGE"
), Segment = c("Answer_GenDissat", "Answer_Bottom", "Answer_GenSat",
"Answer_TopBox", "Answer_GenDissat", "Answer_TopBox", "Answer_Top2",
"Answer_Top2", "Answer_Bottom2", "Answer_GenSat"), pvalue = c(0.231403084430793,
0.299890413606335, 0.00108798852510237, 0.487810952072342, 0.131641662666334,
0.31818165042123, 0.501077891603077, 0.634730681199174, 0.389259022098406,
0.274277276570632), Status = c("Not Significant", "Not Significant",
"99% Sig Difference", "Not Significant", "Not Significant", "Not Significant",
"Not Significant", "Not Significant", "Not Significant", "Not Significant"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-10L))
【问题讨论】:
-
从您的数据和问题描述来看,尚不清楚您想用 chisq.test 比较哪些变量,以及针对数据的哪些子集进行比较。您链接的示例使用长格式数据,并针对每个
ecotype 和 contigID 的组合比较变量 allele 和 sex。 -
我不明白这个问题。该结构对象是一个矩阵。
is.matrix( structure(4L, .Dim = c(1L, 1L), .Dimnames = list(NULL, "Top Box")) ) #[1] TRUE -
此外,我尝试把那个较大的结构赋值给一个对象名时失败了:
Error in getClass(Class, where = topenv(parent.frame())) : “Period” is not a defined class -
抱歉没有说清楚。我要比较的是 Region/Tenure/InCall/InAHT/InHold 这些变量与各个 Answer 变量之间的关系。例如,应将 Region 字段分别与 Answer_TopBox、Answer_Top2、Answer_GenSat 等组成 2x2 矩阵,这样我最终会得到总共 6 个矩阵,可用于进行卡方检验。
-
@42-
lubridate 的类(例如 Period)需要先加载 lubridate 包才能使用
标签: r chi-squared