【发布时间】:2021-04-19 13:38:59
【问题描述】:
我正在进行间断时间序列回归分析,以帮助查看在特定事件之后值是否存在显着的非零变化。这是两个模拟数据框;一个具有日期和值,另一个具有事件名称和该事件的相应日期:-
#dataset 1
eventDate<-structure(c(18262, 18263, 18264, 18265, 18266, 18267, 18268,
18269, 18270, 18271, 18272, 18273, 18274, 18275, 18276, 18277,
18278, 18279, 18280, 18281, 18282, 18283, 18284, 18285, 18286,
18287, 18288, 18289, 18290, 18291, 18292, 18293, 18294, 18295,
18296, 18297, 18298, 18299, 18300, 18301, 18302, 18303, 18304,
18305, 18306, 18307, 18308, 18309, 18310, 18311, 18312, 18313,
18314, 18315, 18316, 18317, 18318, 18319, 18320, 18321, 18322,
18323, 18324, 18325, 18326, 18327, 18328, 18329, 18330, 18331,
18332, 18333, 18334, 18335, 18336, 18337, 18338, 18339, 18340,
18341, 18342, 18343, 18344, 18345, 18346, 18347, 18348, 18349,
18350, 18351, 18352, 18353, 18354, 18355, 18356, 18357, 18358,
18359, 18360, 18361, 18362, 18363, 18364, 18365, 18366, 18367,
18368, 18369, 18370, 18371, 18372, 18373, 18374, 18375, 18376,
18377, 18378, 18379, 18380, 18381, 18382, 18383, 18384, 18385,
18386, 18387, 18388, 18389, 18390, 18391, 18392, 18393, 18394,
18395, 18396, 18397, 18398, 18399, 18400, 18401, 18402, 18403,
18404, 18405, 18406, 18407, 18408, 18409, 18410, 18411, 18412,
18413, 18414, 18415, 18416, 18417, 18418, 18419, 18420, 18421,
18422, 18423, 18424, 18425, 18426, 18427, 18428, 18429, 18430,
18431, 18432, 18433, 18434, 18435, 18436, 18437, 18438, 18439,
18440, 18441, 18442, 18443, 18444, 18445, 18446, 18447, 18448,
18449, 18450, 18451, 18452, 18453, 18454, 18455, 18456, 18457,
18458, 18459, 18460, 18461, 18462, 18463, 18464, 18465, 18466,
18467, 18468, 18469, 18470, 18471, 18472, 18473, 18474, 18475,
18476, 18477, 18478, 18479, 18480, 18481, 18482, 18483, 18484,
18485, 18486, 18487, 18488, 18489, 18490, 18491, 18492, 18493,
18494, 18495, 18496, 18497, 18498, 18499, 18500, 18501, 18502,
18503, 18504, 18505, 18506, 18507, 18508, 18509, 18510, 18511,
18512, 18513, 18514, 18515, 18516, 18517, 18518, 18519, 18520,
18521, 18522, 18523, 18524, 18525, 18526, 18527, 18528, 18529,
18530, 18531, 18532, 18533, 18534, 18535, 18536, 18537, 18538,
18539, 18540, 18541, 18542, 18543, 18544, 18545, 18546, 18547,
18548, 18549, 18550, 18551, 18552, 18553, 18554, 18555, 18556,
18557, 18558, 18559, 18560, 18561, 18562, 18563, 18564, 18565,
18566, 18567, 18568, 18569, 18570, 18571, 18572, 18573, 18574,
18575, 18576, 18577, 18578, 18579, 18580, 18581, 18582, 18583,
18584, 18585, 18586, 18587, 18588, 18589, 18590, 18591, 18592,
18593, 18594, 18595, 18596, 18597, 18598, 18599, 18600, 18601,
18602, 18603, 18604, 18605, 18606, 18607, 18608, 18609, 18610,
18611, 18612, 18613, 18614, 18615, 18616, 18617, 18618, 18619,
18620, 18621, 18622, 18623, 18624, 18625, 18626, 18627), class = "Date")
Count<-c(46L, 58L, 46L, 60L, 42L, 56L, 44L, 60L, 48L, 43L, 50L, 45L,
55L, 57L, 47L, 46L, 42L, 44L, 43L, 58L, 60L, 58L, 50L, 55L, 51L,
43L, 60L, 51L, 51L, 59L, 44L, 44L, 42L, 48L, 50L, 60L, 53L, 57L,
56L, 60L, 52L, 43L, 50L, 55L, 49L, 53L, 50L, 48L, 45L, 51L, 59L,
56L, 53L, 45L, 52L, 122L, 100L, 91L, 82L, 60L, 55L, 58L, 42L,
53L, 59L, 47L, 58L, 54L, 56L, 49L, 46L, 41L, 48L, 40L, 59L, 43L,
46L, 59L, 47L, 51L, 54L, 46L, 46L, 53L, 50L, 51L, 57L, 48L, 60L,
59L, 46L, 53L, 50L, 44L, 42L, 58L, 55L, 59L, 57L, 42L, 52L, 43L,
54L, 47L, 53L, 44L, 48L, 42L, 56L, 59L, 46L, 49L, 47L, 52L, 58L,
42L, 52L, 41L, 55L, 56L, 58L, 52L, 43L, 40L, 56L, 47L, 46L, 50L,
45L, 54L, 53L, 50L, 53L, 48L, 58L, 40L, 43L, 55L, 41L, 46L, 46L,
55L, 46L, 46L, 52L, 59L, 59L, 46L, 43L, 59L, 57L, 57L, 41L, 40L,
44L, 47L, 55L, 44L, 54L, 58L, 56L, 43L, 58L, 45L, 53L, 42L, 57L,
59L, 42L, 40L, 53L, 60L, 58L, 40L, 59L, 54L, 41L, 59L, 48L, 48L,
43L, 47L, 50L, 50L, 53L, 50L, 43L, 41L, 43L, 51L, 53L, 40L, 52L,
43L, 53L, 51L, 51L, 49L, 53L, 40L, 45L, 59L, 50L, 60L, 60L, 42L,
47L, 47L, 45L, 50L, 46L, 60L, 40L, 48L, 43L, 59L, 58L, 55L, 48L,
44L, 53L, 60L, 52L, 54L, 42L, 44L, 52L, 51L, 47L, 53L, 45L, 41L,
56L, 45L, 56L, 52L, 57L, 48L, 47L, 52L, 58L, 51L, 41L, 53L, 155L,
123L, 98L, 90L, 84L, 71L, 58L, 50L, 54L, 45L, 58L, 48L, 49L,
60L, 41L, 60L, 46L, 40L, 50L, 49L, 57L, 58L, 56L, 58L, 51L, 53L,
41L, 45L, 58L, 51L, 50L, 56L, 42L, 59L, 42L, 53L, 57L, 52L, 50L,
43L, 43L, 59L, 41L, 54L, 56L, 45L, 42L, 55L, 50L, 58L, 48L, 54L,
41L, 42L, 44L, 46L, 50L, 40L, 51L, 42L, 55L, 44L, 54L, 51L, 45L,
58L, 40L, 46L, 46L, 40L, 57L, 53L, 40L, 49L, 52L, 50L, 50L, 59L,
42L, 57L, 55L, 52L, 57L, 52L, 43L, 48L, 48L, 40L, 42L, 48L, 41L,
47L, 53L, 40L, 50L, 60L, 43L, 60L, 51L, 56L, 53L, 42L, 52L, 56L,
48L, 44L, 44L, 53L, 60L, 55L, 56L, 45L, 41L, 54L, 43L, 59L, 48L,
46L, 50L, 47L, 56L, 57L)
data<-data.frame(eventDate,Count)
#dataset 2
Event<-c("event_a", "event_b", "event_c", "event_d", "event_e", "event_f",
"event_g")
Event_Date<-structure(c(18289, 18317, 18358, 18444, 18506, 18528, 18547), class = "Date")
events_dates<-data.frame(Event, Event_Date)
以下是我正在做的两个示例。一个例子有显着的结果,一个例子有一个不显着的结果。
library(dplyr)
library(ggplot2)
#significant result
event_date<-as.Date("2020-09-01")
before_period<-as.Date(event_date)-21
after_period<-as.Date(event_date)+21
data_filtered<-data%>%
filter(eventDate>=as.Date(before_period) & eventDate<=as.Date(after_period))
data_filtered<-data_filtered%>%
mutate(DayNumber=row_number())
data_filtered$intv_trend <- cumsum(data_filtered$eventDate >= as.Date(event_date))
data_filtered$Post_event<-ifelse(data_filtered$eventDate<event_date,0,1)
data_filtered <- data_filtered %>%
mutate(lag_count = lag(Count))
data_filtered
fit <- glm(Count ~ DayNumber+ Post_event + intv_trend+log(lag_count),
family = "poisson", data = data_filtered)
summary(fit)
data_filtered$fit <- exp(c(NA, predict(fit)))
data_filtered$fit2 = c(NA, predict(fit, type="response"))
data_filtered$Group<-ifelse(data_filtered$Post_event==0, "Pre-event","Post-event")
data_filtered$Group<-factor(data_filtered$Group, levels = c("Pre-event","Post-event"))
ggplot(data_filtered, aes(x=DayNumber, y = Count, colour=Group)) +
geom_line()+geom_smooth(method="lm", se=F, aes(colour=Group)) +ggtitle("Count before and after event")+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
ggplot(data_filtered, aes(x = DayNumber, y = fit2)) +
geom_line() +
geom_smooth(method="lm", se=F, aes(colour=Group)) +
theme_bw() +
labs(colour="")+ggtitle("Count (Fitted); Method=lm")+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
#non-significant result
event_date<-as.Date("2020-01-28")
before_period<-as.Date(event_date)-21
after_period<-as.Date(event_date)+21
data_filtered<-data%>%
filter(eventDate>=as.Date(before_period) & eventDate<=as.Date(after_period))
data_filtered<-data_filtered%>%
mutate(DayNumber=row_number())
data_filtered$intv_trend <- cumsum(data_filtered$eventDate >= as.Date(event_date))
data_filtered$Post_event<-ifelse(data_filtered$eventDate<event_date,0,1)
data_filtered <- data_filtered %>%
mutate(lag_count = lag(Count))
data_filtered
fit <- glm(Count ~ DayNumber+ Post_event + intv_trend+log(lag_count),
family = "poisson", data = data_filtered)
summary(fit)
data_filtered$fit <- exp(c(NA, predict(fit)))
data_filtered$fit2 = c(NA, predict(fit, type="response"))
data_filtered$Group<-ifelse(data_filtered$Post_event==0, "Pre-event","Post-event")
data_filtered$Group<-factor(data_filtered$Group, levels = c("Pre-event","Post-event"))
ggplot(data_filtered, aes(x=DayNumber, y = Count, colour=Group)) +
geom_line()+geom_smooth(method="lm", se=F, aes(colour=Group)) +ggtitle("Count before and after event")+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
ggplot(data_filtered, aes(x = DayNumber, y = fit2)) +
geom_line() +
geom_smooth(method="lm", se=F, aes(colour=Group)) +
theme_bw() +
labs(colour="")+ggtitle("Count (Fitted); Method=lm")+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
每个event_date 变量都来自event_dates 数据框。
我有一长串想要进行分析的日期,所以我需要一个函数来帮助我有效地完成这项工作。我希望它返回回归拟合摘要和每个事件的两个图。到目前为止,这是我的尝试(如您所知,不是一个好的尝试):-
#data1 represents data, data2 represents events_dates, n_days is number of days before and after event_date that I want to filter from data
TS_Intervention_Func<-function(data1,data2,n_days){
myresultslist<-list()
for (i in data2$Event_Date) {
event_date<-as.Date(i)
before_period<-as.Date(event_date)-n_days
after_period<-as.Date(event_date)+n_days
data_filtered<-data1%>%
filter(eventDate>=as.Date(before_period) & eventDate<=as.Date(after_period))
data_filtered<-data_filtered%>%
mutate(DayNumber=row_number())
data_filtered$intv_trend <- cumsum(data_filtered$eventDate >= as.Date(event_date))
data_filtered$Post_event<-ifelse(data_filtered$eventDate<event_date,0,1)
data_filtered <- data_filtered %>%
mutate(lag_count = lag(Count))
fit <- glm(Count ~ DayNumber+ Post_event + intv_trend+log(lag_count),
family = "poisson", data = data_filtered)
results<-summary(fit)
data_filtered$fit <- exp(c(NA, predict(fit)))
data_filtered$fit2 = c(NA, predict(fit, type="response"))
data_filtered$Group<-ifelse(data_filtered$Post_event==0, "Pre-event","Post-event")
data_filtered$Group<-factor(data_filtered$Group, levels = c("Pre-event","Post-event"))
plot_1<-ggplot(data_filtered, aes(x=DayNumber, y = Count, colour=Group)) +
geom_line()+geom_smooth(method="lm", se=F, aes(colour=Group)) +ggtitle("Count before and after event",paste0(event_date))+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
plot_2<-ggplot(data_filtered, aes(x = DayNumber, y = fit2)) +
geom_line() +
geom_smooth(method="lm", se=F, aes(colour=Group)) +
theme_bw() +
labs(colour="")+ggtitle("Count (Fitted); Method=lm", paste0(event_date))+
geom_vline(xintercept = 22,linetype="dotted")+labs(caption = "Dotted line represents time of event")
myresultslist[[i]] <- do.call(results,plot1,plot_2)
}
return(myresultslist)
}
TS_Intervention_Func(data,events_dates,21)
#Error in as.Date.numeric(i) : 'origin' must be supplied
简而言之,我希望函数做三件事:-
- 从
event_dates数据框中获取每个日期,并在data中对该日期运行分析迭代 - 将
fit摘要和两个对应的图表存储到该事件日期并将其保存在列表中(如果可以将事件名称与它一起保存以便于查找,那就更好了) - 这是可取的;如果列表可以分为两部分,一个保存重要结果,另一个保存非重要结果。
一个很大的问题,但我们一如既往地感谢任何帮助:)
【问题讨论】:
标签: r list function for-loop regression