【问题标题】:dplyr and multiple linear models(dplyr 和多个线性模型)
【发布时间】:2015-04-10 05:47:38
【问题描述】:

我正在尝试使用 dplyr 按个体 ID (cusip) 和年份 (fyear) 分组运行大量回归,但我不确定如何使用 summary 函数。我需要对每个分组拟合模型,取出系数,将它们相加,然后用 mutate 把结果存入另一个变量 beta。下面是一些代码,它无法运行,但可以说明我的意图。

可重现的例子:

  # Reproducible sample data in dput()-style form: a 30-row data.frame with
  # columns cusip, fyear, ret, vwretd and date.
  # Note that ret is stored as character strings, not as numbers, while vwretd
  # is numeric; date holds integers (they look like yyyymmdd -- TODO confirm).
  tdata <- structure(list(cusip = c("02136810", "02136810", "02136810", 
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810", 
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810", 
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810", 
"02136810", "02136810", "02136810", "01650910", "01650910", "01650910", 
"01650910", "01650910", "01650910"), fyear = c(1979L, 1979L, 
1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 
1979L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 
1980L, 1980L, 1980L, 1980L, 1965L, 1965L, 1965L, 1965L, 1965L, 
1965L), ret = c("0.000000", "0.000000", "0.111111", "-0.063636", 
"0.203883", "0.032258", "0.078125", "0.000000", "-0.014493", 
"-0.014706", "0.044776", "0.457143", "0.039216", "-0.009434", 
"-0.200000", "-0.047619", "0.100000", "0.022727", "0.144444", 
"0.067961", "-0.009091", "0.009174", "0.109091", "-0.077869", 
"0.418182", "-0.089744", "0.014085", "-0.041667", "-0.086957", 
"0.000000"), vwretd = c(0.049489, -0.026766, 0.065618, 0.008522, 
-0.013576, 0.04685, 0.014991, 0.064728, 0.001428, -0.07266, 0.063603, 
0.028212, 0.065607, 0.001015, -0.120224, 0.052288, 0.06009, 0.037714, 
0.069438, 0.023553, 0.029498, 0.020093, 0.104951, -0.034409, 
0.038646, 0.006946, -0.009715, 0.033652, -0.00435, -0.051868), 
    date = c(19790131L, 19790228L, 19790330L, 19790430L, 19790531L, 
    19790629L, 19790731L, 19790831L, 19790928L, 19791031L, 19791130L, 
    19791231L, 19800131L, 19800229L, 19800331L, 19800430L, 19800530L, 
    19800630L, 19800731L, 19800829L, 19800930L, 19801031L, 19801128L, 
    19801231L, 19650129L, 19650226L, 19650331L, 19650430L, 19650528L, 
    19650630L)), .Names = c("cusip", "fyear", "ret", "vwretd", 
"date"), row.names = c(NA, 30L), class = "data.frame")

dplyr 代码:

# Asker's original attempt, kept verbatim; the question states it does not run.
# Intent: for each (cusip, fyear) group, regress ret on vwretd and lag(vwretd),
# then store the sum of the two slope coefficients in a variable called beta.
test <- tdata %>% 
  group_by(cusip, fyear) %>% 
  # NOTE(review): desc(date) is closed below but arrange( is not, so the
  # parentheses of this pipeline are unbalanced.
  arrange(desc(date) %>% 
  # NOTE(review): summary() is called here (presumably summarise()/do() was
  # meant), and data = tdata passes the whole data frame, not the current group.
  summary(fm <- lm(ret ~ vwretd + lag(vwretd), data = tdata)) %>% 
  # NOTE(review): `<-` is used instead of `=` inside mutate(), so the new
  # column would not be named beta.
  mutate(beta <- summary(fm)$coefficients[2,1] + summary(fm)$coefficients[3,1])

编辑:

样本数据:https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0

完整示例:https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0

【问题讨论】:

标签: r dplyr


【解决方案1】:

我们可以使用do

library(dplyr)

# For each (cusip, fyear) group, ordered by descending date, fit
# ret ~ vwretd + lag(vwretd) and attach the sum of the two slope estimates
# to every row of that group as `beta`.
tdata %>%
  # ret is stored as character in the sample data. Without this conversion
  # lm() would treat the response as a factor instead of regressing on the
  # numeric returns, so convert it before modelling.
  mutate(ret = as.numeric(ret)) %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  do({
    # `.` is the data of the current group. With dplyr attached, lag() is
    # dplyr's lag(), which shifts the vector by one position.
    fm <- lm(ret ~ vwretd + lag(vwretd), data = .)
    # Build the coefficient table once instead of calling summary() twice.
    # NOTE: summary.lm() omits coefficients that could not be estimated, so
    # row 3 may be missing for very small groups.
    est <- summary(fm)$coefficients
    # Rows 2 and 3 are the vwretd and lag(vwretd) estimates (row 1 is the
    # intercept); column 1 holds the estimates.
    data.frame(., beta = est[2, 1] + est[3, 1])
  })

我们还可以将do 中的data.frame(., beta=....) 更改为

                    # '---' is not R code; it presumably stands for the preceding
                    # pipeline steps (tdata %>% group_by(...) %>% arrange(...)).
                    ---    %>%
      do({fm <- lm(ret~vwretd+lag(vwretd), data=.)    
          # coef(fm)[-1] drops the first coefficient (the intercept of this
          # model), so beta is the sum of the remaining estimates.
          data.frame(., beta=sum(coef(fm)[-1]))})

更新

如果存在只有单个观测值的分组组合,下面的代码会为这些分组的 'beta' 返回 NA

 # Same per-group regression as before, but groups with a single observation
 # get beta = NA instead of being passed to lm().
 tdata1 <- read.csv("tdata.csv", stringsAsFactors = FALSE)
 res <- tdata1 %>%
   group_by(cusip, fyear) %>%
   arrange(desc(date)) %>%
   # Keep the group size as a column n in the result.
   mutate(n = n()) %>%
   do({
     # The group size is one value per group, so a scalar if/else is used
     # rather than a vectorised ifelse() over the constant n column.
     beta <- if (nrow(.) > 1) {
       # Sum of all estimated coefficients except the first (the intercept).
       sum(coef(lm(ret ~ vwretd + lag(vwretd), data = .))[-1])
     } else {
       NA
     }
     # The scalar beta is recycled to every row of the group.
     data.frame(., beta = beta)
   })


  as.data.frame(res)[1:3, c('date', 'cusip', 'ret','vwretd', 'beta')]
  #     date    cusip       ret    vwretd   beta
  #1 19691231 00080010 -0.012594 -0.019681 0.7932
  #2 19691128 00080010  0.001995 -0.032164 0.7932
  #3 19691031 00080010  0.113889  0.055638 0.7932

更新2

在完整的数据集上

# Full data set: same per-group regression, with beta = NA for groups that
# have two or fewer observations.
tdata2 <- read.csv("tdatafull.csv", stringsAsFactors = FALSE)
# Force ret to numeric; entries that cannot be parsed as numbers become NA
# (as.numeric() emits a coercion warning in that case).
tdata2$ret <- as.numeric(tdata2$ret)
res1 <- tdata2 %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  # Keep the group size as a column n in the result (shown in the output below).
  mutate(n = n()) %>%
  do({
    # The group size is one value per group, so a scalar if/else is used
    # rather than a vectorised ifelse() over the constant n column.
    # lag() makes the first row of each group NA, so groups of two or fewer
    # rows leave at most one complete row for the fit.
    beta <- if (nrow(.) > 2) {
      # Sum of all estimated coefficients except the first (the intercept).
      sum(coef(lm(ret ~ vwretd + lag(vwretd), data = .))[-1])
    } else {
      NA
    }
    # The scalar beta is recycled to every row of the group.
    data.frame(., beta = beta)
  })
 head(res1)
 #     X    cusip fyear       ret    vwretd     date  n     beta
 #1 728188 00003210  1973  0.000000  0.011425 19731231 12 2.751094
 #2 728187 00003210  1973 -0.300000 -0.120703 19731130 12 2.751094
 #3 728186 00003210  1973 -0.166667 -0.000427 19731031 12 2.751094
 #4 728185 00003210  1973  0.043478  0.053937 19730928 12 2.751094
 #5 728184 00003210  1973 -0.258065 -0.029648 19730831 12 2.751094
 #6 728183 00003210  1973  0.291667  0.056954 19730731 12 2.751094

 dim(tdata2)
#[1] 898657      6
sum(is.na(res1$beta))
#[1] 461

【讨论】:

  • 感谢您,但由于某种原因,我的主数据集出现错误。我已经用产生错误的一小部分数据更新了这个问题。有什么想法吗?
  • @Amstell 你能改变 lm 中的 na.action 吗?
  • 我检查过了,数据中不存在任何 NA……这是我修改后的代码,它提示下标越界:do( {fm <- lm(ret~vwretd+lag(vwretd), na.action=NULL, data=.); data.frame(., beta=summary(fm)$coefficients[2,1]+summary(fm)$coefficients[3,1])})
  • @Amstell 我会检查你的数据集。它必须与观察次数有关。
  • @Amstell 我发现有 2 个分组只有一个观测值
猜你喜欢
  • 1970-01-01
  • 2021-04-02
  • 1970-01-01
  • 2021-04-08
  • 1970-01-01
  • 2014-10-26
  • 2014-05-01
  • 1970-01-01
  • 1970-01-01
相关资源
最近更新 更多