【问题标题】:R Create a time sequence as xts index based on two columns in data.frame(R:基于 data.frame 中的两列创建时间序列作为 xts 索引)
【发布时间】:2017-08-04 21:11:05
【问题描述】:

我有一个如下所示的 data.frame

    soc_sec group_count total_creds group_start  group_end
       (chr)       (int)       (dbl)      (date)     (date)
1  AA2105480           5        14.0  2005-01-09 2005-05-16
2  AA2105480           7        17.0  2004-08-26 2004-12-10
3  AB4378973           1         0.0  2004-01-21 2004-05-07
4  AB4990257           2         1.0  2014-09-01 2014-12-14
5  AB7777777           5        12.0  2004-01-21 2005-03-22
6  AB7777777           6        15.0  2004-08-26 2004-12-10
7  AB7777777           5        15.0  2005-01-09 2005-05-12
8  AC4285291           2         3.0  2014-09-01 2014-12-14
9  AC4285291           1         3.0  2015-01-12 2015-04-15
10 AC6039874           9        17.5  2010-01-06 2010-05-06
11 AC6039874           7        16.0  2011-01-05 2011-04-29
12 AC6039874           8        12.5  2010-08-31 2010-12-21
13 AC6039874           7        13.5  2011-08-31 2011-12-21
14 AC6547645           7        18.0  2005-01-09 2005-05-12
15 AC6547645           6        17.0  2004-08-26 2004-12-10
16 AC6547645           1         2.0  2005-04-20 2005-06-01
17 AD1418577           7        13.0  2013-01-09 2013-05-17
18 AD1418577           8        16.0  2013-08-28 2013-12-13
19 AD1418577           6        15.0  2014-01-08 2014-05-05
20 AD1418577           7        13.0  2015-08-26 2015-12-15

我要做的是创建一个列,以后可以根据 group_start 和 group_end 之间的逐日序列,将其用作 xts 对象的逐日索引。我知道可以使用 v <- seq(df$group_start[1], df$group_end[1], by="days") 计算单独一行的向量。我甚至可以用下面的代码对行进行必要的重复,之后再用 dplyr::bind_rows(df,v) 把它们绑定起来:

# Number of calendar days each row spans, both endpoints included.
# Subtracting two Date columns is vectorised, so apply() is not needed here:
# it would coerce the whole data frame to a character matrix and build every
# daily sequence only to measure its length.
df$len <- as.integer(df$group_end - df$group_start) + 1L
# Fail loudly, as seq() did, when an end date precedes its start date or a
# date is missing.
stopifnot(!anyNA(df$len), all(df$len >= 1L))
# Repeat row i len[i] times so each copy can later be paired with one day.
df <- df[rep(seq_len(nrow(df)), times = df$len), ]

我一直没能做到的,是把这一步矢量化,使它对 data.frame 中的每一行都执行。

我尝试过的方法不起作用

# Builds the daily sequence for one row. It is called through apply() below,
# so `x` is a single row of `df` passed as a named vector.
create_date_vector <- function(x){
   # flog.debug() is not defined in this snippet; presumably it comes from the
   # futile.logger package -- confirm it is attached before running.
   flog.debug("id: %s", x["soc_sec"])
   # NOTE(review): "wrong sign in 'by' argument" is what seq() raises when
   # the end date precedes the start date while by = "days" steps forward,
   # so the full data presumably contains such rows -- verify.
   seq(as.Date(x["group_start"]), as.Date(x["group_end"]), by = "days")
 }
 date_vec <- c()
 # apply() hands each row of `df` to create_date_vector() in turn.
 date_vec <- c(date_vec, apply(df, 1, create_date_vector))

错误:Error in seq.int(0, to0 - from, by) : wrong sign in 'by' argument

# Second attempt: grow `date_vec` row by row inside a for loop.
date_vec <- c()
for(i in 1:nrow(df)){
      # NOTE: the seq() call is closed right after the `to=` argument, so
      # by="days" is passed to c() instead of seq(). seq() therefore receives
      # only `from` and `to`, which is what the reported error complains
      # about.
      date_vec <- c(date_vec, seq(from=as.Date(df$group_start[as.integer(i)]), to=as.Date(df$group_end[as.integer(i)])), by="days")
    }

错误:Error in seq.Date(from = as.Date(ags_df$group_start[as.integer(i)]), to = as.Date(ags_df$group_end[as.integer(i)])) : exactly two of 'to', 'by' and 'length.out' / 'along.with' must be specified

任何帮助将不胜感激。谢谢。

输入数据

# Sample input: a 20-row data frame with the "tbl_df" class
# (row.names = c(NA, -20L)). group_start and group_end are Date vectors,
# written here as day numbers carrying class "Date". Looks like dput()
# output -- assign the value to `df` to run the snippets on this page.
structure(list(soc_sec = c("AA2105480", "AA2105480", "AB4378973", 
"AB4990257", "AB7777777", "AB7777777", "AB7777777", "AC4285291", 
"AC4285291", "AC6039874", "AC6039874", "AC6039874", "AC6039874", 
"AC6547645", "AC6547645", "AC6547645", "AD1418577", "AD1418577", 
"AD1418577", "AD1418577"), group_count = c(5L, 7L, 1L, 2L, 5L, 
6L, 5L, 2L, 1L, 9L, 7L, 8L, 7L, 7L, 6L, 1L, 7L, 8L, 6L, 7L), 
    total_creds = c(14, 17, 0, 1, 12, 15, 15, 3, 3, 17.5, 16, 
    12.5, 13.5, 18, 17, 2, 13, 16, 15, 13), group_start = structure(c(12792, 
    12656, 12438, 16314, 12438, 12656, 12792, 16314, 16447, 14615, 
    14979, 14852, 15217, 12792, 12656, 12893, 15714, 15945, 16078, 
    16673), class = "Date"), group_end = structure(c(12919, 12762, 
    12545, 16418, 12864, 12762, 12915, 16418, 16540, 14735, 15093, 
    14964, 15329, 12915, 12762, 12935, 15842, 16052, 16195, 16784
    ), class = "Date")), class = c("tbl_df", "data.frame"), row.names = c(NA, 
-20L), .Names = c("soc_sec", "group_count", "total_creds", "group_start", 
"group_end"))

【问题讨论】:

    标签: r dplyr sequence xts


    【解决方案1】:

    在您找到可行的解决方案一个多月后,我不知道这有多大用处,但我尝试将您的代码精简为更紧凑的东西。

    library(dplyr)
    
    # Rebuild the sample data as `df`: 20 rows (row.names = c(NA, -20L)),
    # with group_start / group_end stored as Date vectors.
    df <- structure(list(soc_sec = c("AA2105480", "AA2105480", "AB4378973", 
    "AB4990257", "AB7777777", "AB7777777", "AB7777777", "AC4285291", 
    "AC4285291", "AC6039874", "AC6039874", "AC6039874", "AC6039874", 
    "AC6547645", "AC6547645", "AC6547645", "AD1418577", "AD1418577", 
    "AD1418577", "AD1418577"), group_count = c(5L, 7L, 1L, 2L, 5L, 
    6L, 5L, 2L, 1L, 9L, 7L, 8L, 7L, 7L, 6L, 1L, 7L, 8L, 6L, 7L), 
        total_creds = c(14, 17, 0, 1, 12, 15, 15, 3, 3, 17.5, 16, 
        12.5, 13.5, 18, 17, 2, 13, 16, 15, 13), group_start = structure(c(12792, 
        12656, 12438, 16314, 12438, 12656, 12792, 16314, 16447, 14615, 
        14979, 14852, 15217, 12792, 12656, 12893, 15714, 15945, 16078, 
        16673), class = "Date"), group_end = structure(c(12919, 12762, 
        12545, 16418, 12864, 12762, 12915, 16418, 16540, 14735, 15093, 
        14964, 15329, 12915, 12762, 12935, 15842, 16052, 16195, 16784
        ), class = "Date")), .Names = c("soc_sec", "group_count", 
    "total_creds", "group_start", "group_end"), class = c("tbl_df", 
    "data.frame"), row.names = c(NA, -20L))
    
    
    # Number of days covered by each row, both endpoints included. abs()
    # makes the count independent of which of the two dates comes first,
    # and as.integer() turns the difftime into the plain count that rep()
    # expects and that the `reps` column is meant to hold.
    diffs <- abs(as.integer(df$group_end - df$group_start)) + 1L

    # Repeat row i diffs[i] times. seq_len() keeps the indexing correct for
    # a zero-row data frame, where 1:nrow(df) would yield c(1, 0).
    df.rep <- df[rep(seq_len(nrow(df)), times = diffs), ]
    reps <- rep(diffs, times = diffs)

    # One daily sequence per row, always returned as a list. apply() is
    # avoided on purpose: it coerces the dates to character and, whenever
    # every row spans the same number of days, simplifies its result to a
    # matrix, which do.call(c, ...) below cannot handle.
    dates.l <- lapply(seq_len(nrow(df)), function(i) {
      ends <- c(df$group_start[i], df$group_end[i])
      seq(min(ends), max(ends), by = "day")
    })

    # Flatten the list into one long vector. Essentially the same as
    # unlist(), except that c() retains the Date class.
    days <- do.call(c, dates.l)

    # Combine the repeated rows, the per-row day counts and the dates by
    # column
    df.long <- cbind(df.rep, reps, days)

    str(df.long)

    # Convert the output back into the same tbl format as the input
    library(tibble)
    df.long <- as_tibble(df.long)
    

    【讨论】:

    • 感谢您的解决方案。我特别喜欢保留 Date 类型的 do.call。很好的提示。
    【解决方案2】:

    所以,我设法弄明白了,我想我应该把解决方案放在这里以防万一。它需要多个步骤,所以如果有人能想到更好的方法,请告诉我。

    首先,我创建了一个列来计算两个日期之间的天数。我需要这个,以便我知道每行要重复多少次

    # Count the calendar days a row spans, both endpoints included.
    #
    # x: one row as supplied by apply(df, 1, ...), i.e. a named vector whose
    #    "group_start" and "group_end" elements can be converted by as.Date().
    # Returns a single integer >= 1. The order of the two dates does not
    # matter, and identical dates give 1.
    calc_day_nums <- function(x){
      start <- as.Date(x[["group_start"]])
      end <- as.Date(x[["group_end"]])
      # Keep failing loudly on missing dates instead of returning NA.
      stopifnot(!is.na(start), !is.na(end))
      # Plain date arithmetic replaces three branches that each built a full
      # daily sequence only to take its length, and makes every case return
      # an integer (the equal-dates branch used to return a double).
      abs(as.integer(end - start)) + 1L
    }
    # Run the day counter over every row; apply() passes each row to it as
    # a named vector.
    df[["reps"]] <- apply(X = df, MARGIN = 1, FUN = calc_day_nums)
    

    然后,我自己创建了所有日子的向量

    # Return every day from x[i] to y[i], both included, as a Date vector.
    #
    # i: position of the row to expand.
    # x, y: vectors of start and end dates (anything as.Date() accepts).
    date_vec <- function(i, x, y){
      first_day <- as.Date(x[i])
      last_day <- as.Date(y[i])
      # Identical endpoints: the range is that single day.
      if (first_day == last_day) {
        return(first_day)
      }
      # `:` steps through the underlying day numbers one at a time; as.Date()
      # with the 1970-01-01 origin turns them back into dates.
      as.Date(first_day:last_day, origin="1970-01-01")
    }
    # Expand every row into its run of days, then join the pieces with c(),
    # which keeps the Date class. unlist() would reduce the dates to bare
    # numbers, and converting those back with as.Date() and no `origin` is
    # an error before R 4.3 unless a package such as zoo supplies a default.
    vec <- lapply(
      seq_along(df$group_start), date_vec,
      x = df$group_start, y = df$group_end
    )
    vec <- do.call(c, vec)
    

    之后,我对 data.frame 进行了正确的行重复次数

    # Repeat row i reps[i] times: one copy for each day in its range.
    df <- df[rep(seq_len(nrow(df)), times = df$reps), ]
    

    最后,我将向量绑定到 data.frame。此时我也可以将 vec 定义为 xts 索引:xt <- xts(x = df, order.by = vec),但我想将其添加到 data.frame 中。

    # Attach the dates to the repeated rows as a new `days` column.
    df <- bind_cols(df, data.frame(days = vec))
    

    【讨论】:

      猜你喜欢
      • 2012-03-11
      • 2017-10-29
      • 2013-02-12
      • 2017-02-14
      • 1970-01-01
      • 2015-02-09
      • 2018-04-03
      • 2015-03-24
      • 2023-03-09
      相关资源
      最近更新 更多