【问题标题】:R: How to obtain difference in weeks between a "POSIXlt" date and the first occurrence of a "POSIXlt" date from the same vector(R:如何计算同一向量中某个“POSIXlt”日期与该向量中首次出现的“POSIXlt”日期之间相差的周数)
【发布时间】:2015-06-30 22:24:42
【问题描述】:

我有一个超过一百万行的数据框(代理及其呼叫指标,按天汇总)。每个代理都会出现多次,因为他们每天要处理多个队列中的呼叫 (d1$Calls)。我想确定每个代理已上岗工作的周数。通常,我可以使用“difftime”来计算代理的开始日期 (d1$Start) 与任意一次交互日期 (d1$Interaction) 之间的差值:

# Whole weeks elapsed between each interaction and the agent's recorded start.
floor(difftime(time1 = d1$Interaction, time2 = d1$Start, units = "weeks"))

但是,我的系统的开始日期不可靠,通常会导致负数周数:

dput(d1)
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 2L), .Label = c("a123", "b123"), class = "factor"), QUEUE = structure(c(9L, 
8L, 7L, 6L, 5L, 3L, 4L, 1L, 2L, 4L), .Label = c("MHEK", "MMED", 
"MMEF", "MMEM", "MNEM", "MSED", "MSEE", "MSEK", "MSEP"), class = "factor"), 
Calls = c(1L, 4L, 25L, 14L, 6L, 25L, 5L, 1L, 1L, 3L), Interaction = structure(list(
sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(2L, 2L, 6L, 12L, 
12L, 2L, 6L, 6L, 6L, 6L), mon = c(0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L), year = c(115L, 115L, 115L, 115L, 
115L, 115L, 115L, 115L, 115L, 115L), wday = c(5L, 5L, 
2L, 1L, 1L, 5L, 2L, 2L, 2L, 2L), yday = c(1L, 1L, 5L, 
11L, 11L, 1L, 5L, 5L, 5L, 5L), isdst = c(0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", "PST", 
"PST", "PST", "PST", "PST", "PST", "PST", "PST"), gmtoff = c(NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_)), .Names = c("sec", 
"min", "hour", "mday", "mon", "year", "wday", "yday", "isdst", 
"zone", "gmtoff"), class = c("POSIXlt", "POSIXt")), Start = structure(list(
sec = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), min = c(0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), hour = c(0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L), mday = c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L), mon = c(2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L), year = c(115L, 115L, 115L, 115L, 
115L, 115L, 115L, 115L, 115L, 115L), wday = c(0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), yday = c(59L, 59L, 59L, 
59L, 59L, 59L, 59L, 59L, 59L, 59L), isdst = c(0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), zone = c("PST", "PST", 
"PST", "PST", "PST", "PST", "PST", "PST", "PST", "PST"
), gmtoff = c(NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_, 
NA_integer_, NA_integer_)), .Names = c("sec", "min", 
"hour", "mday", "mon", "year", "wday", "yday", "isdst", "zone", 
"gmtoff"), class = c("POSIXlt", "POSIXt")), Weeks = structure(c(-9, 
-9, -8, -7, -7, -9, -8, -8, -8, -8), units = "weeks", class = "difftime")), .Names = c("ID", 
"QUEUE", "Calls", "Interaction", "Start", "Weeks"), row.names = c(NA, 
-10L), class = "data.frame")

为了解决这个问题,我想计算每个交互日期 (d1$Interaction) 与该代理 (d1$ID) 在系统中第一次交互日期之间相差的周数。这该如何实现?

【问题讨论】:

    标签: r difftime


    【解决方案1】:

    这对我有用(都在基础 R 中):

    # Split the data frame into one piece per agent ID.
    mylist <- split(df, factor(df$ID))

    # For each agent, sort by interaction time and add `weekdif`: the gap, in
    # weeks, between each interaction and the previous one (0 for the first row).
    # The per-agent pieces are then stacked back into one data frame with
    # do.call(rbind, ...); data.table::rbindlist is a faster alternative.
    mydata <- do.call(rbind,
    lapply(mylist, function(x) {
                   # Order each group chronologically.
                   x <- x[order(x$Interaction), ]
                   # Convert once to POSIXct; difftime() would do this anyway.
                   when <- as.POSIXct(x$Interaction)
                   n <- length(when)
                   # when[-1] is "2nd..last" and when[-n] is "1st..penultimate".
                   # Unlike 2:n and 1:(n - 1), which count backwards when n == 1,
                   # these are empty for a single-row group, so such an agent
                   # gets weekdif = 0 instead of the assignment error
                   # "replacement has 3 rows, data has 1".
                   gaps <- difftime(when[-1], when[-n], units = "weeks")
                   # Prepend 0 for the first row so the column has n entries.
                   x$weekdif <- c(0, as.numeric(gaps))
                   x
    }))
    

    输出:

    > mydata
              ID QUEUE Calls Interaction      Start    Weeks   weekdif
    a123.1  a123  MSEP     1  2015-01-02 2015-03-01 -9 weeks 0.0000000
    a123.2  a123  MSEK     4  2015-01-02 2015-03-01 -9 weeks 0.0000000
    a123.3  a123  MSEE    25  2015-01-06 2015-03-01 -8 weeks 0.5714286
    a123.4  a123  MSED    14  2015-01-12 2015-03-01 -7 weeks 0.8571429
    a123.5  a123  MNEM     6  2015-01-12 2015-03-01 -7 weeks 0.0000000
    b123.6  b123  MMEF    25  2015-01-02 2015-03-01 -9 weeks 0.0000000
    b123.7  b123  MMEM     5  2015-01-06 2015-03-01 -8 weeks 0.5714286
    b123.8  b123  MHEK     1  2015-01-06 2015-03-01 -8 weeks 0.0000000
    b123.9  b123  MMED     1  2015-01-06 2015-03-01 -8 weeks 0.0000000
    b123.10 b123  MMEM     3  2015-01-06 2015-03-01 -8 weeks 0.0000000
    

    我将函数更改为以下内容,现在它可以按您的意愿工作了:

    #you need to import this for the na.locf function
    library(zoo)
    
    # Split the data frame into one piece per agent ID.
    mylist <- split(df, factor(df$ID))
    # For each agent, `weekdif` is the gap in weeks to the previous *distinct*
    # interaction time; rows sharing a timestamp repeat the last non-zero gap.
    mydata <- do.call(rbind,
    lapply(mylist, function(x) {
      x <- x[order(x$Interaction), ]
      when <- as.POSIXct(x$Interaction)
      n <- length(when)
      # Negative indexing yields empty vectors when n == 1, so a single-row
      # agent gets weekdif = 0. The former 2:n / 1:(n - 1) indexing counted
      # backwards there and failed with "replacement has 3 rows, data has 1".
      gaps <- c(0, as.numeric(difftime(when[-1], when[-n], units = "weeks")))
      # A zero gap means "no time passed since the previous row"; mark it
      # missing so it can be filled from the last non-zero gap.
      gaps[gaps == 0] <- NA
      # Carry the last observed gap forward (see ?na.locf). na.rm = FALSE keeps
      # the leading NAs in place so the vector length still matches nrow(x).
      filled <- na.locf(gaps, na.rm = FALSE)
      # Leading rows have no earlier gap to inherit: report them as 0.
      filled[is.na(filled)] <- 0
      x$weekdif <- filled
      x
    }))
    

    输出:

              ID QUEUE Calls Interaction      Start    Weeks   weekdif
    a123.1  a123  MSEP     1  2015-01-02 2015-03-01 -9 weeks 0.0000000
    a123.2  a123  MSEK     4  2015-01-02 2015-03-01 -9 weeks 0.0000000
    a123.3  a123  MSEE    25  2015-01-06 2015-03-01 -8 weeks 0.5714286
    a123.4  a123  MSED    14  2015-01-12 2015-03-01 -7 weeks 0.8571429
    a123.5  a123  MNEM     6  2015-01-12 2015-03-01 -7 weeks 0.8571429
    b123.6  b123  MMEF    25  2015-01-02 2015-03-01 -9 weeks 0.0000000
    b123.7  b123  MMEM     5  2015-01-06 2015-03-01 -8 weeks 0.5714286
    b123.8  b123  MHEK     1  2015-01-06 2015-03-01 -8 weeks 0.5714286
    b123.9  b123  MMED     1  2015-01-06 2015-03-01 -8 weeks 0.5714286
    b123.10 b123  MMEM     3  2015-01-06 2015-03-01 -8 weeks 0.5714286
    

    每个 id 的第一个值为 0,因为没有以前的交互日期。

    【讨论】:

    • 您知道为什么我在应用到我的实际数据帧时可能会收到此错误吗?
    • `$<-.data.frame`(`*tmp*`, "weekdif", value = c(0, NA, 0)) 中的错误:替换有 3 行,数据有 1 行
    • 代码应该能够在任何具有交互列的数据框中运行。这是一个赋值错误,让我认为您从上面复制代码时出错了?没有看到实际数据或发生错误的地方很难调试......
    • 通过这个示例,您能告诉我为什么“weekdif”的第 4 (0.857) 行和第 5 (0.000) 行对于相同的 ID 和相同的交互日期是不同的吗?它们都应该是 0.857,不是吗?我需要每一行都有一个值。谢谢。
    • 因为它正在计算“任何交互日期(d1$Interaction)和第一次交互日期”之间的时间差(以周为单位)。因此,如果 ID 和交互日期相同,则相差 0 周。如果您希望在 ID 和交互日期相同的情况下使用之前的值填充所有行,您应该在问题中提及它。此外,大多数用户通常会欣赏您想要的输出。我稍后会努力解决这个问题。
    猜你喜欢
    • 2016-07-25
    • 1970-01-01
    • 2018-06-22
    • 1970-01-01
    • 2014-02-24
    • 2021-12-31
    • 2019-11-13
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多