【问题标题】:Combine the row values based on time interval根据时间间隔组合行值
【发布时间】:2021-03-08 10:43:55
【问题描述】:

我有一个这样的数据框

# Sample input: nine alarm events, all raised on the same node.
node <- rep("ABCC", 9)

activity <- c(
  "NODE_ISOLATION",
  "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
  "NODE_ISOLATION",
  "LOSS_OF_MULTIPLEX_SECTION-OMS_A",
  "NODE_ISOLATION",
  "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
  "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
  "UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A",
  "UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A"
)

# Event timestamps, kept as character strings at this point.
e <- c(
  "2020-05-08 16:11:58",
  "2020-05-08 16:11:58",
  "2020-05-08 16:30:07",
  "2020-05-09 03:00:08",
  "2020-05-09 03:08:08",
  "2020-05-09 03:28:08",
  "2020-05-09 13:08:08",
  "2020-05-09 13:10:08",
  "2020-05-09 13:28:08"
)

df <- data.frame(node = node, activity = activity, e = e)
df

我想根据 30 分钟的时间间隔组合行值。 所需的输出如下

# Desired result: one row per 30-minute burst of events.
node <- rep("ABCC", 3)

# Activities of each burst, joined with commas in their original order.
activity <- c(
  "NODE_ISOLATION,NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF,NODE_ISOLATION",
  "LOSS_OF_MULTIPLEX_SECTION-OMS_A,NODE_ISOLATION,NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
  "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF,UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A,UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A"
)

# Number of events in each burst.
cnt <- rep(3, 3)
df1 <- data.frame(node = node, activity = activity, cnt = cnt)
df1

【问题讨论】:

    标签: r dplyr


    【解决方案1】:

    这看起来像是时间差的累积总和,带有重置。

    设置threshold,这是组中的最大时差。在这种情况下,使用 30 分钟(30 * 60 = 1800 秒)。

    确保 `e` 列是 `POSIXct` 类型。

    函数 `cumsum_with_reset` 计算时间差的累积总和;一旦累积值达到或超过 `threshold`,就从当前行开始一个新组,并将累积值重置为零。

    因此,使用此自定义函数的方法是:首先,用 `group_by(node)` 使计算在每个 `node` 内分别进行。用 `diff` 计算相邻行之间的时间差。用自定义函数为每一行确定所属的 `group`(组内累积时间差不超过 30 分钟)。然后,再按这个新的 `group` 分组,计算每个组的行数,并用 `toString` 将 `activity` 合并为逗号分隔的字符串。

    library(tidyverse)
    
    # Largest accumulated gap allowed inside one group: 30 minutes, in seconds
    threshold <- 60 * 30
    
    # Parse the timestamp column as date-time so gaps between rows can be computed
    df[["e"]] <- as.POSIXct(df[["e"]])
    
    #' Assign group ids from a running sum that resets at a threshold
    #'
    #' Walks `x` in order, accumulating its values. Whenever the running total
    #' reaches or exceeds `threshold`, the element that caused it starts a new
    #' group and the running total is reset to zero.
    #'
    #' @param x Numeric vector of increments (e.g. gaps between consecutive rows).
    #' @param threshold Single number; running total at which a new group starts.
    #' @return Numeric vector, same length as `x`, of zero-based group ids.
    cumsum_with_reset <- function(x, threshold) {
      # Preallocate the output instead of growing it with c() on every iteration.
      result <- numeric(length(x))
      # Named `running_total` rather than `cumsum` so base::cumsum() is not masked.
      running_total <- 0
      group <- 0
      for (i in seq_along(x)) {
        running_total <- running_total + x[i]
        if (running_total >= threshold) {
          # This element pushed the total over the limit: it opens the next group.
          group <- group + 1
          running_total <- 0
        }
        result[i] <- group
      }
      result
    }
    
    # Within each node, split rows into groups using the accumulated gap
    # between consecutive timestamps, then collapse each group to one row.
    df %>%
      group_by(node) %>%
      # Difference the raw epoch seconds. diff() on POSIXct returns a difftime
      # whose units are chosen automatically (secs/mins/hours/...), and
      # c(0, <difftime>) drops those units, so the values could silently stop
      # being comparable with a threshold expressed in seconds.
      mutate(diff = c(0, diff(as.numeric(e))),
             group = cumsum_with_reset(diff, threshold)) %>%
      group_by(group, .add = TRUE) %>%
      # .groups = "drop" returns an ungrouped result and avoids the
      # regrouping message from summarise().
      summarise(cnt = n(),
                activity = toString(activity),
                .groups = "drop") %>%
      dplyr::select(-group)
    

    输出

      node    cnt activity                                                                                                         
      <chr> <int> <chr>                                                                                                            
    1 ABCC      3 NODE_ISOLATION, NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF, NODE_ISOLATION                                              
    2 ABCC      3 LOSS_OF_MULTIPLEX_SECTION-OMS_A, NODE_ISOLATION, NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF                             
    3 ABCC      3 NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF, UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A, UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A
    

    【讨论】:

    • 非常感谢您的贡献。我已经达到了期望的输出。
    • 我有一个问题是任何两个活动同时发生如何修改?
    • 您能进一步澄清一下吗?也许用例子更新你的问题?如果同时进行 2 个活动,您希望发生什么?
    • 您的示例的前两行似乎具有相同的时间。但看起来我的输出与您想要的数据框df1 相同。你想要不同的结果吗?如果是这样,请随时将您的问题编辑为您最终的df1 应该是什么样子,并带有同时具有活动的示例数据。我很乐意回来再次编辑答案。
    【解决方案2】:
    # Sample input: nine alarm events, all raised on the same node.
    node <- rep("ABCC", 9)
    activity <- c(
      "NODE_ISOLATION",
      "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
      "NODE_ISOLATION",
      "LOSS_OF_MULTIPLEX_SECTION-OMS_A",
      "NODE_ISOLATION",
      "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
      "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
      "UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A",
      "UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A"
    )
    
    # Event timestamps, kept as character strings at this point.
    e <- c(
      "2020-05-08 16:11:58",
      "2020-05-08 16:11:58",
      "2020-05-08 16:30:07",
      "2020-05-09 03:00:08",
      "2020-05-09 03:08:08",
      "2020-05-09 03:28:08",
      "2020-05-09 13:08:08",
      "2020-05-09 13:10:08",
      "2020-05-09 13:28:08"
    )
    
    df <- data.frame(node = node, activity = activity, e = e)
    
    library(dplyr)
    ##### Parse e as POSIXct
    df <- mutate(df, e = as.POSIXct(e))
    
    ##### 1st breaks option: automatic, one break every 30 minutes
    start <- as.POSIXct("2020-05-08 16:00:00") # Can be min(as.POSIXct(df$e))
    # Extend 30 minutes past the last event so it falls inside a bin.
    end <- max(df$e) + 30 * 60
    breaks_a <- seq(from = start, to = end, by = "30 min")
    
    ##### 2nd breaks option: manual, hand-picked boundaries
    breaks_m <- do.call(c, lapply(c("2020-05-08 16:00:00",
                                    "2020-05-08 16:31:00",
                                    "2020-05-09 03:30:00",
                                    "2020-05-09 13:30:00"),
                                  as.POSIXct))
    
    ##### Label every row with the bin it falls in, once per set of breaks
    df <- mutate(df,
                 e_cut_automatic = cut(e, breaks_a),
                 e_cut_manual = cut(e, breaks_m))
    
    # Collapse a data frame of events into a single summary row.
    #
    # x: data frame with `node` and `activity` columns.
    # Returns a one-row data frame holding the first `node` value, all
    # `activity` values joined with "," and the number of rows in `x`.
    count_act <- function(x) {
      joined <- paste(x$activity, collapse = ",")
      data.frame(node = x$node[1], activity = joined, cnt = nrow(x))
    }
    
    # Summarise a data frame of events separately for every node.
    #
    # x: data frame with a `node` column (plus whatever count_act() reads).
    # Returns the per-node results of count_act() stacked with rbind().
    count_n <- function(x) {
      per_node <- split(x, x$node)
      summaries <- lapply(per_node, count_act)
      do.call("rbind", summaries)
    }
    
    
    ##### With automatic breaks (recommended)
    # Summarise each automatic time bin per node and stack the results.
    r <- split(df, df$e_cut_automatic) %>%
      lapply(count_n) %>%
      do.call(what = "rbind")
    print(r, row.names = FALSE)
    

    输出:

    ##### With manual breaks
    # Summarise each hand-picked time bin per node and stack the results.
    r <- split(df, df$e_cut_manual) %>%
      lapply(count_n) %>%
      do.call(what = "rbind")
    print(r, row.names = FALSE)
    

    输出:

    【讨论】:

    • 大数据集有什么建议
    • 当然,看看我的新答案。
    【解决方案3】:

    试试这个:

    # Sample input: nine alarm events, all raised on the same node.
    node <- rep("ABCC", 9)
    activity <- c(
      "NODE_ISOLATION",
      "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
      "NODE_ISOLATION",
      "LOSS_OF_MULTIPLEX_SECTION-OMS_A",
      "NODE_ISOLATION",
      "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
      "NE_NOT_REACH_VIA_PRIMARY_MNG_INTERF",
      "UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A",
      "UNDERLYING_RESOURCE_UNAVAILABLE-OMS_A"
    )
    # Event timestamps, kept as character strings at this point.
    e <- c(
      "2020-05-08 16:11:58",
      "2020-05-08 16:11:58",
      "2020-05-08 16:30:07",
      "2020-05-09 03:00:08",
      "2020-05-09 03:08:08",
      "2020-05-09 03:28:08",
      "2020-05-09 13:08:08",
      "2020-05-09 13:10:08",
      "2020-05-09 13:28:08"
    )
    
    df <- data.frame(node = node, activity = activity, e = e)
    
    # Bucket events into 30-minute bins per node and collapse each bin into
    # one row holding the joined activities and the event count.
    # The data frame is used directly: data.table is never loaded in this
    # file, so as.data.table() would not be found, and plain dplyr verbs do
    # not need it.
    df %>%
      mutate(e_cut = cut(as.POSIXct(e), "30 min")) %>%
      group_by(node, e_cut) %>%
      # .groups = "drop" returns an ungrouped result and avoids the
      # regrouping message from summarise().
      summarise(activity = paste(activity, collapse = ","),
                cnt = n(),
                .groups = "drop") %>%
      select(-e_cut)
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2021-06-20
      • 2015-03-07
      • 2019-06-23
      • 1970-01-01
      • 1970-01-01
      • 2020-10-07
      • 1970-01-01
      相关资源
      最近更新 更多