【问题标题】:Subsetting a data frame according to different time ranges specified in other data frame根据其他数据帧中指定的不同时间范围对数据帧进行子集
【发布时间】:2020-07-01 09:51:03
【问题描述】:

我这里有两个数据框。

SCORE_df <- data.frame("Participant" = rep(1:2, times=1, each=7), "Score" = as.integer(runif(14, 0, 100)), "Time" = c('17:00:00', '17:00:01', '17:00:02', '17:00:03', '17:00:04', '17:00:05', '17:00:06', '19:50:30', '19:50:31', '19:50:32', '19:50:33', '19:50:34', '19:50:35', '19:50:36'))
              
TIME_df <- data.frame("Participant" = c(1,2), "Start" = c('17:00:02', '19:50:31'), "End" = c('17:00:05', '19:50:33'))

> SCORE_df
   Participant Score     Time
1            1    56 17:00:00
2            1    77 17:00:01
3            1    27 17:00:02
4            1    78 17:00:03
5            1    46 17:00:04
6            1    22 17:00:05
7            1    35 17:00:06
8            2    26 19:50:30
9            2    64 19:50:31
10           2    29 19:50:32
11           2    29 19:50:33
12           2    90 19:50:34
13           2     0 19:50:35
14           2    51 19:50:36

> TIME_df
  Participant    Start      End
1           1 17:00:02 17:00:05
2           2 19:50:31 19:50:33

我想使用 TIME_df 中的开始和结束信息来对 SCORE_df 数据进行子集化 - 即,只保留“每个”参与者“开始到结束时间”(包括)“内”的分数数据。

下面的行做了错误的子集。

for(p in 1:nrow(SCORE_df)){
  ppt <- SCORE_df$Participant[p]
  SCORE_df_trimmed <- with(SCORE_df[which(SCORE_df$Participant==ppt),], SCORE_df[strptime(Time, "%H:%M:%S") >= strptime(TIME_df$Start[TIME_df$Participant==ppt], "%H:%M:%S") & strptime(Time, "%H:%M:%S") <= strptime(TIME_df$End[TIME_df$Participant==ppt], "%H:%M:%S"),])
}

> SCORE_df_trimmed
   Participant Score     Time
2            1    77 17:00:01
3            1    27 17:00:02
4            1    78 17:00:03
9            2    64 19:50:31
10           2    29 19:50:32
11           2    29 19:50:33

如果有人能查明上述行中的错误,我将不胜感激。

【问题讨论】:

    标签: r dataframe time subset


    【解决方案1】:

    data.table 的选项:

    library(data.table)
    setDT(SCORE_df)
    setDT(TIME_df)
    
    # POSIXct works with a date reference, so I add in just any date
    SCORE_df[, Time := as.POSIXct(paste0("2000-01-01 ", Time), tz = "UTC")]
    TIME_df[,  Start := as.POSIXct(paste0("2000-01-01 ", Start), tz = "UTC")]
    TIME_df[,  End := as.POSIXct(paste0("2000-01-01 ", End), tz = "UTC")]
    
    
    SCORE_df[TIME_df, 
             on = .(Participant = Participant, Time >= Start, Time <= End), 
             .(Participant, Score, Time = x.Time)]
    
    
    #    Participant Score                Time
    # 1:           1    49 2000-01-01 17:00:02
    # 2:           1    75 2000-01-01 17:00:03
    # 3:           1     8 2000-01-01 17:00:04
    # 4:           1     7 2000-01-01 17:00:05
    # 5:           2    49 2000-01-01 19:50:31
    # 6:           2    59 2000-01-01 19:50:32
    # 7:           2    13 2000-01-01 19:50:33
    

    【讨论】:

      【解决方案2】:

      你可以用这个:

      library(lubridate)
      
      SCORE_df$Time <- hms(SCORE_df$Time)
      TIME_df$Start <- hms(TIME_df$Start)
      TIME_df$End <- hms(TIME_df$End)
      
      library(dplyr)
      
      SCORE_df1 <- SCORE_df %>%
              filter(Participant == 1 & Time >= TIME_df$Start[1] & Time <= TIME_df$End[1])
      
      SCORE_df2 <- SCORE_df %>%
              filter(Participant == 2 & Time >= TIME_df$Start[2] & Time <= TIME_df$End[2])
      
      rbind(SCORE_df1, SCORE_df2)
      
        Participant Score        Time
      1           1    82   17H 0M 2S
      2           1    35   17H 0M 3S
      3           1    46   17H 0M 4S
      4           1    42   17H 0M 5S
      5           2    44 19H 50M 31S
      6           2    61 19H 50M 32S
      7           2    69 19H 50M 33S
      

      【讨论】:

        【解决方案3】:

        使用tidyverse中的函数你可以试试

        library(tidyverse)
        TIME_df %>% 
        mutate_at(vars(Start, End), lubridate::hms) %>% 
           pmap(., ~SCORE_df %>% 
               mutate(Time = lubridate::hms(Time)) %>% 
               filter(Time >= ..2 & Time <= ..3)) %>% 
          bind_rows()
        Participant Score        Time
        1           1    21   17H 0M 2S
        2           1    19   17H 0M 3S
        3           1    83   17H 0M 4S
        4           1    92   17H 0M 5S
        5           2    23 19H 50M 31S
        6           2    65 19H 50M 32S
        7           2    70 19H 50M 33S
        

        无需任何改造即可尝试

        SCORE_df %>% 
          left_join(pivot_longer(TIME_df, c(Start, End)), by=c("Participant", "Time" = "value"))  %>% 
          group_by(Participant) %>% 
          slice(which(name=="Start"):which(name=="End"))
        # A tibble: 7 x 4
        # Groups:   Participant [2]
        Participant Score Time     name 
        <dbl> <int> <chr>    <chr>
        1           1    21 17:00:02 Start
        2           1    19 17:00:03 NA   
        3           1    83 17:00:04 NA   
        4           1    92 17:00:05 End  
        5           2    23 19:50:31 Start
        6           2    65 19:50:32 NA   
        7           2    70 19:50:33 End 
        

        【讨论】:

          猜你喜欢
          • 1970-01-01
          • 2021-06-08
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 1970-01-01
          • 2013-10-25
          • 2021-07-31
          • 2021-06-25
          相关资源
          最近更新 更多