根据其他数据帧中指定的不同时间范围对数据帧进行子集答案

【问题标题】：Subsetting a data frame according to different time ranges specified in other data frame根据其他数据帧中指定的不同时间范围对数据帧进行子集
【发布时间】：2020-07-01 09:51:03
【问题描述】：

我这里有两个数据框。

SCORE_df <- data.frame("Participant" = rep(1:2, times=1, each=7), "Score" = as.integer(runif(14, 0, 100)), "Time" = c('17:00:00', '17:00:01', '17:00:02', '17:00:03', '17:00:04', '17:00:05', '17:00:06', '19:50:30', '19:50:31', '19:50:32', '19:50:33', '19:50:34', '19:50:35', '19:50:36'))
              
TIME_df <- data.frame("Participant" = c(1,2), "Start" = c('17:00:02', '19:50:31'), "End" = c('17:00:05', '19:50:33'))

> SCORE_df
   Participant Score     Time
1            1    56 17:00:00
2            1    77 17:00:01
3            1    27 17:00:02
4            1    78 17:00:03
5            1    46 17:00:04
6            1    22 17:00:05
7            1    35 17:00:06
8            2    26 19:50:30
9            2    64 19:50:31
10           2    29 19:50:32
11           2    29 19:50:33
12           2    90 19:50:34
13           2     0 19:50:35
14           2    51 19:50:36

> TIME_df
  Participant    Start      End
1           1 17:00:02 17:00:05
2           2 19:50:31 19:50:33

我想使用 TIME_df 中的开始和结束信息来对 SCORE_df 数据进行子集化 - 即，只保留“每个”参与者“开始到结束时间”（包括）“内”的分数数据。

下面的行做了错误的子集。

for(p in 1:nrow(SCORE_df)){
  ppt <- SCORE_df$Participant[p]
  SCORE_df_trimmed <- with(SCORE_df[which(SCORE_df$Participant==ppt),], SCORE_df[strptime(Time, "%H:%M:%S") >= strptime(TIME_df$Start[TIME_df$Participant==ppt], "%H:%M:%S") & strptime(Time, "%H:%M:%S") <= strptime(TIME_df$End[TIME_df$Participant==ppt], "%H:%M:%S"),])
}

> SCORE_df_trimmed
   Participant Score     Time
2            1    77 17:00:01
3            1    27 17:00:02
4            1    78 17:00:03
9            2    64 19:50:31
10           2    29 19:50:32
11           2    29 19:50:33

如果有人能查明上述行中的错误，我将不胜感激。

【问题讨论】：

标签： r dataframe time subset

【解决方案1】：

data.table 的选项：

library(data.table)
setDT(SCORE_df)
setDT(TIME_df)

# POSIXct works with a date reference, so I add in just any date
SCORE_df[, Time := as.POSIXct(paste0("2000-01-01 ", Time), tz = "UTC")]
TIME_df[,  Start := as.POSIXct(paste0("2000-01-01 ", Start), tz = "UTC")]
TIME_df[,  End := as.POSIXct(paste0("2000-01-01 ", End), tz = "UTC")]


SCORE_df[TIME_df, 
         on = .(Participant = Participant, Time >= Start, Time <= End), 
         .(Participant, Score, Time = x.Time)]


#    Participant Score                Time
# 1:           1    49 2000-01-01 17:00:02
# 2:           1    75 2000-01-01 17:00:03
# 3:           1     8 2000-01-01 17:00:04
# 4:           1     7 2000-01-01 17:00:05
# 5:           2    49 2000-01-01 19:50:31
# 6:           2    59 2000-01-01 19:50:32
# 7:           2    13 2000-01-01 19:50:33

【讨论】：

【解决方案2】：

你可以用这个：

library(lubridate)

SCORE_df$Time <- hms(SCORE_df$Time)
TIME_df$Start <- hms(TIME_df$Start)
TIME_df$End <- hms(TIME_df$End)

library(dplyr)

SCORE_df1 <- SCORE_df %>%
        filter(Participant == 1 & Time >= TIME_df$Start[1] & Time <= TIME_df$End[1])

SCORE_df2 <- SCORE_df %>%
        filter(Participant == 2 & Time >= TIME_df$Start[2] & Time <= TIME_df$End[2])

rbind(SCORE_df1, SCORE_df2)

  Participant Score        Time
1           1    82   17H 0M 2S
2           1    35   17H 0M 3S
3           1    46   17H 0M 4S
4           1    42   17H 0M 5S
5           2    44 19H 50M 31S
6           2    61 19H 50M 32S
7           2    69 19H 50M 33S

【讨论】：

【解决方案3】：

使用tidyverse中的函数你可以试试

library(tidyverse)
TIME_df %>% 
mutate_at(vars(Start, End), lubridate::hms) %>% 
   pmap(., ~SCORE_df %>% 
       mutate(Time = lubridate::hms(Time)) %>% 
       filter(Time >= ..2 & Time <= ..3)) %>% 
  bind_rows()
Participant Score        Time
1           1    21   17H 0M 2S
2           1    19   17H 0M 3S
3           1    83   17H 0M 4S
4           1    92   17H 0M 5S
5           2    23 19H 50M 31S
6           2    65 19H 50M 32S
7           2    70 19H 50M 33S

无需任何改造即可尝试

SCORE_df %>% 
  left_join(pivot_longer(TIME_df, c(Start, End)), by=c("Participant", "Time" = "value"))  %>% 
  group_by(Participant) %>% 
  slice(which(name=="Start"):which(name=="End"))
# A tibble: 7 x 4
# Groups:   Participant [2]
Participant Score Time     name 
<dbl> <int> <chr>    <chr>
1           1    21 17:00:02 Start
2           1    19 17:00:03 NA   
3           1    83 17:00:04 NA   
4           1    92 17:00:05 End  
5           2    23 19:50:31 Start
6           2    65 19:50:32 NA   
7           2    70 19:50:33 End

【讨论】：