您的目标对我来说并不完全清楚,但这是我的阅读:如果 date.date 中的时间(忽略日期)在 start_date 和 end_date 之间,您希望按 Id 进行子集化。
我是这样处理的:
library(dplyr)
df1<-data.frame(start_date=seq(as.POSIXct("2014-09-04 00:00:00"), by = "hour", length.out = 10),
end_date=seq(as.POSIXct("2014-09-04 05:00:00"), by = "hour", length.out = 10),
values=runif(20,10,50),id=rep(seq(from=1,to=5,by=1),2))
date1<-data.frame(date=seq(as.POSIXct("2012-10-01 00:00:00"), by = "hour", length.out = 20), id=1)
date2<-data.frame(date=seq(as.POSIXct("2014-10-01 07:00:00"), by = "hour", length.out = 20), id=2)
date3<-data.frame(date=seq(as.POSIXct("2015-10-01 01:00:00"), by = "hour", length.out = 20), id=3)
df2<-data.frame(date=rbind(date1,date2,date3),values=runif(60,50,90))
df <- left_join(df1, df2, by = c("id" = "date.id")) %>%
mutate(date.date.hms = strftime(date.date, format = "%H:%M:%S"),
start_date.hms = strftime(start_date, format = "%H:%M:%S"),
end_date.hms = strftime(end_date, format = "%H:%M:%S")) %>%
mutate(date.date.hms = as.POSIXct(date.date.hms, format="%H:%M:%S"),
start_date.hms = as.POSIXct(start_date.hms, format="%H:%M:%S"),
end_date.hms = as.POSIXct(end_date.hms, format="%H:%M:%S")) %>%
group_by(id) %>%
filter(date.date.hms >= start_date.hms & date.date.hms <= end_date.hms) %>%
select(start_date, end_date, x_values = values.x, y_values = values.y, id, date.date) %>%
ungroup()
这会产生以下数据框:
> df
# A tibble: 62 x 6
start_date end_date x_values y_values id date.date
<dttm> <dttm> <dbl> <dbl> <dbl> <dttm>
1 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 77.5 1 2012-10-01 00:00:00
2 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 54.5 1 2012-10-01 01:00:00
3 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 70.3 1 2012-10-01 02:00:00
4 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 85.5 1 2012-10-01 03:00:00
5 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 82.2 1 2012-10-01 04:00:00
6 2014-09-04 00:00:00 2014-09-04 05:00:00 31.5 57.4 1 2012-10-01 05:00:00
7 2014-09-04 01:00:00 2014-09-04 06:00:00 37.0 78.8 2 2014-10-02 01:00:00
8 2014-09-04 01:00:00 2014-09-04 06:00:00 37.0 51.9 2 2014-10-02 02:00:00
9 2014-09-04 02:00:00 2014-09-04 07:00:00 34.1 85.8 3 2015-10-01 02:00:00
10 2014-09-04 02:00:00 2014-09-04 07:00:00 34.1 69.4 3 2015-10-01 03:00:00
我的方法是首先按 Id 加入 DF,然后将时间信息从日期(在 .hms 列中)拆分为字符串,并将其转换回 POSIXct 对象。这会将今天的日期添加到时间,但如果我只想对时间(而不是日期)应用过滤器,那没关系。这会产生一个 DF,其中记录在 start_date 和 end_date 内具有 date.date TIME。
现在很容易按 Id 列进行子集化。
这就是你所追求的吗?
更新
LauraR 解释说 df1 和 df2 中的日期有重叠。她在示例中更新了 df1 和 df2。通过该更新,我可以重写代码而无需将 POSIXct 转换为字符,反之亦然。看来 as.POSIXct 是一个缓慢的操作。
我现在可以执行以下操作:
- 删除所有的日期时间转换,只检查df2中的日期时间是否在df1的日期时间范围内
- 重写 dplyr 和 baseR 中的代码:我们知道管道会产生大量开销。
- 将代码转换为函数,以便我对它们进行基准测试。
附上代码:
library(dplyr)
library(microbenchmark)
df1 <- data.frame(start_date=seq(as.POSIXct("2014-09-04 00:00:00"), by = "hour", length.out = 10),
end_date=seq(as.POSIXct("2014-09-04 05:00:00"), by = "hour", length.out = 10),
values=runif(20,10,50),id=rep(seq(from=1,to=5,by=1),2))
date1 <-data.frame(date = seq(as.POSIXct("2012-09-04 02:00:00"),
by = "hour",
length.out = 20), id = 1)
date2 <-data.frame(date = seq(as.POSIXct("2014-09-03 07:00:00"),
by = "hour",
length.out = 20),id = 2)
date3 <-data.frame(date = seq(as.POSIXct("2014-09-04 01:00:00"),
by = "hour", l
ength.out = 20),id = 3)
df2 <-data.frame(date = rbind(date1,date2,date3), values = runif(60,50,90))
dplyr2 <- function(df1, df2) {
df <- left_join(df1, df2, by = c("id" = "date.id")) %>%
group_by(id) %>%
filter(date.date >= start_date &
date.date <= end_date) %>%
select(start_date,
end_date,
x_values = values.x,
y_values = values.y,
id,
date.date) %>%
ungroup()
}
baseR2 <- function(df1, df2) {
df_bR <- merge(df1, df2, by.x = "id", by.y = "date.id")
df_bR <- subset(
df_bR,
subset = df_bR$date.date >= df_bR$start_date &
df_bR$date.date <= df_bR$end_date,
select = c(start_date, end_date, values.x, values.y, id, date.date)
)
}
data_baseR <- baseR2(df1, df2)
data_dplyr <- dplyr2(df1, df2)
microbenchmark(baseR = baseR2(df1, df2),
dplyr = dplyr2(df1, df2),
times = 5)
这段代码比以前快了很多,我相信它需要更少的内存。 dplyr 和 baseR 的比较:
> data_baseR <- baseR2(df1, df2)
> microbenchmark(baseR = baseR2(df1, df2),
+ dplyr = dplyr2(df1, df2),
+ times = 5)
Unit: microseconds
expr min lq mean median uq max neval
baseR 897.5 905.3 1868.66 991.2 1041.0 5508.3 5
dplyr 5755.9 5970.2 6158.88 6277.4 6393.3 6397.6 5
显示 baseR 代码运行得更快。