【问题标题】:Adding missing hours to dataframe in R(在 R 中向数据框添加缺少的小时数)
【发布时间】:2021-06-09 20:04:54
【问题描述】:

我有一个数据框,其中缺少 Time GMT 中的某些时间。
通常,小时数应按 00:00 到 23:00 的顺序显示,但有时会错过一个小时。

如果序列中缺少一个小时,我想插入一个新行。
新行将是前一行的副本,但以下列更改如下:

  • Time GMT: 将包含上一行的下一个小时。即,如果先前 == 5:00,则新 == 6:00

  • Sample Measurement:将包含 Sample Measurement 列中上一个值和下一个值之间的平均值。

  • MDL: 将包含 MDL 列中上一个值和下一个值之间的平均值

我尝试了什么

library(dplyr)
library(tidyr)

# Time.GMT is stored as text ("HH:MM"), not as a numeric or date-time class.
class(hrOzone$Time.GMT)   # output type "character"

# Attempt: within each GMT date, expand Time.GMT to a complete set of hours.
# NOTE(review): full_seq(01:24, 1) produces the numbers 1..24, which cannot
# line up with the character values held in Time.GMT ("00:00" .. "23:00");
# `fill` names a column `count` that is not among the columns in the dput()
# output of hrOzone; and the pipeline result is never assigned to anything.
hrOzone %>%
  group_by(Date.GMT) %>%
  complete(Time.GMT = full_seq(01:24, 1), fill = list(count = 0))

# Inspect the first 100 Time.GMT values of the (unmodified) hrOzone.
head(hrOzone$Time.GMT, n = 100L)

数据源地址:https://drive.google.com/file/d/1o1voBktR3i8ROt1Hp59OW5t3_uukIA3j/view?usp=sharing

Date GMT  Time GMT  Sample Measurement  MDL

01/03/2016 21:00:00 0.036 0.005
01/03/2016 22:00:00 0.035 0.007
01/03/2016 23:00:00 0.029 0.008
02/03/2016 00:00:00 0.03
02/03/2016 01:00:00 0.01
02/03/2016 02:00:00
02/03/2016 03:00:00
02/03/2016 04:00:00
02/03/2016 05:00:00
02/03/2016 07:00:00
02/03/2016 08:00:00
02/03/2016 09:00:00
02/03/2016 10:00:00
02/03/2016 11:00:00
02/03/2016 12:00:00
02/03/2016 13:00:00
02/03/2016 14:00:00
02/03/2016 16:00:00
02/03/2016 17:00:00
02/03/2016 18:00:00
02/03/2016 19:00:00
02/03/2016 20:00:00
02/03/2016 21:00:00
02/03/2016 22:00:00
02/03/2016 23:00:00
02/03/2016 00:00:00
03/03/2016 01:00:00
03/03/2016 02:00:00
03/03/2016 03:00:00
03/03/2016 04:00:00
03/03/2016 06:00:00
03/03/2016 08:00:00

根据 Waldi 要求更新

> dput(head(hrOzone,20))
structure(list(State.Code = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), County.Code = c(3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L), Site.Num = c(10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L
), Parameter.Code = c(44201L, 44201L, 44201L, 44201L, 44201L, 
44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 
44201L, 44201L, 44201L, 44201L, 44201L, 44201L, 44201L), POC = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), Latitude = c(30.497478, 30.497478, 30.497478, 30.497478, 
30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 
30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 30.497478, 
30.497478, 30.497478, 30.497478, 30.497478), Longitude = c(-87.880258, 
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258, 
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258, 
-87.880258, -87.880258, -87.880258, -87.880258, -87.880258, -87.880258, 
-87.880258), Datum = c("NAD83", "NAD83", "NAD83", "NAD83", "NAD83", 
"NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", 
"NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", "NAD83", 
"NAD83"), Parameter.Name = c("Ozone", "Ozone", "Ozone", "Ozone", 
"Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", 
"Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", "Ozone", 
"Ozone", "Ozone"), Date.Local = c("2016-03-01", "2016-03-01", 
"2016-03-01", "2016-03-01", "2016-03-01", "2016-03-01", "2016-03-01", 
"2016-03-01", "2016-03-01", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02"), Time.Local = c("15:00", 
"16:00", "17:00", "18:00", "19:00", "20:00", "21:00", "22:00", 
"23:00", "01:00", "02:00", "03:00", "04:00", "05:00", "06:00", 
"07:00", "08:00", "10:00", "11:00", "12:00"), Date.GMT = c("2016-03-01", 
"2016-03-01", "2016-03-01", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02", 
"2016-03-02", "2016-03-02", "2016-03-02", "2016-03-02"), Time.GMT = c("21:00", 
"22:00", "23:00", "00:00", "01:00", "02:00", "03:00", "04:00", 
"05:00", "07:00", "08:00", "09:00", "10:00", "11:00", "12:00", 
"13:00", "14:00", "16:00", "17:00", "18:00"), Sample.Measurement = c(0.041, 
0.041, 0.042, 0.041, 0.038, 0.038, 0.036, 0.035, 0.029, 0.026, 
0.03, 0.03, 0.028, 0.027, 0.025, 0.023, 0.025, 0.034, 0.036, 
0.038), Units.of.Measure = c("Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million", 
"Parts per million", "Parts per million", "Parts per million"
), MDL = c(0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 
0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 
0.005, 0.005, 0.005), Uncertainty = c(NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Qualifier = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", ""), Method.Type = c("FEM", "FEM", "FEM", "FEM", "FEM", 
"FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", "FEM", 
"FEM", "FEM", "FEM", "FEM", "FEM", "FEM"), Method.Code = c(47L, 
47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 
47L, 47L, 47L, 47L, 47L, 47L), Method.Name = c("INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET", "INSTRUMENTAL - ULTRA VIOLET", 
"INSTRUMENTAL - ULTRA VIOLET"), State.Name = c("Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", 
"Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"
), County.Name = c("Baldwin", "Baldwin", "Baldwin", "Baldwin", 
"Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", 
"Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", "Baldwin", 
"Baldwin", "Baldwin", "Baldwin", "Baldwin"), Date.of.Last.Change = c("2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20", 
"2016-06-20", "2016-06-20", "2016-06-20", "2016-06-20")), row.names = c(NA, 
20L), class = "data.frame")
> 

【问题讨论】:

  • 你能dput(head(hrOzone,20))吗?下载 1.9Gb 的数据毫无意义。
  • @Waldi,我刚刚按照您的要求更新了问题
  • 如果您查看此向量,您会发现缺少 06:00:它直接从 "05:00" 跳到 "07:00"。Time.GMT = c("21:00", "22:00", "23:00", "00:00", "01:00", "02:00", "03:00", "04:00", "05:00", "07:00", "08:00", "09:00", "10:00", "11:00", "12:00", "13:00", "14:00", "16:00", "17:00", "18:00"),
  • 好的,谢谢,明白了!
  • 如果没有先前的值会怎样?

标签: r dplyr tidyverse


【解决方案1】:

线性插值的另一种变体:

library(dplyr)
library(purrr)
library(lubridate)

# Combine the GMT date and time text columns into a single date-time column so
# the observations sit on one continuous time axis.
data <- hrOzone %>%
  select(Date.GMT, Time.GMT, Sample.Measurement) %>%
  mutate(date = ymd(Date.GMT) + hm(Time.GMT))

# Linearly interpolate Sample.Measurement on an hourly grid (3600-second
# steps) running from the first to the last observation. Hours that are absent
# from the data receive an interpolated value.
data.extended <- approx(x = data$date,
                        y = data$Sample.Measurement,
                        xout = seq(from = min(data$date),
                                   to   = max(data$date),
                                   by   = 3600))

# Build one output row per hour, re-deriving the Date.GMT / Time.GMT text
# columns from each timestamp of the hourly grid.
# (The original call had one closing parenthesis too many and did not parse.)
map2_dfr(data.extended$x, data.extended$y,
         ~ list(DateTime.Gmt = .x,
                Date.GMT = format(.x, "%Y-%m-%d"),
                Time.GMT = format(.x, "%H:%M"),
                Sample.Measurement = .y))

# A tibble: 22 x 4
   DateTime.Gmt        Date.GMT   Time.GMT Sample.Measurement
   <dttm>              <chr>      <chr>                 <dbl>
 1 2016-03-01 21:00:00 2016-03-01 21:00                0.041 
 2 2016-03-01 22:00:00 2016-03-01 22:00                0.041 
 3 2016-03-01 23:00:00 2016-03-01 23:00                0.042 
 4 2016-03-02 00:00:00 2016-03-02 00:00                0.041 
 5 2016-03-02 01:00:00 2016-03-02 01:00                0.038 
 6 2016-03-02 02:00:00 2016-03-02 02:00                0.038 
 7 2016-03-02 03:00:00 2016-03-02 03:00                0.036 
 8 2016-03-02 04:00:00 2016-03-02 04:00                0.035 
 9 2016-03-02 05:00:00 2016-03-02 05:00                0.029 
10 2016-03-02 06:00:00 2016-03-02 06:00                0.0275
# ... with 12 more rows

【讨论】:

  • 是的,这可能比合并部分更干净。
  • @Waldi,我运行了你的代码,它可以工作。据我所见,它创建了一个新的数据框,其中包含两个新变量 x 和 y,其中 x 的格式为 %y/%m/%d : %hh/%mm/%ss。但是,我需要修改 hrOzone$Time.GMT 以包含完整的小时序列。
  • 查看我的编辑以获取purrr::map2_dfr的可能解决方案
【解决方案2】:

你可以使用tidyverse:

library(dplyr)
library(tidyr)
library(stringr)

# Insert one row for every hour missing between the first and the last GMT
# timestamp, linearly interpolate Sample.Measurement and MDL on those rows,
# and copy every other column from the preceding row.
#
# NOTE(review): assumes hrOzone holds a single time series (one monitoring
# site); if several sites are present, group by the site keys first - confirm
# against the full data set.
hrOzone %>%
  # Work on a real UTC date-time so that gaps spanning midnight and runs of
  # several consecutive missing hours are handled like any other gap
  # (averaging the neighbouring hour numbers breaks in both cases).
  mutate(datetime.GMT = as.POSIXct(str_c(Date.GMT, " ", Time.GMT),
                                   format = "%Y-%m-%d %H:%M", tz = "UTC")) %>%
  # Span only min..max so that no empty rows are created before the first or
  # after the last observation.
  complete(datetime.GMT = seq(min(datetime.GMT), max(datetime.GMT),
                              by = "hour")) %>%
  arrange(datetime.GMT) %>%
  mutate(Date.GMT = format(datetime.GMT, "%Y-%m-%d", tz = "UTC"),
         Time.GMT = format(datetime.GMT, "%H:%M", tz = "UTC"),
         # For a single missing hour, linear interpolation equals the mean of
         # the previous and the next value, as requested in the question.
         Sample.Measurement = approx(as.numeric(datetime.GMT),
                                     Sample.Measurement,
                                     xout = as.numeric(datetime.GMT))$y,
         MDL = approx(as.numeric(datetime.GMT), MDL,
                      xout = as.numeric(datetime.GMT))$y) %>%
  # All remaining columns of an inserted row are copied from the row above.
  # NOTE(review): this also copies Date.Local / Time.Local unchanged; adjust
  # them separately if the local time of inserted rows must advance as well.
  fill(everything(), .direction = "down") %>%
  # Drop the helper column so the result keeps hrOzone's original columns.
  select(-datetime.GMT)

【讨论】:

    【解决方案3】:

    线性插值 FTW,它也应该考虑到您有连续缺失值的情况。

    # First and last observed GMT timestamp (date at midnight UTC plus the
    # "HH:MM" offset), then every hour in between.
    dtrng <- range(
      as.POSIXct(hrOzone$Date.GMT, tz = "UTC") +
        as.difftime(hrOzone$Time.GMT, format = "%H:%M")
    )
    dts <- seq(from = dtrng[1], to = dtrng[2], by = "1 hour")

    # Left-join the observations onto the full hourly grid. `orig` is 1 on rows
    # that came from hrOzone and NA on rows that exist only in the grid.
    out <- merge(
      x = data.frame(Date.GMT = format(dts, "%Y-%m-%d"),
                     Time.GMT = format(dts, "%H:%M")),
      y = cbind(hrOzone, orig = 1),
      all.x = TRUE
    )

    # Columns to interpolate and the row positions of the inserted hours.
    vars <- c("Sample.Measurement", "MDL")
    sel <- which(is.na(out$orig))

    # approx() on a single vector interpolates against row position, so each
    # inserted row gets a value on the straight line between its nearest
    # observed neighbours (this also covers runs of consecutive missing hours).
    out[sel, vars] <- lapply(vars, function(v) approx(out[[v]], xout = sel)$y)
    

    输出:

    #     Date.GMT Time.GMT Sample.Measurement   MDL orig
    #1  2016-03-01    21:00             0.0410 0.005    1
    #2  2016-03-01    22:00             0.0410 0.005    1
    #3  2016-03-01    23:00             0.0420 0.005    1
    #4  2016-03-02    00:00             0.0410 0.005    1
    #5  2016-03-02    01:00             0.0380 0.005    1
    #6  2016-03-02    02:00             0.0380 0.005    1
    #7  2016-03-02    03:00             0.0360 0.005    1
    #8  2016-03-02    04:00             0.0350 0.005    1
    #9  2016-03-02    05:00             0.0290 0.005    1
    #10 2016-03-02    06:00             0.0275 0.005   NA
    #11 2016-03-02    07:00             0.0260 0.005    1
    #12 2016-03-02    08:00             0.0300 0.005    1
    #13 2016-03-02    09:00             0.0300 0.005    1
    #14 2016-03-02    10:00             0.0280 0.005    1
    #15 2016-03-02    11:00             0.0270 0.005    1
    #16 2016-03-02    12:00             0.0250 0.005    1
    #17 2016-03-02    13:00             0.0230 0.005    1
    #18 2016-03-02    14:00             0.0250 0.005    1
    #19 2016-03-02    15:00             0.0295 0.005   NA
    #20 2016-03-02    16:00             0.0340 0.005    1
    #21 2016-03-02    17:00             0.0360 0.005    1
    #22 2016-03-02    18:00             0.0380 0.005    1
    

    【讨论】:

    • 好主意!
    【解决方案4】:

    这是使用 zoo 包的 na.approx 进行线性插值的 tidyverse 方法。

    library(dplyr)
    library(lubridate)
    library(tidyr)
    library(zoo)
    
    # Insert a row for every hour missing between the first and last GMT
    # timestamp and linearly interpolate Sample.Measurement and MDL on it.
    hrOzone %>%
      as_tibble() %>%
      select(Date.GMT, Time.GMT, Sample.Measurement, MDL) %>%
      # Merge the two text columns and parse them into one date-time column.
      unite(datetime, Date.GMT, Time.GMT, sep = " ") %>%
      mutate(datetime = ymd_hm(datetime)) %>%
      # Complete over the whole range rather than per calendar day: completing
      # inside a per-day group cannot insert an hour that is missing at the
      # very start or end of a day (e.g. 23:00 followed by 01:00).
      complete(datetime = seq(min(datetime), max(datetime), by = "hour")) %>%
      # Derived after completion so inserted rows get a date as well.
      mutate(date = as.Date(datetime), .before = datetime) %>%
      # Rows are now evenly spaced, so index-based interpolation is linear in
      # time. na.rm = FALSE keeps the vector length unchanged if the series
      # happens to start or end with NA.
      mutate(across(c(Sample.Measurement, MDL),
                    ~ na.approx(.x, na.rm = FALSE)))
    

    对于共享的数据,此返回 -

    #         date            datetime Sample.Measurement   MDL
    #1  2016-03-01 2016-03-01 21:00:00             0.0410 0.005
    #2  2016-03-01 2016-03-01 22:00:00             0.0410 0.005
    #3  2016-03-01 2016-03-01 23:00:00             0.0420 0.005
    #4  2016-03-02 2016-03-02 00:00:00             0.0410 0.005
    #5  2016-03-02 2016-03-02 01:00:00             0.0380 0.005
    #6  2016-03-02 2016-03-02 02:00:00             0.0380 0.005
    #7  2016-03-02 2016-03-02 03:00:00             0.0360 0.005
    #8  2016-03-02 2016-03-02 04:00:00             0.0350 0.005
    #9  2016-03-02 2016-03-02 05:00:00             0.0290 0.005
    #10 2016-03-02 2016-03-02 06:00:00             0.0275 0.005
    #11 2016-03-02 2016-03-02 07:00:00             0.0260 0.005
    #12 2016-03-02 2016-03-02 08:00:00             0.0300 0.005
    #13 2016-03-02 2016-03-02 09:00:00             0.0300 0.005
    #14 2016-03-02 2016-03-02 10:00:00             0.0280 0.005
    #15 2016-03-02 2016-03-02 11:00:00             0.0270 0.005
    #16 2016-03-02 2016-03-02 12:00:00             0.0250 0.005
    #17 2016-03-02 2016-03-02 13:00:00             0.0230 0.005
    #18 2016-03-02 2016-03-02 14:00:00             0.0250 0.005
    #19 2016-03-02 2016-03-02 15:00:00             0.0295 0.005
    #20 2016-03-02 2016-03-02 16:00:00             0.0340 0.005
    #21 2016-03-02 2016-03-02 17:00:00             0.0360 0.005
    #22 2016-03-02 2016-03-02 18:00:00             0.0380 0.005
    

    【讨论】:

      猜你喜欢
      • 2014-06-03
      • 2021-11-15
      • 2018-03-07
      • 2013-02-15
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多