基于日期间隔的列的左连接子集答案

【问题标题】：Left Join Subset of Column Based on Date Interval基于日期间隔的列的左连接子集
【发布时间】：2021-09-22 16:45:45
【问题描述】：

我有两个数据框：df1 - 由 2 个不同的 MRI 阅读器对受试者 1 进行一系列 MRI 阅读； df2 - 受试者 1 的类固醇使用列表

我正在尝试根据df1$DOS 是否包含在lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE) 中来执行df2$STATUS 到df1 的左连接。如果该df1$DOS 没有可用的df2$STATUS，我希望该值为NA。

例如，最终的 df 应该是这样的，因为 df1$DOS 位于第一行 lubridate::interval(start = df2$START_DATE, end = df2$STOP_DATE)：

输入示例

df1 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), READER = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 4L, 4L, 4L, 4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    DOS = structure(c(15469, 15497, 15530, 15579, 15635, 15691, 
    15747, 15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 
    16678, 16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 
    17686, 17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 
    18113, 18176, 15469, 15497, 15530, 15579, 15635, 15691, 15747, 
    15805, 15901, 16010, 16101, 16202, 16293, 16422, 16547, 16678, 
    16841, 17035, 17217, 17331, 17372, 17497, 17588, 17652, 17686, 
    17714, 17770, 17826, 17882, 17938, 17994, 18050, 18079, 18113, 
    18176), class = "Date"), VISIT = c("1 Screening", "1 Week 04", 
    "1 Week 08", "1 Week 16", "Unscheduled 01", "Unscheduled 02", 
    "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", "Unscheduled 06", 
    "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", "Unscheduled 10", 
    "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", "Unscheduled 14", 
    "Unscheduled 15", "Unscheduled", "Unscheduled 16", "Unscheduled 17", 
    "Unscheduled", "2 Screening", "2 Week 04", "2 Week 08", "2 Week 16", 
    "Unscheduled 18", "Unscheduled 19", "Unscheduled 20", "Unscheduled 21", 
    "Unscheduled 22", "Unscheduled 23", "Unscheduled 24", "Unscheduled 25", 
    "1 Screening", "1 Week 04", "1 Week 08", "1 Week 16", "Unscheduled 01", 
    "Unscheduled 02", "Unscheduled 03", "Unscheduled 04", "Unscheduled 05", 
    "Unscheduled 06", "Unscheduled 07", "Unscheduled 08", "Unscheduled 09", 
    "Unscheduled 10", "Unscheduled 11", "Unscheduled 12", "Unscheduled 13", 
    "Unscheduled 14", "Unscheduled 15", "Unscheduled", "Unscheduled 16", 
    "Unscheduled 17", "Unscheduled", "2 Screening", "2 Week 04", 
    "2 Week 08", "2 Week 16", "Unscheduled 18", "Unscheduled 19", 
    "Unscheduled 20", "Unscheduled 21", "Unscheduled 22", "Unscheduled 23", 
    "Unscheduled 24", "Unscheduled 25"), MOS_DUR = c(0, 0.903225806451613, 
    2, 3.58064516129032, 5.41935483870968, 7.25806451612903, 
    9.07142857142857, 11.0333333333333, 14.1935483870968, 17.741935483871, 
    20.7096774193548, 24.0967741935484, 27.0645161290323, 31.2903225806452, 
    35.4333333333333, 39.7096774193548, 45.0344827586207, 51.4193548387097, 
    57.3928571428571, 61.1666666666667, 62.5161290322581, 66.6, 
    69.6071428571429, 71.7333333333333, 72.8387096774194, 73.7666666666667, 
    75.5806451612903, 77.4193548387097, 79.258064516129, 81.0714285714286, 
    82.9677419354839, 84.8064516129032, 85.7666666666667, 86.8709677419355, 
    88.9333333333333, 0, 0.903225806451613, 2, 3.58064516129032, 
    5.41935483870968, 7.25806451612903, 9.07142857142857, 11.0333333333333, 
    14.1935483870968, 17.741935483871, 20.7096774193548, 24.0967741935484, 
    27.0645161290323, 31.2903225806452, 35.4333333333333, 39.7096774193548, 
    45.0344827586207, 51.4193548387097, 57.3928571428571, 61.1666666666667, 
    62.5161290322581, 66.6, 69.6071428571429, 71.7333333333333, 
    72.8387096774194, 73.7666666666667, 75.5806451612903, 77.4193548387097, 
    79.258064516129, 81.0714285714286, 82.9677419354839, 84.8064516129032, 
    85.7666666666667, 86.8709677419355, 88.9333333333333), DAYS_DUR = c(0, 
    28, 61, 110, 166, 222, 278, 336, 432, 541, 632, 733, 824, 
    953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 2028, 2119, 
    2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 2581, 2610, 
    2644, 2707, 0, 28, 61, 110, 166, 222, 278, 336, 432, 541, 
    632, 733, 824, 953, 1078, 1209, 1372, 1566, 1748, 1862, 1903, 
    2028, 2119, 2183, 2217, 2245, 2301, 2357, 2413, 2469, 2525, 
    2581, 2610, 2644, 2707), SPD = c(1845.52, 1658.84, 1924.06, 
    1914.49, 2094.22, 1890.46, 1369.93, 985.96, 570.47, 348.88, 
    500.76, 416.6, 407.92, 409.56, 362.33, 329.35, 326.93, 267.39, 
    304.2, 825.13, 399.47, 438.16, 800.82, 1947.56, 2357.86, 
    2959.97, 3066.57, 4235.27, 3308.1, 5321.45, 2840.12, 4162.53, 
    5379.55, 4132.86, 6570.47, 1375.57, 1744.6, 2522.15, 1828.55, 
    2013.06, 1555.05, 1294.24, 1048.76, 439.1, 402, 276.74, 121.37, 
    142.34, 72.9, 264.92, 62.27, 72.96, 68.91, 51.56, 251.33, 
    86.53, 134.48, 337.51, 1209.22, 1158.13, 1165.09, 2432.43, 
    2141.32, 3463.67, 5276.63, 3883.76, 4004.92, 4678.29, 4942.69, 
    5159.52), SPD_NADIR = c(1845.52, 1658.84, 1658.84, 1658.84, 
    1658.84, 1658.84, 1369.93, 985.96, 570.47, 348.88, 348.88, 
    348.88, 348.88, 348.88, 348.88, 329.35, 326.93, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 267.39, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1294.24, 
    1048.76, 439.1, 402, 276.74, 121.37, 121.37, 72.9, 72.9, 
    62.27, 62.27, 62.27, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 51.56, 
    51.56, 51.56), PCT_DIFF_NADIR = c(NA, -0.101153062551476, 
    0.15988280967423, 0.154113718019821, 0.262460514576451, 0.139627691639941, 
    -0.174163873550192, -0.280284394093129, -0.421406547932979, 
    -0.38843409819973, 0.435335932125659, 0.194106856225636, 
    0.169227241458381, 0.173927998165558, 0.0385519376289841, 
    -0.0559791332263242, -0.00734780628510708, -0.182118496314196, 
    0.137664086166274, 2.0858670855305, 0.493960133138861, 0.638655147911291, 
    1.99495119488388, 6.28359325330042, 7.81805602303751, 10.0698605033846, 
    10.4685291147762, 14.8392984030816, 11.3718164478851, 18.9014548038446, 
    9.62163880474214, 14.5672613037137, 19.1187404166199, 14.4562997868282, 
    23.5726092972811, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.189671158363209, -0.581315076852664, -0.0844910043270326, 
    -0.311592039800995, -0.561429500614295, 0.172777457361786, 
    -0.399357337068468, 2.63401920438957, -0.145816186556927, 
    0.171671752047535, 0.106632407258712, -0.171992933997109, 
    3.87451512800621, 0.678238944918541, 1.60822342901474, 5.54596586501164, 
    22.4526764934057, 21.4617920868891, 21.5967804499612, 46.1766873545384, 
    40.5306439100078, 66.1774631497285, 101.339604344453, 74.3250581846393, 
    76.6749418153607, 89.7348719937936, 94.8628782001551, 99.0682699767261
    ), SPD_BL = c(1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 1845.52, 
    1845.52, 1845.52, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 1375.57, 
    1375.57, 1375.57), PCT_DIFF_BL = c(NA, -0.101153062551476, 
    0.042557111274871, 0.0373715809094456, 0.134758767176731, 
    0.024350860462092, -0.257699726906238, -0.465754909185487, 
    -0.690889288655772, -0.810958429060644, -0.728661840565261, 
    -0.77426416402965, -0.778967445489618, -0.77807880705709, 
    -0.803670510208505, -0.821540812345572, -0.822852095886254, 
    -0.855114005808661, -0.835168407820018, -0.552901079370584, 
    -0.783546100827951, -0.762581819758117, -0.566073518574711, 
    0.0552906497897611, 0.277612813732715, 0.60386774459231, 
    0.661629242706663, 1.29489249642377, 0.792502926004595, 1.88344206510902, 
    0.53892669816637, 1.25547813082492, 1.91492370713945, 1.23940136113399, 
    2.56022692791192, NA, 0.268274242677581, 0.833530827220716, 
    0.329303488735579, 0.463436975217546, 0.13047682051804, -0.0591245810827511, 
    -0.237581511664256, -0.680786873805041, -0.707757511431625, 
    -0.798817944561164, -0.911767485478747, -0.896522895963128, 
    -0.947003787520809, -0.807410746090711, -0.954731493126486, 
    -0.946960169238934, -0.949904403265555, -0.96251735644133, 
    -0.817290286935598, -0.93709516782134, -0.902236890888868, 
    -0.754639894734546, -0.120931686500869, -0.158072653518178, 
    -0.153012932820576, 0.768306956388988, 0.556678322440879, 
    1.51798890641698, 2.83595891157847, 1.82338230697093, 1.91146215750562, 
    2.40098286528494, 2.59319409408463, 2.75082329507041), NLFLG = c(0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1), CLINDET_DT = structure(c(18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 18177, 
    18177, 18177, 18177, 18177, 18177, 18177), class = "Date")), row.names = c(NA, 
-70L), groups = structure(list(SUBJID = c(1, 1), READER = structure(c(1L, 
4L), .Label = c("1", "2", "3", "4", "5"), class = "factor"), 
    .rows = structure(list(1:35, 36:70), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, -2L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

df2 = structure(list(SUBJID = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1), DRUG = structure(c("Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone", "Dexamethasone", "Dexamethasone", 
"Dexamethasone", "Dexamethasone"), label = "Agent Name", format.sas = "$"), 
    ROUTE = structure(c("PO", "IVI", "IVI", "PO", "PO", "PO", 
    "PO", "PO", "IVI", "PO", "PO", "PO", "PO", "PO", "PO", "PO"
    ), label = "Route", format.sas = "$"), START_DT = structure(c(15455, 
    15469, 15470, 15471, 15472, 15473, 15474, 15475, 17653, 18061, 
    18079, 18115, 18122, 18129, 18136, 18142), class = "Date"), 
    STOP_DT = structure(c(15468, 15469, 15470, 15471, 15472, 
    15473, 15474, 15475, 17653, 18067, 18114, 18121, 18128, 18135, 
    18141, NA), class = "Date"), DAYS_DUR = c(13, 0, 0, 0, 0, 
    0, 0, 0, 0, 6, 35, 6, 6, 6, 5, NA), DEXEQ = c(0, 10, 30, 
    8, 8, 4, 2, 1, 4, 4, 4, 3.5, 3, 2.5, 2, 8), PHYSFLAG = c(0, 
    1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1), STATUS = c("STABLE", 
    "INCREASE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "STABLE", "STABLE", "INCREASE", "INCREASE", "INCREASE", "INCREASE", 
    "INCREASE", "INCREASE", "STABLE", "INCREASE")), row.names = c(NA, 
-16L), groups = structure(list(SUBJID = 1, .rows = structure(list(
    1:16), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", 
"list"))), row.names = c(NA, -1L), class = c("tbl_df", "tbl", 
"data.frame"), .drop = TRUE), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"))

example  = structure(list(SUBJID = 1, READER = structure(1L, .Label = c("1", 
"2", "3", "4", "5"), class = "factor"), DOS = structure(15469, class = "Date"), 
    VISIT = "1 Screening", MOS_DUR = 0, DAYS_DUR = 0, SPD = 1845.52, 
    SPD_NADIR = 1845.52, PCT_DIFF_NADIR = NA_real_, SPD_BL = 1845.52, 
    PCT_DIFF_BL = NA_real_, NLFLG = 0, CLINDET_DT = structure(18177, class = "Date"), 
    STATUS = "STABLE"), row.names = c(NA, -1L), class = c("tbl_df", 
"tbl", "data.frame"))

【问题讨论】：

请通过dput()发布示例数据
请阅读r标签页顶部的发帖信息。请注意，需要一个完整的自包含示例，包括所有输入、代码和库语句，并且不应使用图像作为输入，因为除了重新输入之外，没有其他人可以轻松使用它
@Daniel 和@G.Grothendieck - 添加了dput() 示例。我很抱歉。除了图像之外，我还将示例添加为数据集。

标签： r left-join lubridate

【解决方案1】：

这是在data.table 包中使用foverlaps 的一种解决方案（dplyr 仅用于一些清洁）。基本上，我们在这里将 DOS 变量视为一个间隔，为开始日期和结束日期设置相同的值，然后我们通过重叠间隔加入。

另一种解决方案是通过 ID 到 full_join，然后例如使用 lubridate 中的 %within% 但这不会保留不匹配的行。

请注意，我们不能在df2 中包含那些开始或结束变量的日期为 NA 的行。因此，这些会在加入之前被删除。

library(data.table)
library(dplyr)

dt1 <- as.data.table(df1 %>% mutate(start = DOS, end = DOS))
dt2 <- as.data.table(df2 %>% 
                       select(SUBJID, start = START_DT, end =STOP_DT, STATUS) %>%
                       filter(!is.na(start)&!is.na(end)))
setkey(dt1, SUBJID, start, end)
setkey(dt2, SUBJID, start, end)
df3 <- foverlaps(dt1, dt2, type="within") %>% 
  select(-i.start, -i.end)

`Tidyverse`解决方案

在这里，我们应用一个完全连接和过滤器来查找所有匹配项，然后与原始df1 数据集进行一次完全连接，以保留完全不匹配的行。

library(lubridate)

df3 <- df1 %>% 
  full_join(df1 %>%
              full_join(df2 %>% 
                          select(SUBJID,START_DT, STOP_DT, STATUS), by = "SUBJID") %>%
              filter(DOS %within% interval(START_DT, STOP_DT)))

【讨论】：

您是否也可以添加lubridate/full_join 解决方案？我通常在 tidyverse 框架内工作，而不是 data.table。非常感谢！
好的，请看我的编辑。

Tidyverse解决方案

`Tidyverse`解决方案