基于来自不同大小数据集的值范围的 R ifelse 语句答案

【问题标题】：R ifelse statement based on range of values from different size datasets基于来自不同大小数据集的值范围的 R ifelse 语句
【发布时间】：2017-12-12 15:20:08
【问题描述】：

我有两个大小不同的数据集，分别记录了人们服用两种不同药物的开始时间和停止时间。我想把它们合并起来，使任一数据集中出现的每个时间段都被显式列出，并带有表示两种药物使用情况的对应变量（0/1）。

示例数据：

library(dplyr)    
# Fix the RNG state so the random drugA column is reproducible.
set.seed(100)

# Drug A records: one row per person (id) and period [start, stop];
# drugA is a 0/1 indicator drawn at random for each of the 10 periods.
df <- data.frame(
  id    = rep(c(1, 2, 3), times = c(4, 3, 3)),
  start = c(0, 10, 16, 21, 0, 13, 21, 0, 6, 9),
  stop  = c(9, 15, 20, 24, 12, 20, 25, 5, 8, 14),
  drugA = rbinom(n = 10, size = 1, prob = 0.5)
)

# Drug B records: two periods per person, all flagged 1.
df2 <- data.frame(
  id    = rep(c(1, 2, 3), each = 2),
  start = c(12, 20, 2, 12, 17, 22),
  stop  = c(18, 25, 8, 17, 19, 25),
  drugB = rep(1, times = 6)
)

drugA/drugB 代表两种药物，其中 1 表示此人正在服用该药物，0 表示没有服用。对于 df2，任何未列出的时间段都表示此人在该期间没有服用药物，例如 id=1 在时间 0-12 内没有服用 drugB。

我想要的结果数据集是：

# Desired result, stored as dput() output: a data.frame with 57 rows and the
# columns id, start, stop, drugA and drugB (all integer). Each id (1, 2, 3)
# repeats the same 19 consecutive [start, stop] windows; drugA / drugB are the
# 0/1 indicators wanted for each window.
finaldf<-structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), 
    start = c(0L, 2L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 14L, 15L, 
    16L, 17L, 18L, 19L, 20L, 21L, 22L, 24L, 0L, 2L, 5L, 6L, 8L, 
    9L, 10L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 
    22L, 24L, 0L, 2L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 14L, 15L, 
    16L, 17L, 18L, 19L, 20L, 21L, 22L, 24L), stop = c(2L, 5L, 
    6L, 8L, 9L, 10L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
    20L, 21L, 22L, 24L, 25L, 2L, 5L, 6L, 8L, 9L, 10L, 12L, 13L, 
    14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 24L, 25L, 2L, 
    5L, 6L, 8L, 9L, 10L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 
    20L, 21L, 22L, 24L, 25L), drugA = c(0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L), drugB = c(0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 
    1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 
    1L, 0L, 0L, 0L, 1L, 1L)), .Names = c("id", "start", "stop", 
"drugA", "drugB"), class = "data.frame", row.names = c(NA, -57L
))

到目前为止，我尝试获取数据集的整体形状是：

# Every distinct interval boundary (start or stop) from both datasets, sorted.
# NOTE: the name `t` shadows base::t(); it is kept so existing code that
# refers to it keeps working.
t <- sort(unique(c(df$start, df$stop, df2$start, df2$stop)))

# One row per (id, boundary). The boundaries are repeated once per id
# explicitly: the original `rep(t, each = length(finaldf))` used the number of
# COLUMNS of the data frame (1) and only worked through silent recycling.
finaldf <- data.frame(
  id = rep(unique(df$id), each = length(t)),
  stop = rep(t, times = length(unique(df$id)))
)

# Within each id, the previous boundary is the start of the window that ends
# at `stop`. The first boundary has no predecessor (start is NA) and is
# dropped by the filter, leaving consecutive [start, stop] windows.
finaldf <- finaldf %>%
  group_by(id) %>%
  mutate(start = lag(stop)) %>%
  ungroup() %>%
  filter(start >= 0)

现在我想根据两个原始数据集中的时间来创建 drugA 和 drugB 变量，但在这一步遇到了问题。我曾笨拙地尝试在 dplyr 中按 id 使用 ifelse 语句来实现，但这两个数据集的大小显然不同，所以不确定这是否是正确的方法？

# For each window [starts[k], stops[k]] of person ids[k], return TRUE when the
# window lies entirely inside at least one row of `ref` that has the same id
# and whose `flag` column equals 1.
#   ids, starts, stops: equal-length vectors describing the windows
#   ref:  data frame with columns id, start, stop and the column named `flag`
#   flag: name (string) of the 0/1 drug column in `ref`
# Returns a logical vector with one element per window.
isOnDrug <- function(ids, starts, stops, ref, flag) {
  # Keep only the periods in which the drug was actually taken.
  on <- ref[which(ref[[flag]] == 1), , drop = FALSE]
  vapply(seq_along(ids), function(k) {
    any(on$id == ids[k] & on$start <= starts[k] & on$stop >= stops[k])
  }, logical(1))
}

# The source tables and `finaldf` have different numbers of rows, so their
# columns cannot be compared element by element inside ifelse(); each window
# is instead looked up against the matching person's periods.
finaldf <- finaldf %>%
  mutate(
    drugA = ifelse(isOnDrug(id, start, stop, df, "drugA"), 1, 0),
    drugB = ifelse(isOnDrug(id, start, stop, df2, "drugB"), 1, 0)
  )

任何帮助将不胜感激。谢谢

【问题讨论】：

你最后想要什么？所以请包括您的预期输出。
我已经在上面的代码中给出了我想要的最终数据集，参见上面的structure(list..。在运行边际结构模型时，我需要 drugA 和 drugB 的时变变量。

标签： r if-statement dataframe dplyr

【解决方案1】：

最简单的可能是首先将所有内容转换为更简单的长格式。具体来说，我会将所有内容转换为每单位时间有一行（即时间 1 的状态条目，时间 2 的状态条目等）。

为此，我首先按 id 拆分 data.frame（以便以后填充空白），然后按行拆分（把每个时间段展开为每单位时间一个条目）。然后，对于每种药物，我使用 tidyr 中的 complete 填充所有缺失的时间（假设这些时间没有服用药物）。您在此处的设计意味着每个人在研究中的时间都相同，但如果不是这样，您可以简单地在 lapply 函数中为每个人重新定义 allTimes。

# Latest time point found in either dataset; it is the end of the time grid.
maxTime <- max(c(df$stop, df2$stop))

# One entry per unit of time, starting at 0 and shared by every id.
allTimes <- 0:maxTime

# Every id that appears in at least one of the two datasets.
allIds <- sort(unique(c(df$id, df2$id)))

# Expand the start/stop rows of one person in `data` to one row per unit of
# time, then add every time in `times` not covered by any row with value 0.
#   data:    data frame with columns id, start, stop and the column `drugCol`
#   thisID:  the id to extract
#   drugCol: name (string) of the drug indicator column in `data`
#   times:   vector of time points the result must contain
# Returns a tibble with the columns time, id and `drugCol`.
expandDrug <- function(data, thisID, drugCol, times) {
  rows <- filter(data, id == thisID)

  # seq_len() (unlike 1:nrow()) iterates zero times when the person has no
  # rows in `data`.
  perRow <- lapply(seq_len(nrow(rows)), function(i) {
    tibble(
      id = thisID,
      time = rows$start[i]:rows$stop[i],
      value = rows[[drugCol]][i]
    )
  })

  # The typed zero-row tibble guarantees the columns exist even when `perRow`
  # is empty, so complete() can still build an all-zero series.
  skeleton <- tibble(id = thisID[0], time = integer(0), value = numeric(0))

  # tidyr is namespace-qualified because the script never attaches it.
  out <-
    bind_rows(skeleton, perRow) %>%
    tidyr::complete(time = times, fill = list(id = thisID, value = 0))

  names(out)[names(out) == "value"] <- drugCol
  out
}

# One row per id and unit of time, with both drug indicators side by side.
# To give people different observation windows, compute a per-person time
# vector inside the lapply() and pass it instead of `allTimes`.
fullData <-
  lapply(allIds, function(thisID) {
    left_join(
      expandDrug(df, thisID, "drugA", allTimes),
      expandDrug(df2, thisID, "drugB", allTimes),
      by = c("time", "id")
    )
  }) %>%
  bind_rows()

这段代码给出了整个数据的快照：

# Show the 1st, 5th, 10th and 15th row of every id as a quick snapshot.
group_by(fullData, id) %>%
  slice(c(1L, 5L, 10L, 15L))

    time    id drugA drugB
   <int> <dbl> <dbl> <dbl>
 1     0     1     0     0
 2     4     1     0     0
 3     9     1     0     0
 4    14     1     0     1
 5     0     2     0     0
 6     4     2     0     1
 7     9     2     0     0
 8    14     2     0     1
 9     0     3     0     0
10     4     3     0     0
11     9     3     0     0
12    14     3     0     0

我的猜测是，对于您计划的任何后续步骤，这种长格式实际上可能更好用，因为您可以单独处理每一天，例如绘制参与者状态图（这里使用 ggplot2）：

# ggplot2 is needed for the plot and is not attached anywhere else.
library(ggplot2)

# One fixed colour per drug state ("drugA-drugB"). Naming the values keeps the
# colour of each state stable even when a state does not occur in the data.
stateColours <- setNames(
  RColorBrewer::brewer.pal(4, "Set1")[c(3, 1, 2, 4)],
  c("0-0", "0-1", "1-0", "1-1")
)

# Tile plot: one tile per id and unit of time, filled by the combined state.
fullData %>%
  mutate(drugState = paste(drugA, drugB, sep = "-")) %>%
  ggplot(aes(x = time, y = id, fill = drugState)) +
  geom_tile(height = 0.9) +
  scale_fill_manual(values = stateColours)

但是，如果您真的想恢复原来的起停模式，您可以确定药物状态发生变化的时间点，然后为每个人总结该时期的情况：

# Collapse the one-row-per-time table back into start/stop periods: a new
# period begins whenever drugA or drugB differs from the previous row of the
# same id (the lag default of -1 makes the first row always start a period).
reformatted <-
  fullData %>%
  group_by(id) %>%
  mutate(
    period = cumsum(
      drugA != lag(drugA, default = -1) | drugB != lag(drugB, default = -1)
    )
  ) %>%
  group_by(id, period, drugA, drugB) %>%
  summarise(start = min(time), stop = max(time)) %>%
  ungroup()

      id period drugA drugB start  stop
   <dbl>  <int> <dbl> <dbl> <dbl> <dbl>
 1     1      1     0     0     0    11
 2     1      2     0     1    12    15
 3     1      3     1     1    16    18
 4     1      4     1     0    19    19
 5     1      5     1     1    20    20
 6     1      6     0     1    21    25
 7     2      1     0     0     0     1
 8     2      2     0     1     2     8
 9     2      3     0     0     9    11
10     2      4     0     1    12    17
11     2      5     0     0    18    20
12     2      6     1     0    21    25
13     3      1     0     0     0     5
14     3      2     1     0     6     8
15     3      3     0     0     9    16
16     3      4     0     1    17    19
17     3      5     0     0    20    21
18     3      6     0     1    22    25

【讨论】：

非常感谢！我实际上想要时变分析的开始、停止格式。
很高兴它对您有用。这是否完全满足了您的需求？如果是，请考虑接受答案，让其他人知道它可以解决您的问题 (stackoverflow.com/help/someone-answers)。如果没有，请告诉我还缺少什么，我也许可以很容易地扩展答案。
它做得很好，我只是想在接受答案之前再等等，看看是否有另一种解决方案，比如使用 ifelse 语句的基础 R 解决方案。也许还有更高效的方法（当前运行时间大约 25 分钟），尽管我知道任何方法都需要时间，因为我们正在创建一个大型数据集。如果没有，我会接受你的答案！再次感谢。