【问题标题】:R: Replace NA with date value. If date value exists in both columns choose the earliest date(R:用日期值替换 NA。如果两列中都存在日期值,则选择最早的日期)
【发布时间】:2020-04-07 07:56:34
【问题描述】:

由于其他 StackOverflow 线程上的建议,我正在处理两个数据帧并将它们合并。现在,如果它们满足某些条件,我正在努力将一列中的值获取到另一列中。

date_first_followup 列几乎为空,但由于历史使用情况,其中确实包含一些值。 Activity.Date.x 列中始终包含一个日期,但它并不总是与 date_first_followup 列相同的日期值。

我需要做的是让 R 将后续列中的所有 NA 替换为 Activity 列中的日期。每当一行在两列中都有值时,取最早的日期。

date_first_followup Activity.Date.x
NA                  NA
NA                  NA
6/30/2018           NA
4/13/2015           3/25/2015
NA                  NA
NA                  NA
NA                  3/21/2018
NA                  4/2/2018
NA                  4/13/2018
NA                  NA
4/13/2015           11/6/2016

我不知道这些字段的格式是否会影响结果,但它们最初是因子(factor)类型;我尝试过 as.Date() 的各种 format 选项,也尝试过直接保留为因子。

#Trying out a coalesce based solution.
#Use Coalesce to include follow up dates from activities whenever there isn't data already
#present in the followup field.
#
# NOTE(review): the format string on the next line puts "/" between "%" and the
# conversion letter ("%/m%/d%/Y"), unlike the "%m/%d/%Y" used for the follow-up
# column — presumably a typo; confirm. as.Date() returns NA for input that does
# not match the given format.
testVlookup_df$Activity.Date.x <- as.Date(testVlookup_df$Activity.Date.x, format = "%/m%/d%/Y")
testVlookup_df$date_first_followup <- as.Date(testVlookup_df$date_first_followup, format = "%m/%d/%Y")
#
# coalesce() keeps, per element, the first non-missing value among its arguments,
# so this only fills NA follow-up dates; it never swaps an existing follow-up date
# for an earlier activity date.
# NOTE(review): the second argument reads column "ctivity.Date.x" (no leading "A"),
# which does not match the "Activity.Date.x" column assigned above — confirm.
testVlookup_df$date_first_followup <- dplyr::coalesce(testVlookup_df$date_first_followup, testVlookup_df$ctivity.Date.x)

其他尝试

#12.10.19 attempting to ONLY pull activity dates in when the column is NULL OR when the column has a value that has a later date.
# NOTE(review): is.null() tests the column object as a whole and returns a single
# logical value; it does not test each element for NA (is.na() does that).
# ifelse() returns a result shaped like its test, so a length-one test produces a
# length-one result.
# NOTE(review): isTRUE() also returns a single TRUE/FALSE and is FALSE whenever
# its argument has more than one element, so the element-wise `>` comparison is
# collapsed before the inner ifelse() sees it.
# NOTE(review): ifelse() does not keep the Date class of its yes/no values, so the
# as.Date() results come back as plain numbers.
# NOTE(review): the last as.Date() uses format "%/d%/m%/Y", which differs from the
# "%m/%d/%Y" used elsewhere in this call — presumably a typo; confirm.
testVlookup_df$date_first_followup <- ifelse(is.null(testVlookup_df$date_first_followup) == TRUE,
                                             as.Date(testVlookup_df$Activity.Date.x, format = "%m/%d/%Y"),
                                             ifelse(isTRUE(as.Date(testVlookup_df$date_first_followup, format = "%m/%d/%Y") > as.Date(testVlookup_df$Activity.Date.x, format = "%m/%d/%Y")) == TRUE,
                                                    as.Date(testVlookup_df$Activity.Date.x, format = "%/d%/m%/Y"), NA)
)

又一次尝试

#12.12.19 58th attempt. Invert the ifelses
# Same idea as the previous attempt with the branches swapped: the outer test
# compares the two columns as integers, the inner test checks for NULL.
# NOTE(review): isTRUE() yields a single TRUE/FALSE (FALSE for any argument longer
# than one element) and is.null() tests the column object as a whole, so neither
# test is evaluated per row; ifelse() then returns a result of the test's length.
# NOTE(review): ifelse() drops the Date class of the as.Date() values it returns.


testVlookup_df$date_first_followup <-
  ifelse(
    isTRUE(
      # Left side: strips "-" from the text form of the follow-up value before
      # converting to integer. Right side: as.integer() is not a date parser, so
      # the "%Y/%m/%d" argument presumably has no effect — TODO confirm.
      as.integer(gsub("-", "", testVlookup_df$date_first_followup)) > as.integer(testVlookup_df$Activity.Date.x, "%Y/%m/%d")
    ) == TRUE,
    as.Date(testVlookup_df$date_first_followup, format = "%m/%d/%Y"),
    ifelse(
      is.null(testVlookup_df$date_first_followup) == TRUE,
      as.Date(testVlookup_df$Activity.Date.x, format = "%m/%d/%Y"),
      NA
    )
  )
# Inspect the class of the result to see whether it is still a Date.
class(testVlookup_df$date_first_followup)

又一次尝试

  #Attempts for if command  using elses. This seems to work but does not loop through the entire vector.                                            
    # testVlookup_df$date_first_followup <- if(is.null(testVlookup_df$date_first_followup)){ as.Date(testVlookup_df$Activity.Date.x, format = "%m/%d/%Y")
    # } else {if(as.integer(testVlookup_df$date_first_followup) > as.integer(testVlookup_df$Activity.Date.x)) {as.Date(testVlookup_df$Activity.Date.x, format = "%m/%d/%Y")
    # } else { NA }
    # }
    # Got this message 'Warning message:In if (as.integer(testVlookup_df$date_first_followup) > as.integer(testVlookup_df$Activity.Date.x)) 
    # { : the condition has length > 1 and only the first element will be used' Looked it up. Tells me if and else can only be used for 1 argument will not loop. Need sapply or ifelse

【问题讨论】:

  • 请添加一个可重现的示例以及预期的输出。

标签: r if-statement replace na


【解决方案1】:

在提出问题时,您应该提供数据样本,尤其是因为从您的问题中很难判断问题是数据格式问题还是任务执行问题。如果数据格式没有问题并且 R 将变量识别为日期(您可以通过将列传递给 str() 函数来检查),那么这应该可以工作。

# Make example data: two Date columns, each holding 12 days drawn at random
# from 1999-01-01 through 2000-01-01.
day_pool <- seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day")
sampleData <- data.frame(
  date_first_followup = sample(day_pool, 12),
  Activity.Date.x = sample(day_pool, 12)
)

# Blank out 5 randomly chosen follow-up dates to mimic the mostly-empty column.
naNums <- sample.int(12, 5)
sampleData[naNums, 1] <- NA

# Replace values
# pmin() is vectorised and keeps the Date class, so no row loop is needed.
# With na.rm = TRUE a row that has only one of the two dates keeps that date
# (an NA follow-up is filled from the activity date, and an existing follow-up
# is not lost when the activity date is NA); a row with both dates gets the
# earlier one; a row with neither stays NA.
sampleData$date_first_followup <- pmin(
  sampleData$date_first_followup,
  sampleData$Activity.Date.x,
  na.rm = TRUE
)

【讨论】:

    【解决方案2】:
    I finished this project a few months ago, so here is what ended up working.
    
    # STORE DATES AS DATES
    # Parse the activity and follow-up date columns from month/day/year text into
    # Date objects; the class() calls print the resulting class as a check.
    activities_uniq_df$Activity.Date <- as.Date(activities_uniq_df$Activity.Date, format = "%m/%d/%Y")
    class(activities_uniq_df$Activity.Date)
    
    main_df2$date_first_followup <- as.Date(main_df2$date_first_followup, format = "%m/%d/%Y")
    class(main_df2$date_first_followup)
    
    
    # Create a subset of the activities_uniq dataframe that only contains
    # "Follow Up" activities.
    # %in% is used instead of == because == returns NA for rows whose Item is NA,
    # and logical subsetting with NA inserts all-NA rows into the result.
    FollowUpSub <- activities_uniq_df[activities_uniq_df$Item %in% "Follow Up", ]
    
    ### PULL ACTIVITY DATES from Activities dataset into Main dataset based on Case ID#.
    
    # FOLLOW UPS
    
    
    # 1st pull all of the desired fields in.
    # all.x = TRUE keeps every row of main_df2 even when it has no matching
    # follow-up activity; those rows get NA in the merged columns.
    main_df2 <- merge(main_df2, FollowUpSub[, c("Item", "Activity.Date", "Case.ID")],
                      by.x = "Matter.Case.ID.", by.y = "Case.ID", all.x = TRUE)
    
    # Print the position of the merged activity-date column.
    which(colnames(main_df2) == "Activity.Date")
    
    # 2nd re-apply as.Date() to the activity and follow-up dates after the merge
    # (the original author observed the columns reverting to factor at this point).
    # The converted activity date is stored in a new column, Activity.Date.x.
    main_df2$Activity.Date.x <- as.Date(main_df2$Activity.Date, format = "%m/%d/%Y")
    class(main_df2$Activity.Date.x)
    
    main_df2$date_first_followup <- as.Date(main_df2$date_first_followup, format = "%m/%d/%Y")
    class(main_df2$date_first_followup)
    
    #class(main_df2$date_first_followup) <- "Date"
    
    
    # Finally do the actual replacing process.
    # Both columns must already be Date objects; otherwise the comparison below
    # would not be chronological.
    stopifnot(
      inherits(main_df2$date_first_followup, "Date"),
      inherits(main_df2$Activity.Date.x, "Date")
    )
    # Take the earlier of the two dates per row. With na.rm = TRUE a row that has
    # only one of the two dates keeps that date, so an NA follow-up is filled from
    # the activity date and an existing follow-up is not overwritten with NA when
    # there is no activity date. Rows with neither date stay NA.
    main_df2$date_first_followup <- pmin(
      main_df2$date_first_followup,
      main_df2$Activity.Date.x,
      na.rm = TRUE
    )
    
    # pmin() keeps the Date class, so the column does not need re-classing here.
    # On the question of whether date format affects ordering: Date objects are
    # compared by their underlying day count, not by how they are printed, so the
    # display format has no effect once both columns are Dates.
    
    # Confirm successful replacements
    # NOTE(review): rows and columns are selected by hard-coded position, and the
    # two calls use different first column indices (26 vs 22) — verify these
    # against the actual column layout of main_df2.
    main_df2[c(35:39, 7001), c(26, 104)]
    names(main_df2[c(22, 104)])
    

    【讨论】:

      猜你喜欢
      • 2016-12-30
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2023-03-31
      • 1970-01-01
      • 2018-04-11
      • 1970-01-01
      相关资源
      最近更新 更多