用新数据 R-Python 填充缺失值答案

【问题标题】：Fill missing values with new data R-Python（用新数据填充缺失值 R-Python）
【发布时间】：2014-09-23 08:51:09
【问题描述】：

我有两个数据集 x 和 y

> x
   a index b
1  1     1 5
2 NA     2 6
3  2     3 7
4 NA     4 9
> y
  index   a
1     2 100
2     4 101
>

我想用 y 中包含的值填充 x 的缺失值。

我尝试使用合并功能，但结果不是我想要的。

> merge(x,y, by = 'index', all=T)
  index a.x b a.y
1     1   1 5  NA
2     2  NA 6 100
3     3   2 7  NA
4     4  NA 9 101

在实际问题中还有其他限制：(1) y 并不能填充 x 中的所有缺失值；(2) x 和 y 有更多的共同变量（不仅仅是 a 和 index）。

编辑：更现实的例子

> x
   a index  b  c
1  1     1  5 NA
2 NA     2  6 NA
3  2     3 NA  5
4 NA     4  9 NA
5 NA     5 10  6
> y
  index   a  c
1     2 100  4
2     4 101 NA
>

使用 Python 或 R 的解决方案均可接受。

【问题讨论】：

可以做类似x$a[is.na(x$a)] <- y[na.omit(match(x$index, y$index)), "a"]的事情
x$a[is.na(x$a)] <- y$a[y$index %in% x$index] 适用于您的示例。但是您的“其他限制”并不完全清楚。请提供一个足够复杂的最小可重现示例 - 不多也不少。
@Henrik，如果x中的索引顺序不同（例如x$index <- c(1,4,3,2)）会给出错误的结果
我已经包含了一个更接近原始问题的示例
@DavidArenburg，好点子！感谢您指出这一点。

标签： r merge dataframe

【解决方案1】：

我使用了您的merge 想法并使用dplyr 做了以下操作。我相信会有更好的方法来完成这项任务。

# Example data from the question: `ana` is the table with gaps (NA) and
# `bob` holds the candidate replacement values, keyed by `index`.
# The columns are built inline so that no scratch globals are left behind
# and base::c() is not shadowed by a data vector named `c`.
ana <- data.frame(
  index = 1:5,
  a = c(1, NA, 2, NA, NA),
  b = c(5, 6, NA, 9, 10),
  c = c(NA, NA, 5, NA, 6),
  stringsAsFactors = FALSE
)

bob <- data.frame(
  index = c(2, 4),
  a = c(100, 101),
  c = c(4, NA),
  stringsAsFactors = FALSE
)

> ana
  index  a  b  c
1     1  1  5 NA
2     2 NA  6 NA
3     3  2 NA  5
4     4 NA  9 NA
5     5 NA 10  6

> bob
  index   a  c
1     2 100  4
2     4 101 NA

# Full outer join on `index`, then fill the NAs in ana's columns (suffix .x)
# with the matching values from bob (suffix .y).
# The helper columns a.y and c.y are left in the result.
merge(ana, bob, by = "index", all = TRUE) %>%
  mutate(
    a.x = ifelse(a.x %in% NA, a.y, a.x),
    c.x = ifelse(c.x %in% NA, c.y, c.x)
  )

  index a.x  b c.x a.y c.y
1     1   1  5  NA  NA  NA
2     2 100  6   4 100   4
3     3   2 NA   5  NA  NA
4     4 101  9  NA 101  NA
5     5  NA 10   6  NA  NA

我使用 mutate，用 a.y (bob$a) 填补了 a.x (ana$a) 中的缺失值，并对 c.x (ana$c) 做了同样的处理。如果最后再去掉 a.y 和 c.y，我想这就是你所期望的结果。

【讨论】：

【解决方案2】：

试试：

# Work with the first two columns of x only (selected by position).
xa <- x[, 1:2]
# Outer merge of y and xa on all column names they share.
m1 <- merge(y, xa, all = TRUE)
# Keep only the first row encountered for each index value.
m1 <- m1[!duplicated(m1[["index"]]), ]
# Copy b and c over from x, aligned by index. If y also carries a `c`
# column, the value merged in from y is overwritten here.
m1[["b"]] <- x[["b"]][match(m1[["index"]], x[["index"]])]
m1[["c"]] <- x[["c"]][match(m1[["index"]], x[["index"]])]

m1
  index   a  b  c
1     1   1  5 NA
2     2 100  6 NA
4     3   2 NA  5
5     4 101  9 NA
7     5  NA 10  6

或者，如果有许多其他列，例如 b 和 c：

# Work with the first two columns of x only (selected by position).
xa <- x[, 1:2]
# Outer merge of y and xa on all column names they share.
m1 <- merge(y, xa, all = TRUE)
# Keep only the first row encountered for each index value.
m1 <- m1[!duplicated(m1[["index"]]), ]
# Copy the remaining columns of x (positions 3 and 4) over one at a time,
# aligned by index.
for (col in names(x)[3:4]) {
  m1[, col] <- x[, col][match(m1[["index"]], x[["index"]])]
}
m1
  index   a  b  c
1     1   1  5 NA
2     2 100  6 NA
4     3   2 NA  5
5     4 101  9 NA
7     5  NA 10  6

【讨论】：

这可能解决了a 和c 的问题，但完全搞砸了b 列
感谢您的指出。我已经完全修改了我的解决方案。

【解决方案3】：

如果要替换多列，可以尝试将数据从 wide（宽）格式转换成 long（长）格式（如下面前两种方法所示），然后一步完成替换。

 # Outer-join x and y on index; columns present in both tables come back
 # with .x / .y suffixes (see the merge output earlier in this page).
 m1 <- merge(x,y, by="index", all=TRUE)
 # Stack the suffixed (dotted) columns into long form, splitting each
 # column name at the "." separator.
 m1L <- reshape(m1, idvar="index", varying=grep("\\.", colnames(m1)), direction="long", sep=".")
 # Replace the row names with a plain 1..n sequence.
 row.names(m1L) <- 1:nrow(m1L)
 # One data frame per value of the `time` column of the long table
 # (presumably the x / y suffix that reshape() split off — verify).
 lst1 <- split(m1L, m1L$time)
 # Columns 4:5 are addressed by position; for the question's data these are
 # presumably the stacked value columns a and c — re-check if inputs change.
 indx <- is.na(lst1[[1]][,4:5])
 # Wherever the first piece is NA, take the value at the same position
 # from the second piece.
 lst1[[1]][,4:5][indx] <- lst1[[2]][,4:5][indx]
 # Reorder columns by position (the printed result shows a, index, b, c).
 res <- lst1[[1]][,c(4,1,2,5)]
 res
 #    a index  b  c
 #1   1     1  5 NA
 #2 100     2  6  4
 #3   2     3 NA  5
 #4 101     4  9 NA
 #5  NA     5 10  6

或者你可以使用dplyr 和tidyr

 library(dplyr)
 library(tidyr)

  # Join y onto x by index, gather every suffixed (dotted) column into
  # key/value pairs (Var, Val), then split the key into two parts
  # (Var1, Var2); the code below treats Var2 as the x / y suffix.
  z <- left_join(x, y, by="index")  %>% 
            gather(Var, Val, matches("\\.")) %>%
             separate(Var, c("Var1", "Var2"))
  # Rows that come from x and are still missing a value.
  indx1 <- which(is.na(z$Val) & z$Var2=="x")
  # Take the replacement from the row half a table further down.
  # NOTE(review): this offset assumes the x rows fill the first half of z and
  # the y rows the second half, in the same order — confirm for other inputs.
  z$Val[indx1] <- z$Val[indx1+nrow(z)/2]
  # Back to wide form, keeping only the (now filled) x rows and dropping
  # the suffix column.
  z %>%
    spread(Var1, Val) %>%
    filter(Var2=="x") %>% 
    select(-Var2)
  #  index  b   a  c
  #1     1  5   1 NA
  #2     2  6 100  4
  #3     3 NA   2  5
  #4     4  9 101 NA
  #5     5 10  NA  6

或者，按 `.` 之前的名称前缀对带后缀的列名进行 split 分组，再用 sapply 对每组替换 NA。

 # Names of the suffixed columns in m1 (those containing a dot).
 indx <- grep("\\.", colnames(m1), value = TRUE)
 # Keep the unsuffixed columns and append one filled column per name stem
 # (the stem is the text before the first dot).
 res <- cbind(
   m1[!names(m1) %in% indx],
   sapply(
     split(indx, gsub("\\..*", "", indx)),
     function(cols) {
       pair <- m1[cols]
       gaps <- is.na(pair[, 1])
       # Where the first column of the pair is NA, use the second column.
       pair[, 1][gaps] <- pair[, 2][gaps]
       pair[, 1]
     }
   )
 )
  res
  #  index  b   a  c
  #1     1  5   1 NA
  #2     2  6 100  4
  #3     3 NA   2  5
  #4     4  9 101 NA
  #5     5 10  NA  6

【讨论】：

这些方法是否都应该返回相同的结果？