您可以使用 K 最近邻插补。
这是 R 中的一个示例:
library(DMwR)
# Toy data: v1 has a single missing value (last element) to be imputed.
var1 <- c("a", "a", "a", "c", "e", NA)
var2 <- c("p1", "p1", "p1", "p2", "p3", "p1")
var3 <- c("o1", "o1", "o1", "o2", "o3", "o1")
# Build the columns as factors explicitly. Since R 4.0 data.frame() keeps
# strings as character by default, and knnImputation() only handles
# categorical columns when they are factors (character columns fall into
# its numeric scaling path and fail).
df <- data.frame(v1 = var1, v2 = var2, v3 = var3, stringsAsFactors = TRUE)
df
# Impute the missing v1 from the k = 5 nearest complete rows.
knnOutput <- DMwR::knnImputation(df, k = 5)
knnOutput
输出:
v1 v2 v3
1 a p1 o1
2 a p1 o1
3 a p1 o1
4 c p2 o2
5 e p3 o3
6 a p1 o1
更新:
KNN 不适用于大型数据集。对于大型数据集,有两种选择:多项逻辑回归(multinomial)插补和朴素贝叶斯插补。多项逻辑回归插补稍微简单一些,因为不需要把变量转换为虚拟变量;下面展示的朴素贝叶斯实现则要多做一些工作,因为它需要先把变量转换为虚拟变量。下面演示如何在 R 中拟合这两种模型:
# Build a 6M-row example: the 6-element pattern repeated one million times.
# Every sixth value of v1 is missing.
var1 <- rep(c("a", "a", "a", "c", "e", NA), times = 1e6)
var2 <- rep(c("p1", "p1", "p1", "p2", "p3", "p1"), times = 1e6)
var3 <- rep(c("o1", "o1", "o1", "o2", "o3", "o1"), times = 1e6)
df <- data.frame(v1 = var1, v2 = var2, v3 = var3)
####################################################################
## Multinomial imputation
library(nnet)
# Train on the rows where v1 is observed; the formula expands to the main
# effects of v2 and v3 plus their two-way interaction.
imputerModel <- multinom(
  v1 ~ (v2 + v3)^2,
  data = df[!is.na(df[["v1"]]), ]
)
# Predict a v1 value for every row where v1 is missing.
predictions <- predict(
  imputerModel,
  newdata = df[is.na(df[["v1"]]), ]
)
####################################################################
#### Naive Bayes
library(naivebayes)
library(fastDummies)
# Encode v2 and v3 as 0/1 indicator columns. The indicator columns are
# appended after the original v1, v2, v3 columns of df.
dummyVars <- fastDummies::dummy_cols(
  df,
  select_columns = c("v2", "v3"),
  ignore_na = TRUE
)
# Inspect the first rows of the widened data frame.
head(dummyVars)
dummy_cols 函数会把虚拟变量追加到原有数据框的右侧,因此接下来只使用第 4 到第 9 列(4:9)作为训练数据。
# v1 v2 v3 v2_p1 v2_p2 v2_p3 v3_o1 v3_o2 v3_o3
# 1 a p1 o1 1 0 0 1 0 0
# 2 a p1 o1 1 0 0 1 0 0
# 3 a p1 o1 1 0 0 1 0 0
# 4 c p2 o2 0 1 0 0 1 0
# 5 e p3 o3 0 0 1 0 0 1
# 6 <NA> p1 o1 1 0 0 1 0 0
# create training set
# Compute the complete-case subset once and reuse it for both the
# predictors and the labels; na.omit() on the full data frame is costly.
complete_rows <- na.omit(dummyVars)
# Columns 1-3 are the original v1, v2, v3; the dummy columns start at 4.
dummy_col_idx <- 4:ncol(dummyVars)
X_train <- complete_rows[, dummy_col_idx]
y_train <- complete_rows[, "v1"]
# Rows whose v1 is missing are the ones to impute.
X_to_impute <- dummyVars[is.na(dummyVars[["v1"]]), dummy_col_idx]
Naive_Bayes_Model <- multinomial_naive_bayes(
  x = as.matrix(X_train),
  y = y_train
)
# predict missing data
Naive_Bayes_preds <- predict(
  Naive_Bayes_Model,
  newdata = as.matrix(X_to_impute)
)
# Write each model's predictions into its own new column. Only rows where
# v1 is missing receive a value; all other rows of the new columns are NA.
df[["multinom_preds"]][is.na(df[["v1"]])] <- as.character(predictions)
df[["Naive_Bayes_preds"]][is.na(df[["v1"]])] <- as.character(Naive_Bayes_preds)
# Show the first 15 rows to compare the two imputations side by side.
head(df, n = 15)
# v1 v2 v3 multinom_preds Naive_Bayes_preds
# 1 a p1 o1 <NA> <NA>
# 2 a p1 o1 <NA> <NA>
# 3 a p1 o1 <NA> <NA>
# 4 c p2 o2 <NA> <NA>
# 5 e p3 o3 <NA> <NA>
# 6 <NA> p1 o1 a a
# 7 a p1 o1 <NA> <NA>
# 8 a p1 o1 <NA> <NA>
# 9 a p1 o1 <NA> <NA>
# 10 c p2 o2 <NA> <NA>
# 11 e p3 o3 <NA> <NA>
# 12 <NA> p1 o1 a a
# 13 a p1 o1 <NA> <NA>
# 14 a p1 o1 <NA> <NA>
# 15 a p1 o1 <NA> <NA>