【问题标题】:Error adding interactions to custom VGAM::vglm model in caret在插入符号中向自定义 VGAM::vglm 模型添加交互时出错
【发布时间】:2016-06-29 21:48:08
【问题描述】:

我使用来自VGAMvglm() 使用caret 构建了一个自定义模型。它适用于简单的效果,但是当我尝试添加交互时,它会失败并显示 object 'x1:x2' not found 错误消息,其中 x1x2 是我作为交互输入模型的预测变量。问题与预测有关,除非我弄错了,否则这似乎是因为 predict.trainpredictvglm 尝试使用 x1:x2 来预测类别。

我在下面提供了一个工作示例。

# Set up data
set.seed(123)
n <- 100
x1 <- rnorm(n, 175, 7)
x2 <- rnorm(n, 30, 8)
cont <- 0.5 * x1 - 0.3 * x2 + 10 + rnorm(n, 0, 6)
y  <- cut(cont, breaks = quantile(cont), include.lowest = TRUE,
             labels = c("A", "B", "C", "D"), ordered = TRUE)
d <- data.frame(x1, x2, y)

# My custom caret function
vglmTrain <- list(
  label = "VGAM prop odds",
  library = "VGAM",
  loop = NULL,
  type = "Classification",
  parameters = data.frame(parameter = "parameter",
                          class = "character",
                          label = "parameter"),
  grid = function(x, y,
                  len = NULL, search = "grid") data.frame(parameter = "none"),
  fit = function(x, y, wts, param, lev, last, classProbs, ...) {
    dat <- if(is.data.frame(x)) x else as.data.frame(x)
    dat$.outcome <- y
    if(!is.null(wts))
    {
      out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
    } else {
      out <- vglm(.outcome ~ ., propodds, data = dat, ...)
    }
    out
  },
  predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    probs <- predict(modelFit, data.frame(newdata), type = "response")

    predClass <- function (x) {
      n <- colnames(x)
      factor(as.vector(apply(x, 1, which.max)),
             levels = 1:length(n),
             labels = n)
    }
    predClass(probs)
  },
  prob = function(modelFit, newdata, preProc = NULL, submodels = NULL)
    predict(modelFit, data.frame(newdata), type = "response"),
  predictors = function(x, ...) names(attributes(terms(x))$dataClasses[-1]),
  levels = function(x) x@misc$ynames,
  sort = function(x) x)

现在,如果我尝试使用该函数,如果我为公式提供交互,它会以错误退出。

# Load caret
library(caret)

ctrl <- trainControl(method = "cv", number = 2, verboseIter = T)

# A model with no interactions - works
f1 <- train(y ~ x1 + x2, data = d,
           method = vglmTrain,
           trControl = ctrl)

# A model with interactions - fails
f2 <- train(y ~ x1*x2, data = d,
            method = vglmTrain,
            trControl = ctrl)

Error in train.default(x, y, weights = w, ...) : Stopping
In addition: Warning messages:
1: In eval(expr, envir, enclos) :
  predictions failed for Fold1: parameter=none Error in eval(expr, envir, enclos) : object 'x1:x2' not found

2: In eval(expr, envir, enclos) :
  predictions failed for Fold2: parameter=none Error in eval(expr, envir, enclos) : object 'x1:x2' not found

3: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,  :
  There were missing values in resampled performance measures.

这是我的 sessionInfo():

> sessionInfo()
R version 3.2.4 (2016-03-10)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows >= 8 x64 (build 9200)

locale:
[1] LC_COLLATE=English_United States.1252  LC_CTYPE=English_United States.1252    LC_MONETARY=English_United States.1252 LC_NUMERIC=C                          
[5] LC_TIME=English_United States.1252    

attached base packages:
[1] splines   stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] VGAM_1.0-0      caret_6.0-64    ggplot2_2.1.0   lattice_0.20-33

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.3        magrittr_1.5       MASS_7.3-45        munsell_0.4.3      colorspace_1.2-6   foreach_1.4.3      minqa_1.2.4        stringr_1.0.0      car_2.1-1         
[10] plyr_1.8.3         tools_3.2.4        nnet_7.3-12        pbkrtest_0.4-6     parallel_3.2.4     grid_3.2.4         gtable_0.2.0       nlme_3.1-125       mgcv_1.8-12       
[19] quantreg_5.21      e1071_1.6-7        class_7.3-14       MatrixModels_0.4-1 iterators_1.0.8    lme4_1.1-11        Matrix_1.2-3       nloptr_1.0.4       reshape2_1.4.1    
[28] codetools_0.2-14   stringi_1.0-1      compiler_3.2.4     scales_0.4.0       SparseM_1.7       

有人知道如何解决这个问题吗?

【问题讨论】:

    标签: r classification r-caret vgam


    【解决方案1】:

    Caret 确实处理交互。但是,我找到了一种解决方法。您可以先调用 model.matrix 来创建具有交互作用的矩阵。您还需要删除拦截。

    以您的 f2 为例,我们将数据指定为 x 和 y,而不是公式。 x 包含带有交互的 model.matrix 规范,-1 删除截距。这已转换为 data.frame,您就可以开始使用了。

    f2 <- train(y = y, x = data.frame(model.matrix(y ~ x1*x2 - 1, data = d)),
                method = vglmTrain,
                trControl = ctrl)
    

    编辑:

    在调试了 train.default 并检查了您的模型类型规范和其他一些规范之后,我发现在插入符号模型中而不是在您的模型中进行了检查。检查与 predict 和 probs 函数有关。这些都对 Dataframe 进行了检查。如果您将此检查添加到这两个函数中,它会按预期工作。

    if (!is.data.frame(newdata)) 
      newdata <- as.data.frame(newdata)
    

    整个函数将是:

    vglmTrain <- list(
      label = "VGAM prop odds",
      library = "VGAM",
      loop = NULL,
      type = "Classification",
      parameters = data.frame(parameter = "parameter",
                              class = "character",
                              label = "parameter"),
      grid = function(x, y,
                      len = NULL, search = "grid") data.frame(parameter = "none"),
      fit = function(x, y, wts, param, lev, last, classProbs, ...) {
        dat <- if(is.data.frame(x)) x else as.data.frame(x)
        dat$.outcome <- y
        if(!is.null(wts))
        {
          out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
        } else {
          out <- vglm(.outcome ~ ., propodds, data = dat, ...)
        }
        out
      },
      predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    
        if (!is.data.frame(newdata)) 
          newdata <- as.data.frame(newdata)
        probs <- predict(modelFit, newdata, type = "response")
    
        predClass <- function (x) {
          n <- colnames(x)
          factor(as.vector(apply(x, 1, which.max)),
                 levels = 1:length(n),
                 labels = n)
        }
        predClass(probs)
      },
      prob = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
        if (!is.data.frame(newdata)) 
          newdata <- as.data.frame(newdata)
    
        predict(modelFit, newdata, type = "response")
      },
    
      levels = function(x) x@misc$ynames,
    
      tags = c("Cumulative Link", "Logistic Regression", "Accepts Case Weights",
               "Probit", "Logit"),
    
      sort = function(x) x)
    

    【讨论】:

    • 谢谢!那解决了它。但是,我之前已经能够为许多插入符号函数提供公式,例如 multinom 和其他自定义公式。而且我认为插入符号已经使用了 model.matrix,但是通过它的formula.train 接口。所以我认为这里还有其他事情发生,并不是很重要。
    • @JohanLarsson,我在调试了一下之后更新了问题。
    • 谢谢。这确实解决了它。我认为问题的部分原因在于我使用了data.frame() 而不是as.data.frame,因为后者实际上不会像前者那样混淆变量名。
    【解决方案2】:

    Phiver 的解决方案在这个例子中效果很好,但是当我添加虚拟编码变量时,模型又失败了。

    我做了更多的挖掘,问题似乎确实发生了,因为data.frame 更改了要预测的数据集中列的名称。在我的代码中对predict 的两次调用中,我现在添加了data.frame(newdata, check.names = F),这似乎可以解决问题。

    现在可以同时使用公式界面

    f2 <- train(y ~ x1 * x2, data = d,
                method = vglmTrain,
                trControl = ctrl)
    

    模型矩阵法

    f2 <- train(y = y, x = data.frame(model.matrix(y ~ x1*x2 - 1, data = d)),
                method = vglmTrain,
                trControl = ctrl)
    

    这是新代码:

    vglmTrain <- list(
      label = "VGAM prop odds",
      library = "VGAM",
      loop = NULL,
      type = "Classification",
      parameters = data.frame(parameter = "parameter",
                              class = "character",
                              label = "parameter"),
      grid = function(x, y,
                      len = NULL, search = "grid") data.frame(parameter = "none"),
      fit = function(x, y, wts, param, lev, last, classProbs, ...) {
        dat <- if(is.data.frame(x)) x else as.data.frame(x)
        dat$.outcome <- y
        if(!is.null(wts))
        {
          out <- vglm(.outcome ~ ., propodds, data = dat, weights = wts, ...)
        } else {
          out <- vglm(.outcome ~ ., propodds, data = dat, ...)
        }
        out
      },
      predict = function(modelFit, newdata, preProc = NULL, submodels = NULL) {
        probs <- predict(modelFit, data.frame(newdata, check.names = F), type = "response")
    
        predClass <- function (x) {
          n <- colnames(x)
          factor(as.vector(apply(x, 1, which.max)),
                 levels = 1:length(n),
                 labels = n)
        }
        predClass(probs)
      },
      prob = function(modelFit, newdata, preProc = NULL, submodels = NULL)
       predict(modelFit, data.frame(newdata, check.names = F), type = "response"),
      levels = function(x) x@misc$ynames,
      tags = c("Cumulative Link", "Logistic Regression", "Accepts Case Weights",
               "Probit", "Logit"),
      sort = function(x) x)
    

    【讨论】:

      猜你喜欢
      • 2017-08-18
      • 1970-01-01
      • 2015-03-05
      • 2017-06-15
      • 2018-03-30
      • 2016-05-28
      • 2018-08-30
      • 2013-06-18
      • 2020-10-06
      相关资源
      最近更新 更多