【问题标题】:Tidymodels: Decision Tree Learning in R - Error: No variables or terms were selectedTidymodels:R 中的决策树学习 - 错误:未选择变量或术语
【发布时间】:2021-02-28 08:16:34
【问题描述】:

概述:

我有一个名为“FID”的数据框,我正在尝试按照本教程(见下文)生成三个模型:(1) 袋装树; (2) 随机森林; (3) 增强树。

教程:

https://bcullen.rbind.io/post/2020-06-02-tidymodels-decision-tree-learning-in-r/

问题

当我尝试运行模型时,"fit_bag"fit_rf"fit_boost",我遇到了以下错误消息。我认为问题可能出在预处理阶段。

有人能就这个问题提供建议吗?

非常感谢。

错误 - 未选择任何变量

i Fold01: recipe
x Fold01: recipe: Error: No variables or terms were selected.
i Fold02: recipe
x Fold02: recipe: Error: No variables or terms were selected.
i Fold03: recipe
x Fold03: recipe: Error: No variables or terms were selected.
i Fold04: recipe
x Fold04: recipe: Error: No variables or terms were selected.
i Fold05: recipe
x Fold05: recipe: Error: No variables or terms were selected.
i Fold06: recipe
x Fold06: recipe: Error: No variables or terms were selected.
i Fold07: recipe
x Fold07: recipe: Error: No variables or terms were selected.
i Fold08: recipe
x Fold08: recipe: Error: No variables or terms were selected.
i Fold09: recipe
x Fold09: recipe: Error: No variables or terms were selected.
i Fold10: recipe
x Fold10: recipe: Error: No variables or terms were selected.
Warning message:
All models failed in [fit_resamples()]. See the `.notes` column. 

R 代码

    ##Open library packages
    library(tidymodels) 
    library(tidyverse) # manipulating data
    library(skimr) # data visualization
    library(baguette) # bagged trees
    library(future) # parallel processing & decrease computation time
    library(xgboost) # boosted trees
    
    # split the data
    split <- initial_split(Tidmodel_df)
    
    # extract the training data
    train <- training(split)
    
    # resample the data with 10-fold cross-validation (10-fold by default)
    cv <- vfold_cv(train)
    
##Preprocessing

rec <- recipe(Frequency~., data=train) %>% 
       update_role(contains("id"), Year, Month, Monsoon, Days, new_role = "id vars") %>% # declare ID variables
       step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
       step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels 
       step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars"))  %>% # replaces missing numeric observations with the median
       step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
    
    ###########################################################
    ##Create Models
    ###########################################################

    #####Bagged Trees
    mod_bag <- bag_tree() %>%
                set_mode("regression") %>%
                 set_engine("rpart", times = 10) # 10 bootstrap resamples
    
    ##Create workflow
    wflow_bag <- workflow() %>% 
                       add_recipe(rec) %>%
                           add_model(mod_bag)
    
    ##Fit the model
    plan(multisession)
    
    fit_bag <- fit_resamples(
                       wflow_bag,
                       cv,
                       metrics = metric_set(rmse, rsq),
                       control = control_resamples(verbose = TRUE,
                       save_pred = TRUE,
                       extract = function(x) extract_model(x)))
    
    ##Random forests
    
    mod_rf <-rand_forest() %>%
                          set_engine("ranger",
                          num.threads = parallel::detectCores(), 
                          importance = "permutation", 
                          verbose = TRUE) %>% 
                          set_mode("regression") %>% 
                          set_args(trees = 1000)
    
    ##Create Workflow
    
    wflow_rf <- workflow() %>% 
                   add_model(mod_rf) %>% 
                         add_recipe(rec)
    
    ##Fit the model
    
    plan(multisession)
    
    fit_rf <- fit_resamples(
                         wflow_rf,
                         cv,
                         metrics = metric_set(rmse, rsq),
                         control = control_resamples(verbose = TRUE,
                         save_pred = TRUE,
                         extract = function(x) x)
                         )
    
    ##Boosted Trees
    
    mod_boost <- boost_tree() %>% 
                  set_engine("xgboost", nthreads = parallel::detectCores()) %>% 
                  set_mode("regression")
    
    ##Create workflow
    
    wflow_boost <- workflow() %>% 
                      add_recipe(rec) %>% 
                      add_model(mod_boost)
    
    ##Fit model
    
    plan(multisession)
    
    fit_boost <- fit_resamples(
                           wflow_boost, 
                           cv,
                           metrics = metric_set(rmse, rsq),
                           control = control_resamples(verbose = TRUE,
                           save_pred = TRUE)
                           )

数据框 - FID

structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015, 
2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016, 
2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017, 
2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March", 
"April", "May", "June", "July", "August", "September", "October", 
"November", "December"), class = "factor"), Monsoon = structure(c(2L, 
2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 4L, 
4L, 4L, 4L, 4L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 
3L, 3L, 2L), .Label = c("First_Inter_Monssoon", "North_Monsoon", 
"Second_Inter_Monsoon", "South_Monsson"), class = "factor"), 
    Frequency = c(36, 28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 
    33, 33, 29, 31, 23, 8, 9, 7, 40, 41, 41, 30, 30, 44, 37, 
    41, 42, 20, 0, 7, 27, 35, 27, 43, 38), Days = c(31, 
    28, 31, 30, 6, 0, 0, 29, 15, 29, 29, 31, 31, 29, 30, 30, 
    7, 0, 7, 30, 30, 31, 30, 27, 31, 28, 30, 30, 21, 0, 7, 26, 
    29, 27, 29, 29)), row.names = c(NA, -36L), class = "data.frame")

【问题讨论】:

    标签: r machine-learning regression prediction tidymodels


    【解决方案1】:

    这里的问题是,当您使用update_role(contains("id"), Year, Month, Monsoon, Days, new_role = "id vars") 时,您将所有变量(例如YearMonthMonsoon 等)的角色更新为"id vars",然后它们就不再是预测变量了.当配方进入下一个预处理步骤时,它发现根本没有任何预测变量。

    如果您想将这些变量用作预测变量,请保持其角色不变,不要将其更改为 "id vars" 之类的其他内容:

    library(tidymodels) 
    library(baguette) # bagged trees
    
    fid_df <- structure(list(Year = c(2015, 2015, 2015, 2015, 2015, 2015, 2015, 
                                      2015, 2015, 2015, 2015, 2015, 2016, 2016, 2016, 2016, 2016, 2016, 
                                      2016, 2016, 2016, 2016, 2016, 2016, 2017, 2017, 2017, 2017, 2017, 
                                      2017, 2017, 2017, 2017, 2017, 2017, 2017), Month = structure(c(1L, 
                                                                                                     2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 
                                                                                                     5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 
                                                                                                     8L, 9L, 10L, 11L, 12L), .Label = c("January", "February", "March", 
                                                                                                                                        "April", "May", "June", "July", "August", "September", "October", 
                                                                                                                                        "November", "December"), class = "factor"), Monsoon = structure(c(2L, 
                                                                                                                                                                                                          2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 4L, 
                                                                                                                                                                                                          4L, 4L, 4L, 4L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 
                                                                                                                                                                                                          3L, 3L, 2L), .Label = c("First_Inter_Monssoon", "North_Monsoon", 
                                                                                                                                                                                                                                  "Second_Inter_Monsoon", "South_Monsson"), class = "factor"), 
                             Frequency = c(36, 28, 39, 46, 5, 0, 0, 22, 10, 15, 8, 
                                           33, 33, 29, 31, 23, 8, 9, 7, 40, 41, 41, 30, 30, 44, 37, 
                                           41, 42, 20, 0, 7, 27, 35, 27, 43, 38), Days = c(31, 
                                                                                           28, 31, 30, 6, 0, 0, 29, 15, 29, 29, 31, 31, 29, 30, 30, 
                                                                                           7, 0, 7, 30, 30, 31, 30, 27, 31, 28, 30, 30, 21, 0, 7, 26, 
                                                                                           29, 27, 29, 29)), row.names = c(NA, -36L), class = "data.frame")
    
    # split the data
    fid_split <- initial_split(fid_df)
    
    # extract the training data
    fid_train <- training(fid_split)
    
    # resample the data with 10-fold cross-validation (10-fold by default)
    cv <- vfold_cv(fid_train)
    
    ##Preprocessing
    
    rec <- recipe(Frequency ~ ., data = fid_df) %>% 
      step_nzv(all_predictors(), freq_cut = 0, unique_cut = 0) %>% # remove variables with zero variances
      step_novel(all_nominal()) %>% # prepares test data to handle previously unseen factor levels 
      step_medianimpute(all_numeric(), -all_outcomes(), -has_role("id vars"))  %>% # replaces missing numeric observations with the median
      step_dummy(all_nominal(), -has_role("id vars")) # dummy codes categorical variables
    
    
    rf_spec <- rand_forest(trees = 1e3) %>%
      set_engine("ranger", importance = "permutation") %>% 
      set_mode("regression")
    
    wflow_rf <- workflow() %>% 
      add_model(rf_spec) %>% 
      add_recipe(rec)
    
    fit_resamples(
      wflow_rf,
      cv,
      metrics = metric_set(rmse, rsq),
      control = control_resamples(save_pred = TRUE)
    )
    #> 
    #> Attaching package: 'rlang'
    #> The following objects are masked from 'package:purrr':
    #> 
    #>     %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
    #>     flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
    #>     splice
    #> 
    #> Attaching package: 'vctrs'
    #> The following object is masked from 'package:tibble':
    #> 
    #>     data_frame
    #> The following object is masked from 'package:dplyr':
    #> 
    #>     data_frame
    #> # Resampling results
    #> # 10-fold cross-validation 
    #> # A tibble: 10 x 5
    #>    splits         id     .metrics         .notes           .predictions    
    #>    <list>         <chr>  <list>           <list>           <list>          
    #>  1 <split [24/3]> Fold01 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
    #>  2 <split [24/3]> Fold02 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
    #>  3 <split [24/3]> Fold03 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
    #>  4 <split [24/3]> Fold04 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
    #>  5 <split [24/3]> Fold05 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
    #>  6 <split [24/3]> Fold06 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
    #>  7 <split [24/3]> Fold07 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [3 × 4]>
    #>  8 <split [25/2]> Fold08 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [2 × 4]>
    #>  9 <split [25/2]> Fold09 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [2 × 4]>
    #> 10 <split [25/2]> Fold10 <tibble [2 × 4]> <tibble [0 × 1]> <tibble [2 × 4]>
    

    reprex package (v0.3.0.9001) 于 2020 年 11 月 18 日创建

    【讨论】:

    • 亲爱的朱莉娅。非常感谢您回答我的问题,我真的很感激!我一直在努力添加一个通用线性模型,以便与决策树进行模型比较,以找到最合适的模型。能否请教一下我这样做是否正确? glm 模型可以在下面的这个问题中找到!我对调整模型感到困惑。如果您认为我没有越权,请提前感谢您。
    • 这里涉及到 glm 模型的问题:stackoverflow.com/questions/64892060/…
    • 我将根据您对这个特定问题的更正来编辑 glm 问题。如果您能引导我走上正确的道路,请提前非常感谢。我不是高级 R 编码器,所以我正在尽力而为。非常感谢提前
    猜你喜欢
    • 1970-01-01
    • 2017-05-03
    • 2011-05-06
    • 2013-04-21
    • 2011-06-23
    • 2011-01-29
    • 2013-01-07
    • 2016-12-29
    • 1970-01-01
    相关资源
    最近更新 更多