【问题标题】:Tidymodels Workflow working with add_formula() or add_variables() but not with add_recipe()(Tidymodels 工作流使用 add_formula() 或 add_variables() 时正常,但使用 add_recipe() 时不起作用)
【发布时间】:2021-04-19 10:00:27
【问题描述】:

我在使用配方(recipe)和工作流(workflow)、通过 naiveBayes 分类器区分垃圾短信与正常短信时,遇到了一些奇怪的行为。我试图使用 tidymodels 和工作流来复现《Machine Learning with R》第 4 章的结果:https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/blob/master/Chapter%2004/MLwR_v2_04.r

虽然我能够使用 add_variables()、add_formula() 或者完全不使用工作流来重现分析,但使用 add_recipe() 函数的工作流不起作用。

library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(SnowballC) 
library(discrim) 


# Fetch the SMS spam corpus as raw CSV text and parse it into a data frame.
sms_raw <- read_csv(
  getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
)
# Encode the label column as a factor.
sms_raw[["type"]] <- factor(sms_raw[["type"]])

# Seeded 80/20 train/test split, stratified on the label.
set.seed(123)
split <- initial_split(sms_raw, strata = "type", prop = 0.8)
nb_train_sms <- split %>% training()
nb_test_sms <- split %>% testing()

# Text preprocessing: lower-case the text, strip numbers and punctuation,
# tokenise, drop stop words, stem, filter tokens (min_times = 6,
# max_tokens = 1500), build binary term-frequency columns, and recode those
# columns as "Yes"/"No". The recipe is prepped on the training data right away.
reci_sms <- prep(
  recipe(type ~ ., data = nb_train_sms) %>%
    step_mutate(text = str_to_lower(text)) %>%
    step_mutate(text = removeNumbers(text)) %>%
    step_mutate(text = removePunctuation(text)) %>%
    step_tokenize(text) %>%
    step_stopwords(text, custom_stopword_source = stopwords()) %>%
    step_stem(text) %>%
    step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
    step_tf(text, weight_scheme = "binary") %>%
    step_mutate_at(
      contains("tf"),
      fn = function(x) ifelse(x == TRUE, "Yes", "No")
    )
)


# Materialise the preprocessed training and test sets from the prepped recipe.
# bake(new_data = NULL) returns the processed training data and replaces the
# superseded juice().
df_training <- bake(reci_sms, new_data = NULL)
df_testing <- bake(reci_sms, new_data = nb_test_sms)

# Naive Bayes model specification using the klaR engine.
nb_model <- set_engine(naive_Bayes(), "klaR")

下面是三个实际产生有效输出的代码示例

# --------- works but slow -----
# Workflow with a formula preprocessor, fitted on the already-baked training
# data and used to predict on the already-baked test data.
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_formula(type ~ .) %>%
  fit(df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)


# --------- works  -----
# Fit the model specification directly with a formula (no workflow) on the
# baked training data, then predict on the baked test data.
nb_fit <- fit(nb_model, type ~ ., data = df_training)
nb_tidy_pred <- predict(nb_fit, new_data = df_testing)


# --------- works  -----

# Workflow that selects the outcome and predictors by name instead of using a
# formula, fitted on the baked training data.
nb_fit <- fit(
  workflow() %>%
    add_model(nb_model) %>%
    add_variables(outcomes = type, predictors = everything()),
  df_training
)

nb_tidy_pred <- predict(nb_fit, new_data = df_testing)

而下面的代码则不起作用:

# Workflow that bundles the recipe with the model, but is fitted on
# df_training, i.e. the output of the recipe rather than the raw data.
# This is the call reported to fail with "Not all variables in the recipe are
# present in the supplied training set: 'text'".
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_recipe(reci_sms) %>%
  fit(data = df_training)

# Prediction step; it depends on the fit above having succeeded.
nb_tidy_pred <- nb_fit %>% predict(df_testing)

它也会抛出以下错误,但我不太明白使用rlang::last_error()时发生了什么

Not all variables in the recipe are present in the supplied training set: 'text'.
Run `rlang::last_error()` to see where the error occurred.

谁能告诉我我错过了什么?

【问题讨论】:

    标签: r tidymodels r-recipes


    【解决方案1】:

    当您在工作流中使用配方时,预处理步骤会与模型拟合结合在一起。因此在拟合该工作流时,您需要传入配方所期望的数据(nb_train_sms),而不是 parsnip 模型所期望的(已预处理的)数据。

    此外,不建议把已经 prep() 过的配方传给工作流(not recommended to pass a prepped recipe to a workflow),所以请注意:下面在用 add_recipe() 将配方添加到工作流之前,我们没有调用 prep()。

    library(RCurl)
    library(tidyverse)
    library(tidymodels)
    library(textrecipes)
    library(tm) 
    library(discrim)
    
    # Fetch the SMS spam corpus as raw CSV text and parse it into a data frame.
    sms_raw <- read_csv(
      getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
    )
    # Encode the label column as a factor.
    sms_raw[["type"]] <- factor(sms_raw[["type"]])
    
    # Seeded 80/20 train/test split, stratified on the label.
    set.seed(123)
    split <- initial_split(sms_raw, strata = "type", prop = 0.8)
    nb_train_sms <- split %>% training()
    nb_test_sms <- split %>% testing()
    
    # Text preprocessing: lower-case the text, strip numbers and punctuation,
    # tokenise, drop stop words, stem, filter tokens (min_times = 6,
    # max_tokens = 1500), build binary term-frequency columns, and recode those
    # columns as "Yes"/"No". No prep() here: the recipe is handed to the
    # workflow unprepped.
    reci_sms <-
      recipe(type ~ ., data = nb_train_sms) %>%
      step_mutate(text = str_to_lower(text)) %>%
      step_mutate(text = removeNumbers(text)) %>%
      step_mutate(text = removePunctuation(text)) %>%
      step_tokenize(text) %>%
      step_stopwords(text, custom_stopword_source = stopwords()) %>%
      step_stem(text) %>%
      step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
      step_tf(text, weight_scheme = "binary") %>%
      step_mutate_at(
        contains("tf"),
        fn = function(x) ifelse(x == TRUE, "Yes", "No")
      )
    
    # Naive Bayes model specification using the klaR engine.
    nb_model <- set_engine(naive_Bayes(), "klaR")
    
    # Fit the recipe and the model together as one workflow, on the raw
    # (unprocessed) training data that the recipe expects.
    nb_fit <- fit(
      workflow() %>%
        add_model(nb_model) %>%
        add_recipe(reci_sms),
      data = nb_train_sms
    )
    #> Warning: max_features was set to '1500', but only 1141 was available and
    #> selected.
    
    # Predict on the held-out test set. The raw data is passed in because the
    # workflow contains the recipe.
    nb_tidy_pred <- nb_fit %>% predict(nb_test_sms)
    

    reprex package (v1.0.0) 于 2021-04-19 创建

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2021-10-22
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2016-01-31
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多