【问题标题】:How to handle forecast data (melt and "unmelt") generated by modeltime prediction - lost variables(如何处理 modeltime 预测生成的预测数据(melt 与"unmelt")——丢失变量)
【发布时间】:2021-11-10 13:33:32
【问题描述】:

下面我使用 tidyverse modeltime 包创建了一些虚假的预测数据。我有 2016 年的月度数据,想为 2020 年制作一个测试 fc。如您所见,我加载的数据是宽格式的。为了在模型时间中使用,我将其转换为长数据。在建模阶段之后,我想为 2020 年的预测值创建一个数据框。为此,我需要以某种方式“解开”数据。在这个过程中,我不幸失去了很多变量。从我想要预测的 240 个变量中,我最终得到的结果只有 49 个。也许我是盲人,或者我不知道如何正确配置模型时间功能。我非常感谢一些帮助。提前致谢!

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))

## create some senseless data to produce forecasts on...
## 60 monthly dates (2016-01 .. 2020-12) and a repeating 240-value
## pattern used to fill every series.
dates <- ymd("2016-01-01")+ months(0:59)
fake_values <- 
  c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
    661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
    661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
    661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
    510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
    862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)

## Outer product: a 60 x 240 matrix in which each of the 240 columns
## (series) carries the same 60-value pattern, coerced to a data.frame
## (columns auto-named V1..V240).
replicate <- rep(1,60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)

## Append the date vector; bind_cols auto-names it "...241" (it is the
## 241st column), hence the rename. NOTE(review): this hard-codes the
## column count (240 series) — fragile if the data dimensions change.
df <- bind_cols(replicate, dates) %>%
  rename(c(dates = ...241))

## melt it down: wide (one column per series) -> long (dates, variable,
## value). reshape2::melt encodes the series id ("variable") as a
## factor and stacks the series one after another (all V1 rows, then
## all V2 rows, ...).
data <- reshape2::melt(df, id.var='dates')

## make some senseless forecast on senseless data...
## NOTE(review): initial_time_split() splits the *stacked* long frame
## by row position. Because melt() stacks series one after another,
## the first 80% of rows covers only the first ~192 series and the
## test set holds just the last 48 series — which appears to be why
## only 48 of the 240 variables survive into the forecast below.
split_obj <- initial_time_split(data, prop = 0.8)  

## One global prophet model: value is explained by the date alone
## (the series id "variable" is not a predictor).
model_fit_prophet <- prophet_reg() %>%
  set_engine(engine = "prophet") %>%
  fit(value ~ dates, data = training(split_obj))

## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)

## calibration: residuals/accuracy computed on the (positional) test rows
calibration_tbl_prophet <- models_tbl_prophet %>%
  modeltime_calibrate(new_data = testing(split_obj))

## forecast
## keep_data = TRUE carries the original columns (incl. "variable")
## through into the forecast output.
fc_prophet <- calibration_tbl_prophet %>%
  modeltime_forecast(
    new_data = testing(split_obj),
    actual_data = data,
    keep_data = TRUE
  ) 

## "unmelt" that bastard again
fc_prophet <- fc_prophet %>% filter(str_detect(.key,  "prediction"))
## NOTE(review): positional column selection (4, 9, 10) is fragile — it
## silently breaks if modeltime changes its output columns; prefer
## selecting .index / variable / value by name.
fc_prophet <- fc_prophet[,c(4,9,10)]
fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
#fc_prophet <- fc_prophet %>% subset(fc_prophet,  as.character(.index) >"2020-01-01" & as.character(.index)< "2020-12-01" )

## Long -> wide again: one column per surviving series (only the series
## present in the positional test rows appear here).
fc_wide_prophet <- fc_prophet %>% 
  pivot_wider(names_from = variable, values_from = value)

【问题讨论】:

  • 更准确地说,最后只剩下 48 个变量。我猜这与初始时间分割有关(80:20 的分割,240 个变量的 20% = 48),但我仍不确定具体原因

标签: r time-series tidyverse data-manipulation tidymodels


【解决方案1】:

这是我的完整解决方案。我还提供了我在这里所做的背景:https://github.com/business-science/modeltime/issues/133

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
library(timetk)

## create some senseless data to produce forecasts on...
## 60 monthly dates (2016-01 .. 2020-12) and a repeating 240-value
## pattern used to fill every series.
dates <- ymd("2016-01-01")+ months(0:59)
fake_values <- 
    c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
      510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
      862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
      661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
      510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
      862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
      661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
      510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
      862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
      661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
      510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
      862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)

## 60 x 240 matrix: every one of the 240 series holds the same pattern.
replicate <- rep(1,60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)

## bind_cols auto-names the appended date column "...241"; rename it.
df <- bind_cols(replicate, dates) %>%
    rename(c(dates = ...241))

## melt it down: wide -> long; "variable" becomes a factor V1..V240.
data <- reshape2::melt(df, id.var='dates')

## NOTE(review): right-assignment; `data <- as_tibble(data)` is the
## more conventional spelling.
data %>% as_tibble() -> data


## Sanity-check plot: the first 9 series (factor codes 1..9 == V1..V9),
## faceted 3 wide, no smoother.
data %>%
    filter(as.numeric(variable) %in% 1:9) %>%
    group_by(variable) %>%
    plot_time_series(dates, value, .facet_ncol = 3, .smooth = F)
    

## make some senseless forecast on senseless data...
## Positional 80/20 split over the stacked long frame (see the cv-plan
## plot below for which rows land in train vs test).
split_obj <- initial_time_split(data, prop = 0.8)  

split_obj %>%
    tk_time_series_cv_plan() %>%
    plot_time_series_cv_plan(dates, value)

## Date-based split instead: hold out the final year *across all
## series*, so every variable has rows in the test set.
split_obj_2 <- time_series_split(data, assess = "1 year", cumulative = TRUE)

split_obj_2 %>%
    tk_time_series_cv_plan() %>%
    plot_time_series_cv_plan(dates, value)

## Fit one global prophet model on the training rows; the date is the
## only predictor.
model_fit_prophet <- prophet_reg() %>%
    set_engine(engine = "prophet") %>%
    fit(value ~ dates, data = training(split_obj))

## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)

## calibration
## Calibrate against the *date-based* hold-out (split_obj_2) so all
## 240 series are represented in the test set.
calibration_tbl_prophet <- models_tbl_prophet %>%
    modeltime_calibrate(new_data = testing(split_obj_2))

## forecast
## keep_data = TRUE keeps the original columns ("variable" included)
## alongside the forecast columns.
fc_prophet <- calibration_tbl_prophet %>%
    modeltime_forecast(
        new_data = testing(split_obj_2),
        actual_data = data,
        keep_data = TRUE
    ) 

## Visual check: forecasts for the first 9 series.
fc_prophet %>%
    filter(as.numeric(variable) %in% 1:9) %>%
    group_by(variable) %>%
    plot_modeltime_forecast(.facet_ncol = 3)

## "unmelt" that bastard again
# fc_prophet <- fc_prophet %>% filter(str_detect(.key,  "prediction"))
# fc_prophet <- fc_prophet[,c(4,9,10)]
# fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
# #fc_prophet <- fc_prophet %>% subset(fc_prophet,  as.character(.index) >"2020-01-01" & as.character(.index)< "2020-12-01" )
# 
# fc_wide_prophet <- fc_prophet %>% 
#     pivot_wider(names_from = variable, values_from = value)


# Make a future forecast

## Refit the calibrated model on the full data set before forecasting
## beyond the observed range.
refit_tbl_prophet <- calibration_tbl_prophet %>%
    modeltime_refit(data = data)

## Forecast one year past the observed dates for every series:
## future_frame() extends the date index per "variable" group.
future_fc_prophet <- refit_tbl_prophet %>%
    modeltime_forecast(
        new_data = data %>% group_by(variable) %>% future_frame(.length_out = "1 year"),
        actual_data = data,
        keep_data = TRUE
    )

## Visual check: future forecasts for the first 9 series.
future_fc_prophet %>%
    filter(as.numeric(variable) %in% 1:9) %>%
    group_by(variable) %>%
    plot_modeltime_forecast(.facet_ncol = 3)

# Reformat as wide

## Keep predictions only, then pivot back to one column per series;
## id_cols pins the row identity so no variables are dropped.
future_wide_tbl <- future_fc_prophet %>%
    filter(.key == "prediction") %>%
    select(.model_id, .model_desc, dates, variable, .value) %>%
    pivot_wider(
        id_cols     = c(.model_id, .model_desc, dates),
        names_from  = variable, 
        values_from = .value
    )

## Reindex to the original wide layout (V1..V240, dates), dropping the
## model columns. NOTE(review): the result is printed, not assigned.
future_wide_tbl[names(df)]

【讨论】:

  • 非常感谢您的快速回复!我刚用真实数据实施了该解决方案。检查结果后,我拿回了所有的列,这很棒。尽管如此,每个变量我都得到了相同的预测结果(样本内外都一样)。在我上面的示例中,您无法观察到这种行为,因为我只是把同一个变量复制了 240 次来做预测。我可以向您发送带有随机数据的示例代码来说明上述问题。问候
  • 是的,您的预测效果很差,因为您使用的是 prophet 模型。可以尝试结合时间序列特征(time series signature)的 xgboost。用单个全局模型预测 1000 条时间序列时,它的速度非常快。我在我的 modeltime 课程中教授这个。 university.business-science.io/p/…
  • 好吧,我将尝试一些不同的模型,比如 xgboost。再次感谢
  • 亲爱的 Matt,感谢您提出从本地模型改为全局模型的建议(效果很好)。不过,我只在起始日期相同的时间序列上做了测试。如果我有 200 条起始时间各不相同的时间序列怎么办?如果逐条在本地预测,就不需要把它们按相同长度聚类,因为每条预测都可以从该序列的实际起始日期开始。我的猜测是,把不同长度的时间序列混进一个全局模型不会得到令人满意的结果,我需要先按长度做大量聚类。我的理解对吗?
猜你喜欢
  • 1970-01-01
  • 2019-02-14
  • 2021-02-16
  • 2014-03-18
  • 2017-11-21
  • 2016-11-12
  • 2018-11-04
相关资源
最近更新 更多