【问题标题】:How to optimize batch forecasting(如何优化批量预测)
【发布时间】:2020-06-21 10:38:29
【问题描述】:

我看到了 Joseph Owen 用于批量预测的代码(见原帖中的链接 here)。我有一个接近 19k 行的数据集,但问题是,即使采用了这种批量预测方法,我的代码仍然运行得非常慢。

在进行实际预测之前,我需要以 MAPE 作为评估标准选出最佳模型。下面是可运行的代码片段。我的问题是:如何优化以下代码,使其在可接受的时间内(不到 2 分钟)运行完毕?

 # Fit every admissible ETS specification to one series and return the best
 # (lowest) training-set MAPE among them.
 #
 # Ts: a univariate time series, passed unchanged to ets().
 # Returns: a single unnamed numeric, the smallest MAPE over the fitted models.
 #
 # NOTE(review): MAPE divides by the observations, so it can be infinite or
 # undefined when the series contains zeros - confirm this suits your data.
 fcnChooseETS <- function(Ts){

   # Check if all values of the time series are positive or not
   TsPositive <- min(as.numeric(Ts)) > 0

   # Multiplicative models cannot take non-positive data, so only the purely
   # additive specifications are tried in that case.
   ModelsUsed <- if (TsPositive) {
     c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
   } else {
     c("ANN","ANA","AAN","AAA")
   }

   # accuracy() returns a one-row matrix of error measures. The previous
   # positional index [2] picked the RMSE column; select MAPE by name so the
   # criterion matches the stated intent and does not depend on column order.
   # vapply() guarantees a numeric vector with one value per model.
   vecResult <- vapply(ModelsUsed, function(M){
     fit <- ets(Ts, damped = NULL, model = M)
     accuracy(fit)[1, "MAPE"]
   }, numeric(1))

   min(vecResult)
 }

    # Score every row of `dt` with fcnChooseETS().
    #
    # dt: tabular data; each row is treated as one series once transposed.
    #     (presumably a data.table, with transpose() from that package - confirm)
    # Relies on the variable `FeedDate` from the calling environment as the end
    # date of every series.
    # Returns (invisibly) a list with one fcnChooseETS() result per series.
    fcnTrending <- function( dt){
      # One monthly (frequency = 12) series per transposed column.
      seriesList <- lapply(transpose(dt), function(values){
        ts(values, frequency = 12, end = FeedDate)
      })
      invisible(lapply(seriesList, fcnChooseETS))
    }

【问题讨论】:

  • 您使用的是什么操作系统?根据您的操作系统和机器,把 lapply() 换成对应的并行版本可能会带来提升。我以前没有用并行处理跑过模型,但这是我能想到的唯一其他选择。就像 @Rui 说的,您的瓶颈几乎肯定是 ets()。

标签: r data.table forecasting


【解决方案1】:

以下脚本测试了拟合问题中那些模型的 3 种不同方法。第一种是问题中所发代码的更惯用的写法,后两种则并行拟合多个模型。

此脚本保存在文件so_62497397.R 中并运行如下。

#
# filename: so_62497397.R
# Test serial and two types of parallel execution of
# exponential smoothing time series fitting.

library(parallel)
library(foreach)
library(doParallel)
library(forecast)

# Serial model selection: fit every admissible ETS specification to one series
# and return the one with the lowest training-set MAPE.
#
# Ts: a univariate time series, passed unchanged to ets().
# Returns: a named numeric of length 1 - the name is the winning model code,
#          the value is its MAPE.
#
# NOTE(review): MAPE divides by the observations, so it can be infinite or
# undefined when the series contains zeros - confirm this suits your data.
fcnChooseETS <- function(Ts){

  # Check if all values of the time series are positive or not
  TsPositive <- min(as.numeric(Ts)) > 0

  # Multiplicative models cannot take non-positive data, so only the purely
  # additive specifications are tried in that case.
  ModelsUsed <- if (TsPositive) {
    c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
  } else {
    c("ANN","ANA","AAN","AAA")
  }

  # accuracy() returns a one-row matrix of error measures. The previous
  # positional index [2] picked the RMSE column; select MAPE by name so the
  # criterion matches the stated intent and does not depend on column order.
  # vapply() guarantees a numeric vector with one value per model.
  vecResult <- vapply(ModelsUsed, function(M){
    fit <- ets(Ts, damped = NULL, model = M)
    accuracy(fit)[1, "MAPE"]
  }, numeric(1))

  names(vecResult) <- ModelsUsed
  vecResult[which.min(vecResult)]
}
# Parallel model selection with forked workers (mclapply): fit every
# admissible ETS specification to one series and return the one with the
# lowest training-set MAPE.
#
# Ts:    a univariate time series, passed unchanged to ets().
# Ncpus: value handed to mclapply() as mc.cores.
# Returns: a named numeric of length 1 - the name is the winning model code,
#          the value is its MAPE.
# Raises: an error naming the models whose fit failed inside a worker.
#
# NOTE(review): MAPE divides by the observations, so it can be infinite or
# undefined when the series contains zeros - confirm this suits your data.
fcnChooseETS2 <- function(Ts, Ncpus = 2){

  # Check if all values of the time series are positive or not
  TsPositive <- min(as.numeric(Ts)) > 0

  # Multiplicative models cannot take non-positive data, so only the purely
  # additive specifications are tried in that case.
  ModelsUsed <- if (TsPositive) {
    c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
  } else {
    c("ANN","ANA","AAN","AAA")
  }

  # accuracy() returns a one-row matrix of error measures. The previous
  # positional index [2] picked the RMSE column; select MAPE by name.
  lResult <- mclapply(ModelsUsed, function(M){
    fit <- ets(Ts, damped = NULL, model = M)
    accuracy(fit)[1, "MAPE"]
  }, mc.cores = Ncpus)

  # mclapply() reports a failed job as a "try-error" element plus a warning.
  # Left alone, unlist() would mix that message into the scores, so fail
  # loudly instead, as the serial version does.
  failed <- vapply(lResult, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop("ets() failed for model(s): ",
         paste(ModelsUsed[failed], collapse = ", "), call. = FALSE)
  }

  vecResult <- unlist(lResult)
  names(vecResult) <- ModelsUsed
  vecResult[which.min(vecResult)]
}

# Parallel model selection on a socket cluster (parLapply): fit every
# admissible ETS specification to one series and return the one with the
# lowest training-set MAPE.
#
# Ts:    a univariate time series, passed unchanged to ets().
# Ncpus: number of workers requested from makeCluster().
# Returns: a named numeric of length 1 - the name is the winning model code,
#          the value is its MAPE.
#
# NOTE(review): MAPE divides by the observations, so it can be infinite or
# undefined when the series contains zeros - confirm this suits your data.
fcnChooseETS3 <- function(Ts, Ncpus = 2){

  # Check if all values of the time series are positive or not
  TsPositive <- min(as.numeric(Ts)) > 0

  # Multiplicative models cannot take non-positive data, so only the purely
  # additive specifications are tried in that case.
  ModelsUsed <- if (TsPositive) {
    c("ANN","MNN","ANA","AAN","AAA","MAA","MNM","MMN","MMM","MNA","MAN","MAM")
  } else {
    c("ANN","ANA","AAN","AAA")
  }

  cl <- makeCluster(Ncpus)
  # Release the workers even when a fit throws; previously stopCluster() was
  # skipped on error and the worker processes were left running.
  on.exit(stopCluster(cl), add = TRUE)

  # R is case-sensitive: the old call exported the function `ts`, not the
  # series. Export `Ts` from this call frame (the default envir is the global
  # environment, where `Ts` does not live).
  clusterExport(cl, "Ts", envir = environment())
  clusterEvalQ(cl, library(forecast))

  # accuracy() returns a one-row matrix of error measures. The previous
  # positional index [2] picked the RMSE column; select MAPE by name.
  lResult <- parLapply(cl, ModelsUsed, function(M){
    fit <- ets(Ts, damped = NULL, model = M)
    accuracy(fit)[1, "MAPE"]
  })

  vecResult <- unlist(lResult)
  names(vecResult) <- ModelsUsed
  vecResult[which.min(vecResult)]
}

# Build a long monthly test series by repeating the USAccDeaths data set.
#
# N: requested minimum number of observations.
# Returns: a ts with frequency 12 holding USAccDeaths repeated 2^m times,
#          where m is the smallest non-negative integer such that the result
#          has at least N observations. The start year is chosen so that the
#          series ends in the year 2000.
makeTestdata <- function(N){
  base <- as.numeric(USAccDeaths)
  n <- length(base)

  # Number of doublings needed to reach N. Clamp at zero: for N < n/2 the
  # logarithm is below -1, and the resulting negative count made seq_len()
  # fail in the loop-based version.
  m <- max(0, ceiling(log2(N / n)))

  # Doubling m times is the same as repeating the base series 2^m times.
  x <- rep(base, times = 2^m)

  # Whole years covered minus one, so that start + L is the year 2000.
  L <- length(x) / 12 - 1
  ts(x, start = 2000 - L, frequency = 12)
}


# Benchmark driver: time the serial and the two parallel selectors on the
# same synthetic series, then show the timings and the chosen models.

numCores <- detectCores()
cat("numCores:", numCores, "\n")

# Synthetic monthly series with at least 5000 observations.
x <- makeTestdata(5e3)

# Each result is assigned inside system.time() so that only the fit is timed.
t1 <- system.time({ res1 <- fcnChooseETS(x) })
t2 <- system.time({ res2 <- fcnChooseETS2(x, Ncpus = numCores) })
t3 <- system.time({ res3 <- fcnChooseETS3(x, Ncpus = numCores) })

# One row of timings per method.
rbind(
  t.lapply = t1,
  t.mclapply = t2,
  t.parLapply = t3
)

# The three selectors are expected to agree on the winning model.
c(res1, res2, res3)

使用 Rscript 运行,运行环境如下:

  • 一台老化的 PC,处理器 Intel® Core™ i3 CPU 540 @ 3.07GHz × 4 核,
  • R 版本 4.0.2 (2020-06-22)
  • Ubuntu 20.04。

时间显示mclapply 是最好的选择,虽然并不比parLapply 快多少。在拟合的模型中,使用 MAPE 选择的模型都应该是一样的。

rui@rui:~$ Rscript --vanilla so_62497397.R
#Loading required package: iterators
#Registered S3 method overwritten by 'quantmod':
#  method            from
#  as.zoo.data.frame zoo 
#numCores: 4 
#            user.self sys.self elapsed user.child sys.child
#t.lapply       56.505    0.063  57.389      0.000      0.00
#t.mclapply      0.039    0.024  33.983     30.506      0.26
#t.parLapply     0.040    0.012  36.317      0.001      0.00
#     ANA      ANA      ANA 
#263.0876 263.0876 263.0876 

【讨论】:

  • 您好,感谢您的回复,我使用的是相同的方法,使用 lapply 而不是单独的语句,但即使这样,执行时间也非常长。
  • @RohitSaluja 时间肯定都花在 ets 上了。如果您要拟合这么多模型,那就没有别的办法,只能等待。
  • @RuiBarradas,只是提供一个选择:您可以把对模型列表的 lapply() 换成 sapply(..., simplify = FALSE),这样就可以省去重命名这一步。
猜你喜欢
  • 1970-01-01
  • 2021-01-07
  • 2021-09-05
  • 1970-01-01
  • 2021-10-21
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2016-12-20
相关资源
最近更新 更多