使用 R 中的 data.table 有效地重塑预测数据答案

【问题标题】：reshaping prediction data efficiently using data.table in R使用 R 中的 data.table 有效地重塑预测数据
【发布时间】：2019-02-05 07:21:04
【问题描述】：

我正在寻找一种更有效的方式来重塑 R 中的 data.table 数据。

目前我正在循环执行多个时间序列预测的重塑。

我得到了我所追求的正确答案，但是觉得该方法非常不雅/（un-data.table）。因此，我正在寻找 SO 社区，看看是否有更优雅的解决方案。

请参阅下面的数据设置以及获得所需答案的两次尝试。

# load libraries
require(data.table)
require(lubridate)


# set up data assumptions
id_vec <- letters
len_id_vec <- length(id_vec)
num_orig_dates <- 7
set.seed(123)


# create original data frame
orig <- data.table(ID=rep(id_vec,each=num_orig_dates),
                   date=rep(c(Sys.Date() %m+% months(0: (num_orig_dates-1))),times=len_id_vec),
                   most_recent_bal=unlist(lapply(round(runif(len_id_vec)*100),function(y){
                     y*cumprod(1+rnorm(num_orig_dates,0.001,0.002))})))


# add 24 months ahead predictions of balances using a random walk from the original dates
nrow_orig <- nrow(orig)

for(i in seq(24)){
  orig[,paste0('pred',i,'_bal'):=most_recent_bal*(1+rnorm(nrow_orig,0.001,0.003))]
  orig[,paste0('pred',i,'_date'):=date %m+% months(i)]
}


# First attempt
t0 <- Sys.time()
tmp1 <- rbindlist(lapply(unique(orig$ID),function(x){
  orig1 <- orig[ID==x,]

  bal_cols <- c('most_recent_bal',paste0('pred',seq(24),'_bal'))
  date_cols <- c('date',paste0('pred',seq(24),'_date'))

  # Go through each original date to realign predicted date and predicted balance
  date_vec <- orig1$date
  tmp <- rbindlist(lapply(date_vec,function(y){

    tmp <- data.table(dates=as.Date(as.vector(t(orig1[date==y,date_cols,with=FALSE]))),
                      bals=as.vector(t(orig1[date==y,bal_cols,with=FALSE])))
    tmp[,type:='prediction']
    tmp[,date_prediction_run:=y]

    # collect historical information too for plotting perposes.
    tmp1 <- orig1[date<=y,c('date','most_recent_bal'),with=FALSE]
    if(nrow(tmp1)!=0){

      setnames(tmp1,c('date','most_recent_bal'),c('dates','bals'))
      tmp1[,type:='history']
      tmp1[,date_prediction_run:=y]

      tmp <- rbind(tmp,tmp1)

    }

    tmp
  }))
  tmp[,ID:=x]
}))
t1 <- Sys.time()
t1-t0 #Time difference of 1.117216 secs

# Second Attempt: a slightly more data.table way which is faster but still very inelegant....
t2 <- Sys.time()
bal_cols <- c('most_recent_bal',paste0('pred',seq(24),'_bal'))
date_cols <- c('date',paste0('pred',seq(24),'_date'))
tmp1a <- rbindlist(lapply(unique(orig$ID),function(x){
  tmp <- cbind(melt(orig[ID==x,c('date',bal_cols),with=FALSE],
                    measure.vars = bal_cols,value.name='bals')[,-('variable'),with=FALSE],
               data.table(dates=melt(orig[ID==x,date_cols,with=FALSE],
                                     measure.vars = date_cols)[,value]))
  setnames(tmp,'date','date_prediction_run')
  tmp[,type:='prediction']

  tmp1 <- orig[ID==x,orig[ID==x & date<=.BY,c('date','most_recent_bal'),with=FALSE],by=date]
  setnames(tmp1,c('date_prediction_run','dates','bals'))
  tmp1[,type:='history']
  setcolorder(tmp1,colnames(tmp1)[match(colnames(tmp),colnames(tmp1))])
  tmp <- rbind(tmp,tmp1)
  tmp[,ID:=x]
  tmp
}))
t3 <- Sys.time()
t3-t2 # Time difference of 0.2309799 secs

【问题讨论】：

对我来说似乎是一个代码审查问题。
您可以极大地改进这个问题（并且可能会增加愿意尝试回答的 SO 用户的数量），说明您希望创建和组合的两组数据，例如，“1”我需要将 pred#_bal 和 pred#_date 列转换为长格式，将 ID 和日期作为关键字段 - 并在某个时候添加一个新列 type="prediction" "2) 从 orig [在此处的输出中列出列，并且用非常简短的术语来说，每列来自什么，而没有规定如何到达那里的所有步骤]

标签： r data.table

【解决方案1】：

我为此使用dplyr 和reshape2 进行了尝试，我觉得它稍微优雅一些（不是apply，而是technically for loops）。它还减少了大约 0.04 秒的运行时间。

t0 = Sys.time()

# Extract predicted values in long form
trial_bal = reshape2::melt(orig, id.vars = c("ID", "date"), measure.vars = 
c(colnames(orig)[grep("pred[0-9]{1,}_bal", colnames(orig))]))
colnames(trial_bal) = c("ID", "date_prediction_run", "type", "balance")
trial_bal$type = gsub("_bal", "", trial_bal$type)

trial_date = reshape2::melt(orig, id.vars = c("ID", "date"), measure.vars = 
c(colnames(orig)[grep("pred[0-9]{1,}_date", colnames(orig))]))
colnames(trial_date) = c("ID", "date_prediction_run", "type", "dates")
trial_date$type = gsub("_date", "", trial_date$type)

trial = merge.data.frame(trial_date, trial_bal, by = c("ID", "date_prediction_run", "type"))
trial$type = "prediction"
trial = trial %>% select(dates, balance, type, date_prediction_run, ID)

# Extract historical values in long form
temp = orig[, c("ID", "date", "most_recent_bal")]
temp = merge(temp[, c("ID", "date")], temp, by = "ID", allow.cartesian = TRUE)
temp = temp[temp$date.x >= temp$date.y, ]
temp$type = "history"
temp = temp %>% select(dates = date.y, balance = most_recent_bal, type, 
date_prediction_run = date.x, ID)

# Combine prediction and history
trial = rbind(trial, temp)
trial = trial %>% arrange(ID, date_prediction_run, desc(type), dates)

t1 = Sys.time()
t1 - t0 #Time difference of 0.1900001 secs

这比你的行数少 182 行，因为你有两次 dates = date_prediction_run - 一个在 type prediction 之下，一个在 history 之下。

【讨论】：

【解决方案2】：

根据 OP 的请求使用 data.table。

首先，仅展示如何逐步构建`data.table` 解决方案

即分解我们正在做的事情，并且仅仅为了这第一遍，是可读的。注：之后，在下面，（在不久之后的更新中）我将通过将所有内容放在一起来优化解决方案，例如通过组合步骤、链接、就地分配等。如您所料，如果不了解此处首先介绍的分步说明，则更优化的解决方案的可读性将大大降低，目的是向人们展示如何学习 data.table他们可能会找到解决方案。

# First Pass = Step-by-step (not optimized) just first work out a solution 

library(data.table)

# Transform prediction data from `orig` data.table into long format
# i.e. by melting pred#_bal and pred#_date columns
pred_data <- 
  data.table::melt( orig, 
                    measure = patterns("pred[0-9]+_bal", "pred[0-9]+_date"),  
                    value.name =     c("bals",           "date_prediction_run"))

pred_data[, type := "prediction"]  # add the 'type' column to pred_data (all are type="prediction")

# select desired columns in order
pred_data <- pred_data[, .( dates=date, bals, type, date_prediction_run, ID)] 


# Collect historical information from the most_recent_bal column, 
# which the OP wants for plotting purposes

graph_data <- 
  orig[ orig, 
        .(ID, dates=date, bals=most_recent_bal, date_prediction_run=x.date),
        on=.(ID, date>=date)]

graph_data[, type := "history"]  # these are all type="history" 

# final output, combining the prediction data and the graph data:
output <- rbindlist(list(pred_data, graph_data), use.names=TRUE)

更新 3 = 重要提示：下面的代码对提高速度没有任何作用！

下面是我的“通过组合一些步骤和链接进行优化的第一步”然而，即使下面我已经组合了一些步骤，使用了链接，它看起来又好又短，代码下面并不比上面的原始分步解决方案快，因为我将在帖子末尾显示基准时间。我将留下下面的代码，因为它说明了一个很好的观点并提供了一个学习机会。

首先通过结合一些步骤和链接进行优化[不是更快！]

library(data.table)

# Transform prediction data into long format
# by melting pred#_bal and pred#_date columns
pred_data <- 
  data.table::melt( orig[, type := "prediction"],  #add the type column to orig, before melting 
                    measure = patterns("pred[0-9]+_bal", "pred[0-9]+_date"),  
                    value.name =     c("bals",           "date_prediction_run")
                  )[, .( dates=date, bals, type, date_prediction_run, ID)] # chain, to select desired columns in order


# FINAL RESULT:  rbindlist pred_data to historic data
pred_data <- 
  rbindlist( list( pred_data, orig[ orig[, type := "history"],  
                                    .(dates=date, bals=most_recent_bal, type, date_prediction_run=x.date, ID),
                                    on=.(ID, date>=date)]
                 ), 
             use.names=TRUE)

继续更新 3：

使用非常方便的microbenchmark 包测试时间：

Unit: milliseconds
                expr         min          lq        mean      median          uq         max neval
 h.l.m_first_attempt 1140.017957 1190.818176 1249.499493 1248.977454 1299.497679 1427.632140   100
h.l.m_second_attempt  231.380930  239.513223  254.702865  249.735005  262.516276  375.762675   100
  krads_step.by.step    2.855509    2.985509    3.289648    3.059481    3.269429    6.568006   100
     krads_optimized    2.909343    3.073837    3.555803    3.150584    3.554100   12.521439   100

基准测试结果显示 data.table 解决方案是 OP 解决方案的巨大时序改进。太好了，这就是我们所要求的：我们已经展示了 data.table 的速度有多快，但我也希望它简单易读！但是，不要错过这里的另一课：

查看微基准测试结果，注意我的两个解决方案实际上是如何在相同的时间。起初这可能没有意义：为什么我的“逐步”解决方案与我尝试的“优化”解决方案一样快，有这么多代码行？

答案：如果您仔细观察，我的两个解决方案中都会出现所有相同的步骤。在我的“优化”解决方案中，是的，我们正在链接，您可能一开始会认为做的作业比“逐步”字面说明的要少。但是，正如基准测试结果应该告诉你的那样，我们没有做更少的任务！ IE。在我们使用[] 将另一个操作“链接”在一起的每个点上，它实际上等同于使用<- 分配回您的原始DT。

如果你能清楚地知道，你将走上更好的编程之路：你可以自信地跳过“链接”这一步，而是使用<- 来逐步说明（更具可读性，更容易调试和更易于维护）解决方案！

您可以节省时间的地方在于寻找不会在循环或应用操作中不必要地分配多次的地方。但我认为这是另一篇文章的主题！

注意如果您想在自己的代码中使用microbenchmark，我所做的就是：

library(microbenchmark)
mbm <- microbenchmark(
  h.l.m_first_attempt = {
    # Pasted in h.l.m's first solution, here
  },

  h.l.m_second_attempt = {
    # Pasted in h.l.m's second solution, here
  },

  krads_step.by.step = {
    # Pasted in my first solution, here
  },

  krads_optimized = {
    # Pasted in my second solution, here
  },
  times = 100L
)
mbm

如果您想要图表，请遵循：

library(ggplot2)
autoplot(mbm)

【讨论】：

看来，在上述解决方案中不需要设置键，因为1）设置键有一些成本（创建索引然后对其进行排序）和2）我们没有做足够多的重复子集，它会产生差异......而且3）因为on争论自动为on争论中的列创建索引。见：Secondary indices and auto indexing

根据 OP 的请求使用 data.table。

首先，仅展示如何逐步构建data.table 解决方案

更新 3 = 重要提示：下面的代码对提高速度没有任何作用！

继续更新 3：

首先，仅展示如何逐步构建`data.table` 解决方案