基于自定义性能指标 R 的不平衡数据集的有效阈值优化答案

【问题标题】：Efficient threshold optimisation for imbalanced dataset based on custom performance metric R基于自定义性能指标 R 的不平衡数据集的有效阈值优化
【发布时间】：2020-09-04 13:58:37
【问题描述】：

我正在尝试找到一种有效的方法来根据 R 中的自定义性能指标为预测模型的概率分数设置分类阈值。值得注意的是，真实数据是不平衡的，并且在训练集中有超过 3500 万行.因此，这给出了大约 3500 万个预测分数，可以将其设置为两个类别的阈值分割。到目前为止，我已经尝试了两种方法

1. A 'smart', single thread approach trying to do minimal work 

2. A brute-force, parallel multi-threaded approach.

方法 1 的性能要好得多，见下文，但仍然太慢在真实数据上（运行 25 多个小时后我放弃了）。我的问题是，是否有人有更好的方法或知道对此有用的软件包？我查看了stackoverflow，找不到类似的东西。我认为我的第一种方法的一些并行版本将是最好的选择，但由于它依赖于最后一次迭代的结果，我认为这并不容易。

小数据的基准测试结果（1000 行，运行 100 次 & 50,000 行，运行 5 次）：

Unit: milliseconds
                              expr       min        lq      mean    median        uq       max neval
       minimal_single_thread(1000)  338.5525  366.5356  387.0256  384.0934  396.6146  714.5271 100
 brut_force_multi_thread(1000, 20) 6121.4523 6206.6340 6279.6554 6253.2492 6324.4614 6593.9065 100
   
  
Unit: seconds
                               expr       min        lq      mean    median        uq max   neval
       minimal_single_thread(50000)  20.45089  21.31735  21.41669  21.56343  21.78985 21.96191  5
 brut_force_multi_thread(50000, 20) 797.55525 797.60568 799.15903 797.73044 798.24058 804.66320 5

代码：首先是两种功能化方法

#1. A 'smart', single thread approach trying to do minimal work 

minimal_single_thread<-function(n){
  #create random predictions and observations i.e. the actuals
  set.seed(10001)
  comp <- data.table("pred"=runif(n),
                     "obs"=sample(0:1,n,replace=T))
  #put in order of increasing prediction score
  setorder(comp,pred)
  #create table to hold performance metrics
  optimum_threshold <- data.table("pred"=comp$pred)
  #Get the number of predictions at each unique predicition score 
  #necessary as two cases could have same score
  optimum_threshold <- optimum_threshold[, .(count = .N), by = pred]
  setorder(optimum_threshold,pred)
  #Add necessary columns
  optimum_threshold[,f_measure:=0.0]
  optimum_threshold[,TPR:=0.0]
  optimum_threshold[,f_measure_unadj:=0.0]
  optimum_threshold[,mcc:=0.0]
  #Get totals for correcting the values for adjusted f-measure metric 
  num_negatives <- nrow(comp[obs==0,])
  num_positives <- nrow(comp[obs==1,])
  # Loop through all possible values of the cut-off(threshold) and store the confusion matrix scores
  obs<-comp$obs
  #need to compute logical every time for fp as you pred all 1 at first and then change to 0
  comparison_fp_pred <- rep(1,length(obs))
  comparison_fp <- (comparison_fp_pred & !obs)
  #do need to for fn
  comparison_fn_pred <- !rep(1,length(obs))
  comparison_fn <- (comparison_fn_pred & obs)
  act_pos<-sum(obs)
  act_neg<-num_negatives
  #keep count of last position for updating comparison
  lst<-0L
  row_ind <- 1L
  for(pred_score_i in optimum_threshold$pred){
    #find out how many cases at the predicted score
    changed <- optimum_threshold[row_ind,count]
    #Update the cases that have changed to the opposite to what they were before 
    #i.e. the predicition was 1 before and now is 0 so if pred was false before now true and vice versa all rest stays the same
    comparison_fp_pred[(lst+1):(lst+changed)] <- !comparison_fp_pred[(lst+1):(lst+changed)] 
    comparison_fp[(lst+1):(lst+changed)] <- (comparison_fp_pred[(lst+1):(lst+changed)]& obs[(lst+1):(lst+changed)])
    #need to calc logic for fn
    comparison_fn_pred[(lst+1):(lst+changed)] <- !comparison_fn_pred[(lst+1):(lst+changed)] 
    comparison_fn[(lst+1):(lst+changed)] <- (comparison_fn_pred[(lst+1):(lst+changed)]& obs[(lst+1):(lst+changed)])
    FP <- as.double(sum(comparison_fp))
    FN <- as.double(sum(comparison_fn))
    TN <- act_neg - FP
    TP <- act_pos - FN
    if(is.na(TN)) TN <- 0
    if(is.na(TP)) TP <- 0
    if(is.na(FN)) FN <- 0
    if(is.na(FP)) FP <- 0
    TPR <- TP/(TP+FN)
    Precision <- TP/(TP+FP)  
    f1_unadj<-(2/((1/Precision)+(1/TPR)))
    #mcc
    MCC <- (TP*TN - FP*FN)/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
    #for cases where precision or recall is 0 need to put 0 as total value to avoid math error
    if(is.na(MCC)) MCC <- 0
    TP_cor <- TP + num_positives*TPR
    TN_cor <- TN - num_positives*(1-TPR)
    FP_cor <- FP - num_positives*TPR
    FN_cor <- FN + num_positives*(1-TPR)
    TPR_cor <- TP_cor/(TP_cor+FN_cor)
    Precision_cor <- TP_cor/(TP_cor+FP_cor)  
    f1<-(2/((1/Precision_cor)+(1/TPR_cor)))
    #for cases where precision or recall is 0 need to put 0 as total value to avoid math error
    if(is.na(f1)) f1 <- 0
    set(optimum_threshold,i=row_ind,j="TPR",value=TPR)
    set(optimum_threshold,i=row_ind,j="f_measure_unadj",value=f1_unadj)
    set(optimum_threshold,i=row_ind,j="mcc",value=MCC)
    set(optimum_threshold,i=row_ind,j="f_measure",value=f1)
    #update references
    lst <- lst+changed
    row_ind <- row_ind+1L
  }
  # Threshold is the max adjusted f-measure
  setorder(optimum_threshold,-f_measure)
  threshold <- as.numeric(optimum_threshold[1,pred])
  return(list("threshold"=threshold))
}

#2. A brute-force, parallel multi-threaded approach. 

brut_force_multi_thread <-function(n,num_threads){
  #create random predictions and observations i.e. the actuals
  set.seed(10001)
  optimum_threshold <- data.table("pred"=runif(n),
                                  "obs"=sample(0:1,n,replace=T))
  #put in order of increasing prediction score - performance metrics will be held here
  setorder(optimum_threshold,pred)
  

  #Get totals for correcting the values for adjusted f-measure metric 
  act_neg <- nrow(optimum_threshold[obs==0,])
  act_pos <- nrow(optimum_threshold[obs==1,])
  num_cases <- as.integer(act_pos+act_neg)
  print(paste("Number of threads used",num_threads))
  cl <- makeCluster(num_threads)
  registerDoParallel(cl)
  cl_return <- foreach(row_ind = 1L:nrow(optimum_threshold),
                       .packages = c("data.table")) %dopar% {
                         FP <- nrow(optimum_threshold[(row_ind+1L):num_cases,][obs==0,])
                         FN <- sum(optimum_threshold[1L:row_ind,obs])
                         TN <- act_neg - FP
                         TP <- act_pos - FN
                         if(is.na(TN)) TN <- 0
                         if(is.na(TP)) TP <- 0
                         if(is.na(FN)) FN <- 0
                         if(is.na(FP)) FP <- 0
                         TPR <- TP/(TP+FN)
                         Precision <- TP/(TP+FP)  
                         f1_unadj<-(2/((1/Precision)+(1/TPR)))
                         #mcc
                         MCC <- (TP*TN - FP*FN)/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))
                         #for cases where precision or recall is 0 need to put 0 as total value to avoid math error
                         if(is.na(MCC)) MCC <- 0
                         TP_cor <- TP + act_pos*TPR
                         TN_cor <- TN - act_pos*(1-TPR)
                         FP_cor <- FP - act_pos*TPR
                         FN_cor <- FN + act_pos*(1-TPR)
                         TPR_cor <- TP_cor/(TP_cor+FN_cor)
                         Precision_cor <- TP_cor/(TP_cor+FP_cor)  
                         f1<-(2/((1/Precision_cor)+(1/TPR_cor)))
                         #for cases where precision or recall is 0 need to put 0 as total value to avoid math error
                         if(is.na(f1)) f1 <- 0
                         loop_dt <- data.table("pred"=optimum_threshold[row_ind,pred],"f_measure"=f1,
                                               "TPR"=TPR,"f_measure_unadj"=f1_unadj,"mcc"=MCC)
                         return(loop_dt)
                       }
  #stop cluster
  stopCluster(cl)
  #Combine all - Get unique values
  optimum_threshold<-unique(rbindlist(cl_return))
  # Threshold is the max adjusted f-measure
  setorder(optimum_threshold,-f_measure)
  threshold <- as.numeric(optimum_threshold[1,pred])
  return(list("threshold"=threshold))
}

接下来进行比较，以确保从两种方法中获得相同的结果：

library(data.table)
library(parallel)
library(doParallel)
library(foreach)
minimal_single_thread_return <- minimal_single_thread(100)
brut_force_multi_thread_return <- brut_force_multi_thread(100,5)
print(brut_force_multi_thread_return)
$threshold
[1] 0.008086668

print(minimal_single_thread_return)
$threshold
[1] 0.008086668

最后对 1000 行数据集进行基准测试，运行 100 次和 50000 行 5 次：

library(microbenchmark)
res <- microbenchmark(minimal_single_thread(1000),
                      brut_force_multi_thread(1000,20),
                      times=100L)
print(res)

res <- microbenchmark(minimal_single_thread(50000),
                      brut_force_multi_thread(50000,20),
                      times=5L)
print(res)

【问题讨论】：

术语：多处理和多线程之间存在形式上的区别。您的并行化代码使用多个 R 进程，而不是多个线程，因此它是多处理的。多线程并行化只能在本机代码中完成 - 而不是使用 R 代码。在 R 级别完成的所有并行化都是多处理的。因此，如有疑问，请使用该术语。
请注意，data.table 在内部使用多线程并行化，因此当涉及到本地计算机上的并行化时，这些调用已经非常优化。
通过快速查看您的 for 循环，您可能可以通过矢量化计算来改进部分（甚至全部）。开始我的分析你的代码，以确定它花费大部分时间的地方； profvis::profvis() 对此非常有用。只有在优化了顺序代码之后，才应该考虑并行化。
@HenrikB 感谢有关多处理的指针和建议，我会调查一下！

标签： r performance parallel-processing prediction coding-efficiency

【解决方案1】：

因此，根据查看ROCR 包的建议，我找到了一个足够快的解决方案。我通过将预测和观察结果传递给prediciton() 来做到这一点，从中我得到了每个阈值选择的混淆表值（TP、FP、FN、TN）。从那里我刚刚计算了数据表中的所有性能指标。与之前在小型和大型数据集（1000 行，运行 100 次和 50,000 行，运行 5 次）上的最佳基准测试结果相比，结果有了很大改进：

Unit: milliseconds
expr                        min        lq         mean      median     uq        max     neval
minimal_single_thread(1000) 334.515352 340.666631 353.93399 353.564355 362.62567 413.33399 100
ROCR_approach(1000)         9.377623   9.662029   10.38566  9.924076   10.37494  27.81753  100


Unit: milliseconds
expr                         min         lq          mean        median      uq          max         neval
minimal_single_thread(50000) 20375.35368 20470.45671 20594.56010 20534.32357 20696.55079 20896.11574     5
ROCR_approach(50000)         53.12959    53.60932    62.02762    53.74342    66.47123       83.18456     5

ROCR函数：

ROCR_approach <-function(n){
  #create random predictions and observations i.e. the actuals
  set.seed(10001)
  optimum_threshold <- data.table("pred"=runif(n),
                                  "obs"=sample(0:1,n,replace=T))
  #put in order of increasing prediction score - performance metrics will be held here
  setorder(optimum_threshold,-pred)
  #Get totals for correcting the values for adjusted f-measure metric 
  act_neg <- nrow(optimum_threshold[obs==0,])
  act_pos <- nrow(optimum_threshold[obs==1,])
  num_cases <- as.integer(act_pos+act_neg)
  pred <- prediction(optimum_threshold$pred, optimum_threshold$obs)
  optimum_threshold[,TP:=unlist(..pred@tp)[-length(unlist(..pred@tp))]]#[-1]]
  optimum_threshold[,FP:=unlist(..pred@fp)[-length(unlist(..pred@tp))]]#[-1]]
  optimum_threshold[,TN:=unlist(..pred@tn)[-length(unlist(..pred@tp))]]#[-1]]
  optimum_threshold[,FN:=unlist(..pred@fn)[-length(unlist(..pred@tp))]]#[-1]]
  rm(pred)
  optimum_threshold[,TPR:=TP/(TP+FN)]
  optimum_threshold[,f_measure_unadj:=(2/((1/(TP/(TP+FP)))+(1/TPR)))]
  optimum_threshold[,mcc:= (TP*TN - FP*FN)/sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN))]
  optimum_threshold[,f_measure:=(2/((1/((TP + ..act_pos*TPR)/((TP + ..act_pos*TPR)+(FP - ..act_pos*TPR))))+
                            (1/((TP + ..act_pos*TPR)/((TP + ..act_pos*TPR)+(FN + ..act_pos*(1-TPR)))))))]
  setorder(optimum_threshold,pred)
  #set all to null
  optimum_threshold[,obs:=NULL] 
  optimum_threshold[,TP:=NULL] 
  optimum_threshold[,FP:=NULL] 
  optimum_threshold[,TN:=NULL] 
  optimum_threshold[,FN:=NULL]
  #set any na's to 0
  for(col_i in seq_len(ncol(optimum_threshold)))  
    set(optimum_threshold,which(is.na(optimum_threshold[[col_i]])),col_i,0L)
  # Threshold is the max adjusted f-measure
  setorder(optimum_threshold,-f_measure)
  threshold <- as.numeric(optimum_threshold[1,pred])
  return(list("threshold"=threshold))
}

【讨论】：