【问题标题】:R data.table: row-based conditions split/apply/combineR data.table:基于行的条件拆分/应用/组合
【发布时间】:2018-10-28 15:38:21
【问题描述】:

我有以下data.table

initial.date <- as.POSIXct('2018-10-27 10:00:00',tz='GMT')
last.date <- as.POSIXct('2018-12-28 17:00:00',tz='GMT') 
    PriorityDateTime=seq.POSIXt(from=initial.date,to = last.date,by = '30 sec')
    TradePrice=seq(from=1, to=length(PriorityDateTime),by = 1)
    ndf<- data.frame(PriorityDateTime,TradePrice)
    ndf$InstrumentSymbol <- rep_len(x = c('asset1','asset2'),length.out = length(ndf$PriorityDateTime))
    ndf$id <- seq(1:length(x = ndf$InstrumentSymbol))
    ndf$datetime <- ymd_hms(ndf$PriorityDateTime)
    res <- ndf %>% data.table()

看起来像这样:

    > res
         PriorityDateTime TradePrice InstrumentSymbol   id            datetime
   1: 2018-10-27 10:00:00          1           asset1    1 2018-10-27 10:00:00
   2: 2018-10-27 10:00:30          2           asset2    2 2018-10-27 10:00:30
   3: 2018-10-27 10:01:00          3           asset1    3 2018-10-27 10:01:00
   4: 2018-10-27 10:01:30          4           asset2    4 2018-10-27 10:01:30
   5: 2018-10-27 10:02:00          5           asset1    5 2018-10-27 10:02:00

使用data.table 最优雅、最快捷的方式是:

  1. 拆分:对于每一行定义在过去或未来最多 60 秒(时间差小于 60 秒)具有 datetime 的其他行,并且具有相同的 InstrumentSymbol这条线的。
  2. 应用:在这些接近的行中,哪一个具有最接近该行的TradePrice[i]TradePrice:获取原始data.frame 中的index 和另一行的TradePrice
  3. 组合:将结果作为新列重新组合到原始 data.table 中,例如作为新列 index.minpricewithin60minpricewithin60

示例结果:

> res
         PriorityDateTime TradePrice InstrumentSymbol   id            datetime minpricewithin60 index.minpricewithin60
   1: 2018-10-27 10:00:00          1           asset1    1 2018-10-27 10:00:00                2                      2
   2: 2018-10-27 10:00:30          2           asset2    2 2018-10-27 10:00:30                4                      4
   3: 2018-10-27 10:01:00          3           asset1    3 2018-10-27 10:01:00                1                      1
   4: 2018-10-27 10:01:30          4           asset2    4 2018-10-27 10:01:30                2                      2
   5: 2018-10-27 10:02:00          5           asset1    5 2018-10-27 10:02:00                3                      3

base 中,我可以修复一行并将其用于条件。例如,如果我想获得第一个TradePrice,其中id 与该行的id 相同,我可以使用apply(df,1, function(x) df$TradePrice[which(df$id==x["id"])[1]])。您能否解释一下data.table 的连接(例如)如何实现相同的效果?

编辑:数据现在更大,我可以在不到 2.5 分钟的时间内在我的体面 PC(i7 4750 2B,12GB RAM)上运行的任何答案都将被考虑。干杯。

【问题讨论】:

    标签: r data.table


    【解决方案1】:

    OP 没有提及新数据集的大小。但是Rcpp 解决方案应该可以加快速度。

    根据之前的评论:

    mtd1 <- function() {
        ndf[, rn:=.I]
        iidx <- ndf[
            .(inst=InstrumentSymbol, prevMin=datetime-60L, nextMin=datetime+60L, idx=id, tp=TradePrice),
    
            .SD[id != idx, rn[which.min(abs(TradePrice - tp))]],
    
            by=.EACHI,
    
            on=.(InstrumentSymbol=inst, datetime>=prevMin, datetime<=nextMin)];
    
        ndf[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
    }
    

    arg0naut 的做法:

    mtd2 <- function() {
        res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
            res2,  on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
                idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
                    , .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
                        res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
    
    }
    

    一种可能的 Rcpp 方法:

    library(Rcpp)
    cppFunction('
    NumericVector nearestPrice(NumericVector id, NumericVector datetime, NumericVector price) {
        int i, j, n = id.size();
        NumericVector res(n);
        double prev, diff;
    
        for (i=0; i<n; i++) {
            prev = 100000;
    
            j = i-1;
            while (datetime[j] >= datetime[i]-60 && j>=0) {
                diff = std::abs(price[i] - price[j]);
    
                if (diff < prev) {
                    res[i] = id[j];
                    prev = diff;
                }
                j--;
            }
    
            j = i+1;
            while (datetime[j] <= datetime[i]+60 && j<=n) {
                diff = std::abs(price[i] - price[j]);
    
                if (diff < prev) {
                    res[i] = id[j];
                    prev = diff;
                }
                j++;
            }
        }
    
        return(res);
    }
    ')
    
    mtd3 <- function() {
        setorder(ndf2, InstrumentSymbol, PriorityDateTime)
        iidx <- ndf2[, nearestPrice(.I, datetime, TradePrice), by=.(InstrumentSymbol)]
        ndf2[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
    }
    

    计时码:

    library(microbenchmark)
    microbenchmark(mtd1(), mtd2(), mtd3(), times=3L)
    

    时间安排:

    Unit: milliseconds
       expr         min          lq        mean      median          uq         max neval
     mtd1() 49447.09713 49457.12408 49528.14395 49467.15103 49568.66737 49670.18371     3
     mtd2() 64189.67241 64343.67138 64656.40058 64497.67034 64889.76466 65281.85899     3
     mtd3()    17.33116    19.58716    22.36557    21.84316    24.88277    27.92238     3
    

    数据:

    set.seed(0L)
    initial.date <- as.POSIXct('2018-01-01 00:00:00', tz='GMT')
    last.date <- initial.date + 30 * (180000/2)
    PriorityDateTime <- seq.POSIXt(from=initial.date, to=last.date, by='30 sec')
    
    library(data.table)
    ndf <- data.table(PriorityDateTime=c(PriorityDateTime, PriorityDateTime),
        TradePrice=rnorm(length(PriorityDateTime)*2, 100, 20),
        InstrumentSymbol=rep(c('asset1','asset2'), each=length(PriorityDateTime)),
        datetime=c(PriorityDateTime, PriorityDateTime))
    setorder(ndf, InstrumentSymbol, PriorityDateTime)[, id := .I]
    res <- copy(ndf)
    res2  <- copy(ndf)
    ndf2 <- copy(ndf)
    

    【讨论】:

    • 非常感谢您的回答,将尝试一下。作为一个附带问题,您是否知道是否也可以包含一些已编译的 c++ 代码?
    • 也许你可以在 Rcpp 下发布一个问题。 Rcpp 专家应该可以为您解答
    • 使用 originql 数据集,这不会返回正确的解决方案,因为第一行应该链接到价格为 3 的第三个索引。我猜这只是函数中的错误?然而,节省的时间太棒了...&gt; head(mtd3()) PriorityDateTime TradePrice InstrumentSymbol id datetime minpricewithin60 index.minpricewithin60 1: 2018-10-27 10:00:00 1 asset1 1 2018-10-27 10:00:00 5 5
    • @gpier 我的 id 恰好也是行号。改变是。我应该工作
    【解决方案2】:

    这可能有效:

    res <- res[1:5,]
    
    res2 <- setDT(res)
    res2 <- res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
    res2,  on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
    idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
    , .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
    res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
    
    res2[]
    
    
       id minpricewithin60 index.minpricewithin60    PriorityDateTime TradePrice InstrumentSymbol            datetime
    1:  1                3                      3 2018-10-27 10:00:00          1           asset1 2018-10-27 10:00:00
    2:  2                4                      4 2018-10-27 10:00:30          2           asset2 2018-10-27 10:00:30
    3:  3                1                      1 2018-10-27 10:01:00          3           asset1 2018-10-27 10:01:00
    4:  4                2                      2 2018-10-27 10:01:30          4           asset2 2018-10-27 10:01:30
    5:  5                3                      3 2018-10-27 10:02:00          5           asset1 2018-10-27 10:02:00
    

    【讨论】:

    • 太好了,非常感谢!您使用 which.min(i.idx) 而不是 which.min(i.TradePrice) 有什么特别的原因吗?我查了一下,结果是一样的。另一个问题:在这条链[res2, on = .(InstrumentSymbol = InstrumentSymbol, datetime &gt;= min_60, datetime &lt;= plus_60), allow.cartesian = TRUE] 之后,我剩下3 列与datetime 有某种关联:我有datetimedatetime.1i.datetime,你能解释一下它们代表什么吗?
    • 其实,我想我看错了你的说明:在这种情况下,应该是.SD[which.min(abs(i.TradePrice - TradePrice))]。我已经纠正了这一点,现在它应该只返回最接近的交易价格 - 在你的例子中它并不重要,因为它们之间的差异是相等的。您提到的列是datetime 上非等值连接的结果。一列用于所有加入并满足条件&gt;= 的日期,另一列用于所有&lt;=
    • 如果您想尝试stackoverflow.com/questions/53035534/…,请查看 dplyr 这个问题
    • 检查后还是很慢,只是比base R快一点,data.table有没有更快的方法?
    • 你需要运行 setDT(ndf)
    【解决方案3】:

    我已经分解了代码,以便更容易查看正在发生的事情并进行故障排除。真的,这只是最后一行需要任何时间。我还让价格数据变得更有趣和可测试。它在我的笔记本电脑上运行大约 1.3 分钟。

    library(data.table)
    library(lubridate)
    
    set.seed(1)
    initial.date <- as.POSIXct('2018-10-27 10:00:00',tz='GMT')
    last.date <- as.POSIXct('2018-12-28 17:00:00',tz='GMT') 
    PriorityDateTime=seq.POSIXt(from=initial.date,to = last.date,by = '30 sec')
    TradePrice=runif(length(PriorityDateTime))
    ndf<- data.frame(PriorityDateTime,TradePrice)
    ndf$InstrumentSymbol <- rep_len(x = c('asset1','asset2'),length.out = length(ndf$PriorityDateTime))
    ndf$id <- seq(1:length(x = ndf$InstrumentSymbol))
    ndf$datetime <- ymd_hms(ndf$PriorityDateTime)
    setDT(ndf)
    
    # Relevant Code
    
    # Setup (Trivial Runtime):
    ndf[, datetime_max := datetime + 60]
    ndf[, datetime_min := datetime - 60]
    
    ndf_x <- copy(ndf)
    ndf_y <- copy(ndf)
    
    names(ndf_x) <- paste0(names(ndf),"_x")
    names(ndf_y) <- paste0(names(ndf),"_y")
    
    ndf_join <- ndf_x[ndf_y,on = .(InstrumentSymbol_x = InstrumentSymbol_y, datetime_x >= datetime_min_y, datetime_x <= datetime_max_y), mult = "all", allow.cartesian = TRUE]
    ndf_join <- ndf_join[id_x != id_y]
    
    ndf_join[, price_delta := abs(TradePrice_y - TradePrice_x)]
    

    这是最耗时的代码:

    # Harworking Runtime:
    time_now <- Sys.time()
    ndf_out <- ndf_join[,.SD[which.min(price_delta), .(which_price = id_x, what_price = TradePrice_x)], 
                          by = .(PriorityDateTime_y,TradePrice_y, id_y, InstrumentSymbol_x, datetime_y)]
    cat(Sys.time() - time_now)
    # 1.289397
    

    输出:

    ndf_out
             PriorityDateTime_y TradePrice_y   id_y InstrumentSymbol_x          datetime_y which_price what_price
         1: 2018-10-27 10:00:00   0.26550866      1             asset1 2018-10-27 10:00:00           3 0.57285336
         2: 2018-10-27 10:00:30   0.37212390      2             asset2 2018-10-27 10:00:30           4 0.90820779
         3: 2018-10-27 10:01:00   0.57285336      3             asset1 2018-10-27 10:01:00           1 0.26550866
         4: 2018-10-27 10:01:30   0.90820779      4             asset2 2018-10-27 10:01:30           6 0.89838968
         5: 2018-10-27 10:02:00   0.20168193      5             asset1 2018-10-27 10:02:00           3 0.57285336
        ---                                                                                                      
    179397: 2018-12-28 16:58:00   0.54342007 179397             asset1 2018-12-28 16:58:00      179395 0.55391579
    179398: 2018-12-28 16:58:30   0.25181676 179398             asset2 2018-12-28 16:58:30      179400 0.28088354
    179399: 2018-12-28 16:59:00   0.08879969 179399             asset1 2018-12-28 16:59:00      179401 0.19670841
    179400: 2018-12-28 16:59:30   0.28088354 179400             asset2 2018-12-28 16:59:30      179398 0.25181676
    179401: 2018-12-28 17:00:00   0.19670841 179401             asset1 2018-12-28 17:00:00      179399 0.08879969
    

    【讨论】:

    • 非常感谢您的回答,让自己熟悉这些类型的联接
    【解决方案4】:

    对目前提出的不同解决方案进行基准测试(作为基准,我的基本 R 方法使用此数据大约需要 55 分钟):

    library(microbenchmark)
    microbenchmark(Chris(),
                   chinsoon12.cpp(),
                   arg0naut(),
                   chinsoon12.data.table(), times=3L)
    

    这是使用规格 i5-6500T @ 2.50GHz 和 8GB RAM 完成的。

    > tm
    Unit: milliseconds
                        expr         min          lq        mean     median          uq         max neval  cld
                     Chris() 95605.92838 95674.46039 96735.74794 95742.9924 97300.65772 98858.32305     3    d
            chinsoon12.cpp()    22.69009    23.07224    23.32106    23.4544    23.63655    23.81871     3 a   
                  arg0naut() 84848.28652 85555.15312 86985.39963 86262.0197 88053.95619 89845.89267     3   c 
     chinsoon12.data.table() 66327.23992 66838.09245 67695.28538 67348.9450 68379.30811 69409.67124     3  b  
    

    我知道这个问题与data.table 有关,但考虑到 Rcpp 方法快 2886.251 倍,我将奖励这个解决方案。非常感谢

    完整代码:

    library(Rcpp)
    library(data.table)
    initial.date <- as.POSIXct('2018-10-27 10:00:00',tz='GMT')
    last.date <- as.POSIXct('2018-12-28 17:00:00',tz='GMT') 
    PriorityDateTime=seq.POSIXt(from=initial.date,to = last.date,by = '30 sec')
    TradePrice=seq(from=1, to=length(PriorityDateTime),by = 1)
    ndf<- data.frame(PriorityDateTime,TradePrice)
    ndf$InstrumentSymbol <- rep_len(x = c('asset1','asset2'),length.out = length(ndf$PriorityDateTime))
    ndf$id <- seq(1:length(x = ndf$InstrumentSymbol))
    ndf$datetime <- ymd_hms(ndf$PriorityDateTime)
    res <- ndf %>% data.table()
    res2 <- res
    setDT(ndf)
    ndf2 <- ndf
    chinsoon12.data.table <- function() {
      ndf[, rn:=.I]
      iidx <- ndf[
        .(inst=InstrumentSymbol, prevMin=datetime-60L, nextMin=datetime+60L, idx=id, tp=TradePrice),
    
        .SD[id != idx, rn[which.min(abs(TradePrice - tp))]],
    
        by=.EACHI,
    
        on=.(InstrumentSymbol=inst, datetime>=prevMin, datetime<=nextMin)];
    
      ndf[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
    }
    
    arg0naut <- function() {
      res2[, `:=` (min_60 = datetime - 60, plus_60 = datetime + 60, idx = .I)][
        res2,  on = .(InstrumentSymbol = InstrumentSymbol, datetime >= min_60, datetime <= plus_60), allow.cartesian = TRUE][
          idx != i.idx, .SD[which.min(abs(i.TradePrice - TradePrice))], by = id][
            , .(id, minpricewithin60 = i.TradePrice, index.minpricewithin60 = i.idx)][
              res, on = .(id)][, `:=` (min_60 = NULL, plus_60 = NULL, idx = NULL)]
    }
    
    cppFunction('NumericVector nearestPrice(NumericVector id, NumericVector datetime, NumericVector price) {
                int i, j, n = id.size();
                NumericVector res(n);
                double prev, diff;
    
                for (i=0; i<n; i++) {
                prev = 100000;
    
                j = i-1;
                while (datetime[j] >= datetime[i]-60 && j>=0) {
                diff = std::abs(price[i] - price[j]);
    
                if (diff < prev) {
                res[i] = id[j];
                prev = diff;
                }
                j--;
                }
    
                j = i+1;
                while (datetime[j] <= datetime[i]+60 && j<=n) {
                diff = std::abs(price[i] - price[j]);
    
                if (diff < prev) {
                res[i] = id[j];
                prev = diff;
                }
                j++;
                }
                }
    
                return(res);
                }')
    chinsoon12.cpp <- function() {
      setorder(ndf2, InstrumentSymbol, PriorityDateTime)
      iidx <- ndf2[, nearestPrice(.I, datetime, TradePrice), by=.(InstrumentSymbol)]
      ndf2[, c("minpricewithin60", "index.minpricewithin60") := .SD[iidx$V1, .(TradePrice, id)]]
    }
    
    
    
    
    # Setup (Trivial Runtime):
    Chris <- function() {
    ndf[, datetime_max := datetime + 60]
    ndf[, datetime_min := datetime - 60]
    ndf_x <- copy(ndf)
    ndf_y <- copy(ndf)
    names(ndf_x) <- paste0(names(ndf),"_x")
    names(ndf_y) <- paste0(names(ndf),"_y")
    ndf_join <- ndf_x[ndf_y,on = .(InstrumentSymbol_x = InstrumentSymbol_y, datetime_x >= datetime_min_y, datetime_x <= datetime_max_y), mult = "all", allow.cartesian = TRUE]
    ndf_join <- ndf_join[id_x != id_y]
    ndf_join[, price_delta := abs(TradePrice_y - TradePrice_x)]
    # Harworking Runtime:
    time_now <- Sys.time()
    ndf_out <- ndf_join[,.SD[which.min(price_delta), .(which_price = id_x, what_price = TradePrice_x)], 
                        by = .(PriorityDateTime_y,TradePrice_y, id_y, InstrumentSymbol_x, datetime_y)]
    }
    
    
    
    
    library(microbenchmark)
    tm <- microbenchmark(Chris(),
                   chinsoon12.cpp(),
                   arg0naut(),
                   chinsoon12.data.table(), times=3L)
    ggplot2::autoplot(tm[c(2:4),])
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 2012-07-02
      • 1970-01-01
      • 2017-10-01
      • 1970-01-01
      • 2013-08-09
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多