【问题标题】:r efficient way to calculate the difference between rowsr 计算行之间差异的有效方法
【发布时间】:2019-01-31 22:01:00
【问题描述】:

考虑到这是我下面的数据集

 Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
          5.7         2.5          5.0         2.0  virginica
          7.7         3.0          6.1         2.3  virginica
          6.7         3.3          5.7         2.1  virginica
          4.8         3.0          1.4         0.1     setosa
          5.5         4.2          1.4         0.2     setosa
          4.9         3.6          1.4         0.1     setosa
          6.3         3.3          4.7         1.6 versicolor
          5.6         2.9          3.6         1.3 versicolor
          5.9         3.0          4.2         1.5 versicolor


df <- structure(list(Sepal.Length = c(5.7, 7.7, 6.7, 4.8, 5.5, 4.9, 
    6.3, 5.6, 5.9), Sepal.Width = c(2.5, 3, 3.3, 3, 4.2, 3.6, 3.3, 
    2.9, 3), Petal.Length = c(5, 6.1, 5.7, 1.4, 1.4, 1.4, 4.7, 3.6, 
    4.2), Petal.Width = c(2, 2.3, 2.1, 0.1, 0.2, 0.1, 1.6, 1.3, 1.5
    ), Species = structure(c(3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("setosa", 
    "versicolor", "virginica"), class = "factor")), row.names = c(NA, 
    -9L), class = "data.frame")

我的目标是

  1. 从第一行 Species == "virginica" 中减去 Sepal.Length Sepal.Width Petal.Length Petal.Width 的值与每一行 "Setosa",

    我在下面这样做

    Virginia1_vs_Setosa1a <- df[1:4][df$Species == "virginica",][1,] - df[1:4][df$Species == "setosa",][1,]
    Virginia1_vs_Setosa1a 
        0.9        -0.5          3.6         1.9
    Virginia1_vs_Setosa2a <- df[1:4][df$Species == "virginica",][1,] - df[1:4][df$Species == "setosa",][2,]
    Virginia1_vs_Setosa2a
        0.2        -1.7          3.6         1.8
    Virginia1_vs_Setosa3a <- df[1:4][df$Species == "virginica",][1,] - df[1:4][df$Species == "setosa",][3,]
    Virginia1_vs_Setosa3a
        0.8        -1.1          3.6         1.9
    
  2. 取每个元素的乘积

      Virginia1_vs_Setosa1 <-  as.numeric(
                         Virginia1_vs_Setosa1a[1]*Virginia1_vs_Setosa1a[2]*
                          Virginia1_vs_Setosa1a[3]*Virginia1_vs_Setosa1a[4])
                          0.9*-0.5*3.6*1.9 = -3.078
      Virginia1_vs_Setosa2  <- as.numeric(
                         Virginia1_vs_Setosa2a[1]*Virginia1_vs_Setosa2a[2]*
                          Virginia1_vs_Setosa2a[3]*Virginia1_vs_Setosa2a[4])
                          0.2*-1.7*3.6*1.8 = -2.2032
      Virginia1_vs_Setosa3  <- as.numeric(
                         Virginia1_vs_Setosa3a[1]*Virginia1_vs_Setosa3a[2]*
                          Virginia1_vs_Setosa3a[3]*Virginia1_vs_Setosa3a[4])
                          0.8*-1.1*3.6*1.9 = -6.0192
    

Virginica 的第 2 行与 setosa 的每一行类似。

      Virginia2_vs_Setosa1a <- df[1:4][df$Species == "virginica",][2,] - df[1:4][df$Species == "setosa",][1,]

      Virginia2_vs_Setosa2a <- df[1:4][df$Species == "virginica",][2,] - df[1:4][df$Species == "setosa",][2,]

      Virginia2_vs_Setosa3a <-  df[1:4][df$Species == "virginica",][2,] - df[1:4][df$Species == "setosa",][3,]

      Virginia2_vs_Setosa1 <-  as.numeric(
              Virginia2_vs_Setosa1a[1]*Virginia2_vs_Setosa1a[2]*
              Virginia2_vs_Setosa1a[3]*Virginia2_vs_Setosa1a[4])

      Virginia2_vs_Setosa2  <- as.numeric(
              Virginia2_vs_Setosa2a[1]*Virginia2_vs_Setosa2a[2]*
              Virginia2_vs_Setosa2a[3]*Virginia2_vs_Setosa2a[4])

      Virginia2_vs_Setosa3  <- as.numeric(
              Virginia2_vs_Setosa3a[1]*Virginia2_vs_Setosa3a[2]*
              Virginia2_vs_Setosa3a[3]*Virginia2_vs_Setosa3a[4])

              rm(Virginia2_vs_Setosa1a, Virginia2_vs_Setosa2a, 
              Virginia2_vs_Setosa3a)

弗吉尼亚州的第三行与 setosa 的每一行类似

       Virginia3_vs_Setosa1a <- df[1:4][df$Species == "virginica",][3,] - df[1:4][df$Species == "setosa",][1,]

       Virginia3_vs_Setosa2a <- df[1:4][df$Species == "virginica",][3,] - df[1:4][df$Species == "setosa",][2,]

       Virginia3_vs_Setosa3a <-  df[1:4][df$Species == "virginica",][3,] - df[1:4][df$Species == "setosa",][3,]

       Virginia3_vs_Setosa1 <-  as.numeric(
                 Virginia3_vs_Setosa1a[1]*Virginia3_vs_Setosa1a[2]*
                 Virginia3_vs_Setosa1a[3]*Virginia3_vs_Setosa1a[4])

       Virginia3_vs_Setosa2  <- as.numeric(
                 Virginia3_vs_Setosa2a[1]*Virginia3_vs_Setosa2a[2]*
                 Virginia3_vs_Setosa2a[3]*Virginia3_vs_Setosa2a[4])


       Virginia3_vs_Setosa3  <- as.numeric(
                 Virginia3_vs_Setosa3a[1]*Virginia3_vs_Setosa3a[2]*
                 Virginia3_vs_Setosa3a[3]*Virginia3_vs_Setosa3a[4])

         rm(Virginia3_vs_Setosa1a, Virginia3_vs_Setosa2a, 
            Virginia3_vs_Setosa3a)

最后创建一个像下面这样的 3*3 矩阵

matrix(c(Virginia1_vs_Setosa1, Virginia1_vs_Setosa2, Virginia1_vs_Setosa3, Virginia2_vs_Setosa1, Virginia2_vs_Setosa2, Virginia2_vs_Setosa3,
  Virginia3_vs_Setosa1, Virginia3_vs_Setosa2, Virginia3_vs_Setosa3), nrow=3, ncol=3)


       [,1]     [,2]    [,3]
[1,] -3.0780   0.0000  4.9020
[2,] -2.2032 -26.0568 -8.8236
[3,] -6.0192 -17.3712 -4.6440

如您所见,我的解决方案非常笨拙且效率低下。如果有人能向我展示实现相同结果的有效方法,我将非常感激。

【问题讨论】:

    标签: r dplyr apply


    【解决方案1】:

    您可以使用双 for 循环来执行此操作。也许有 *apply 系列函数的解决方案,但这个可行。

    f <- droplevels(df$Species[df$Species != "versicolor"])
    sp <- split(df[df$Species != "versicolor", ], f)
    
    res <- matrix(0, 3, 3)
    for(i in 1:nrow(sp[[1]])){
      for(j in 1:nrow(sp[[2]])){
        res[i, j] <- prod(sp[[2]][j, -5] - sp[[1]][i, -5])
      }
    }
    
    res
    #        [,1]     [,2]    [,3]
    #[1,] -3.0780   0.0000  4.9020
    #[2,] -2.2032 -26.0568 -8.8236
    #[3,] -6.0192 -17.3712 -4.6440
    

    【讨论】:

      【解决方案2】:

      对于这种特殊情况,你可以借鉴outer的一些想法

      X <- lapply(split(df[df$Species=="virginica", 1:4], 1:3), unlist)
      Y <- lapply(split(df[df$Species=="setosa", 1:4], 1:3), unlist)
      
      FUN <- function(l1, l2) mapply(function(v,w) prod(v-w), l1, l2)
      Y <- rep(Y, rep.int(length(X), length(Y)))
      if (length(X)) 
          X <- rep(X, times = ceiling(length(Y)/length(X)))
      matrix(FUN(X, Y), ncol=3L, byrow=TRUE)
      

      对于最一般的情况,您需要生成每对可能的不同行,然后根据您的公式进行计算。使用data.table,会是这样的:

      library(data.table)
      setDT(df)
      setorder(df, Species)[, numid := rowid(Species)]
      
      parts <- split(df, by=c("Species", "numid"))
      combis <- CJ(parts, parts, sorted=FALSE)
      combis[, .(
              Species1=V1[[1]][,Species], 
              numid1=V1[[1]][,numid],
              Species2=V2[[1]][,Species], 
              numid2=V2[[1]][,numid],
              differ=prod(V1[[1]][, 1:4] - V2[[1]][, 1:4])), 
          by=seq_len(combis[,.N])][
              Species1!=Species2, -1L]
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        相关资源
        最近更新 更多