【问题标题】:Apply a custom function to pairs of columns in a dataframe将自定义函数应用于数据框中的列对
【发布时间】:2021-09-20 12:44:34
【问题描述】:

我想将自定义函数应用于数据框中的所有列对,以获得结果的 p x p 矩阵/数据框。在 tidyverse 中有没有快速的方法来做到这一点?

输出应该是results 数据框。

custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

result <- tibble(cols = c("x","y","z"), 
                 x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
                 y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
                 z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))

result

【问题讨论】:

  • 基本 R 方式:sapply(data, function(y) sapply(data, custom_function, y = y)).

标签: r dplyr tidyverse data-manipulation purrr


【解决方案1】:

您可以使用以下解决方案:

library(dplyr)
library(tibble)

expand.grid(names(data), names(data)) %>%
  rowwise() %>%
  mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
  pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
  column_to_rownames("Var2")

           x        y         z
x -0.3591433 2.157343 -1.470995
y  2.1573430 4.673829  1.045491
z -1.4709953 1.045491 -2.582847

【讨论】:

  • 是的,使用 3 maps 出现故障,我想我需要一趟。
【解决方案2】:

如果custom_function 是矢量化的,我们可以直接使用outer。但它使用的是sum,这是一个标量函数,因此我们可以通过将它包裹在FUN = 中的FUN = 参数中的Vectorize() 周围来使用它@ 987654327@。这样做-

outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))


tidyverse 策略虽然有点冗长,但如果需要,您可以在 tidyverse 中管理此方法。

library(tidyverse)
custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

expand.grid(names(data), names(data)) %>% 
  mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
  pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
  column_to_rownames('Var1') %>%
  as.matrix()
#>            x        y         z
#> x -0.3591433 2.157343 -1.470995
#> y  2.1573430 4.673829  1.045491
#> z -1.4709953 1.045491 -2.582847

reprex package (v2.0.0) 于 2021-07-10 创建

【讨论】:

    【解决方案3】:

    一个想法:

    library(dplyr, warn.conflicts = FALSE)
    
    custom_function <- function(x, y) {
      sum(x, y)
    }
    
    set.seed(100)
    data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
    
    data_long <-
      data %>%
      mutate(id = 1:nrow(.)) %>%
      tidyr::pivot_longer(cols = -id)
    
    result <-  
      data_long %>%
      inner_join(data_long, by = "id") %>%
      group_by(name.x, name.y) %>%
      summarize(value = custom_function(value.x, value.y),
                .groups = "drop") %>%
      tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
      rename(cols = name.y)
    
    result
    #> # A tibble: 3 x 4
    #>   cols       x     y     z
    #>   <chr>  <dbl> <dbl> <dbl>
    #> 1 x     -0.359  2.16 -1.47
    #> 2 y      2.16   4.67  1.05
    #> 3 z     -1.47   1.05 -2.58
    

    reprex package (v2.0.0) 于 2021-07-10 创建

    在这里它被组织成一个函数:

    library(dplyr, warn.conflicts = FALSE)
    
    custom_function <- function(x, y) {
      sum(x, y)
    }
    
    set.seed(100)
    data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
    
    custom_summ <- function(df, f) {
      
      data_long <-
        data %>%
        mutate(id = 1:nrow(.)) %>%
        tidyr::pivot_longer(cols = -id)
    
      result <-  
        data_long %>%
        inner_join(data_long, by = "id") %>%
        group_by(name.x, name.y) %>%
        summarize(value = f(value.x, value.y),
                  .groups = "drop") %>%
        tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
        rename(cols = name.y)
      
      result
    }
    
    custom_summ(data, custom_function)
    #> # A tibble: 3 x 4
    #>   cols       x     y     z
    #>   <chr>  <dbl> <dbl> <dbl>
    #> 1 x     -0.359  2.16 -1.47
    #> 2 y      2.16   4.67  1.05
    #> 3 z     -1.47   1.05 -2.58
    

    reprex package (v2.0.0) 于 2021-07-10 创建

    这里是各种选项的一些基准数据。如果性能完全是一个问题,那么接受的答案中提供的 tidyverse 方法就不是一个好的方法。这里最快的选项是在对问题的评论中提供的基于sapply 的选项。

    library(tidyverse)
    
    custom_function <- function(x, y) {
      sum(x, y)
    }
    
    set.seed(100)
    
    get_data <- function() {
      data <- lapply(letters, function(i) rnorm(1000))
      names(data) <- letters
      as_tibble(data)
    }
    
    custom_summ <- function(df, f) {
      
      data_long <-
        data %>%
        mutate(id = 1:nrow(.)) %>%
        pivot_longer(cols = -id)
      
      result <-  
        data_long %>%
        inner_join(data_long, by = "id") %>%
        group_by(name.x, name.y) %>%
        summarize(value = f(value.x, value.y),
                  .groups = "drop") %>%
        pivot_wider(names_from = name.x, values_from = value) %>%
        rename(cols = name.y)
      
      result
    }
    
    data <- get_data()
    
    system.time(custom_summ(data, custom_function))
    #>    user  system elapsed 
    #>   0.053   0.007   0.062
    
    custom_summ_2 <- function(data, f) {
      expand.grid(names(data), names(data)) %>% 
      mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
      pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
      column_to_rownames('Var1') %>%
      as.matrix()
    }
    
    system.time(custom_summ_2(data, custom_function))
    #>    user  system elapsed 
    #>  26.479   0.317  27.365
    
    custom_summ_3 <- function(data, f) {
      expand.grid(names(data), names(data)) %>%
        rowwise() %>%
        mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
        pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
        column_to_rownames("Var2")
    }
    
    system.time(custom_summ_3(data, custom_function))
    #>    user  system elapsed 
    #>   0.048   0.001   0.049
    
    custom_summ_4 <- function(data, f) {
      sapply(data, function(y) sapply(data, f, y = y))
    }
    
    system.time(custom_summ_4(data, custom_function))
    #>    user  system elapsed 
    #>   0.003   0.000   0.003
    
    custom_summ_5 <- function(data, f) {
      outer(names(data), names(data), 
            FUN = Vectorize(function(x, y) f (data[x], data[y])))
    }
    
    system.time(custom_summ_5(data, custom_function))
    #>    user  system elapsed 
    #>   0.044   0.001   0.045
    

    reprex package (v2.0.0) 于 2021-07-11 创建

    【讨论】:

      猜你喜欢
      • 2020-10-28
      • 2023-03-06
      • 2018-03-03
      • 2021-08-03
      • 2017-01-28
      • 1970-01-01
      • 1970-01-01
      • 2018-11-10
      • 2018-01-04
      相关资源
      最近更新 更多