将自定义函数应用于数据框中的列对答案

【问题标题】：Apply a custom function to pairs of columns in a dataframe将自定义函数应用于数据框中的列对
【发布时间】：2021-09-20 12:44:34
【问题描述】：

我想将自定义函数应用于数据框中的所有列对，以获得结果的 p x p 矩阵/数据框。在 tidyverse 中有没有快速的方法来做到这一点？

输出应该是results 数据框。

custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

result <- tibble(cols = c("x","y","z"), 
                 x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
                 y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
                 z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))

result

【问题讨论】：

基本 R 方式：sapply(data, function(y) sapply(data, custom_function, y = y)).

标签： r dplyr tidyverse data-manipulation purrr

【解决方案1】：

您可以使用以下解决方案：

library(dplyr)
library(tibble)

expand.grid(names(data), names(data)) %>%
  rowwise() %>%
  mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
  pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
  column_to_rownames("Var2")

           x        y         z
x -0.3591433 2.157343 -1.470995
y  2.1573430 4.673829  1.045491
z -1.4709953 1.045491 -2.582847

【讨论】：

是的，使用 3 maps 出现故障，我想我需要一趟。

【解决方案2】：

如果custom_function 是矢量化的，我们可以直接使用outer。但它使用的是sum，这是一个标量函数，因此我们可以通过将它包裹在FUN = 中的FUN = 参数中的Vectorize() 周围来使用它@ 987654327@。这样做-

outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))

tidyverse 策略虽然有点冗长，但如果需要，您可以在 tidyverse 中管理此方法。

library(tidyverse)
custom_function <- function(x, y){
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

expand.grid(names(data), names(data)) %>% 
  mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
  pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
  column_to_rownames('Var1') %>%
  as.matrix()
#>            x        y         z
#> x -0.3591433 2.157343 -1.470995
#> y  2.1573430 4.673829  1.045491
#> z -1.4709953 1.045491 -2.582847

^{由 reprex package (v2.0.0) 于 2021-07-10 创建}

【讨论】：

【解决方案3】：

一个想法：

library(dplyr, warn.conflicts = FALSE)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

data_long <-
  data %>%
  mutate(id = 1:nrow(.)) %>%
  tidyr::pivot_longer(cols = -id)

result <-  
  data_long %>%
  inner_join(data_long, by = "id") %>%
  group_by(name.x, name.y) %>%
  summarize(value = custom_function(value.x, value.y),
            .groups = "drop") %>%
  tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
  rename(cols = name.y)

result
#> # A tibble: 3 x 4
#>   cols       x     y     z
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 x     -0.359  2.16 -1.47
#> 2 y      2.16   4.67  1.05
#> 3 z     -1.47   1.05 -2.58

^{由 reprex package (v2.0.0) 于 2021-07-10 创建}

在这里它被组织成一个函数：

library(dplyr, warn.conflicts = FALSE)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))

custom_summ <- function(df, f) {
  
  data_long <-
    data %>%
    mutate(id = 1:nrow(.)) %>%
    tidyr::pivot_longer(cols = -id)

  result <-  
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  
  result
}

custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#>   cols       x     y     z
#>   <chr>  <dbl> <dbl> <dbl>
#> 1 x     -0.359  2.16 -1.47
#> 2 y      2.16   4.67  1.05
#> 3 z     -1.47   1.05 -2.58

^{由reprex package (v2.0.0) 于 2021-07-10 创建}

这里是各种选项的一些基准数据。如果性能完全是一个问题，那么接受的答案中提供的 tidyverse 方法就不是一个好的方法。这里最快的选项是在对问题的评论中提供的基于sapply 的选项。

library(tidyverse)

custom_function <- function(x, y) {
  sum(x, y)
}

set.seed(100)

get_data <- function() {
  data <- lapply(letters, function(i) rnorm(1000))
  names(data) <- letters
  as_tibble(data)
}

custom_summ <- function(df, f) {
  
  data_long <-
    data %>%
    mutate(id = 1:nrow(.)) %>%
    pivot_longer(cols = -id)
  
  result <-  
    data_long %>%
    inner_join(data_long, by = "id") %>%
    group_by(name.x, name.y) %>%
    summarize(value = f(value.x, value.y),
              .groups = "drop") %>%
    pivot_wider(names_from = name.x, values_from = value) %>%
    rename(cols = name.y)
  
  result
}

data <- get_data()

system.time(custom_summ(data, custom_function))
#>    user  system elapsed 
#>   0.053   0.007   0.062

custom_summ_2 <- function(data, f) {
  expand.grid(names(data), names(data)) %>% 
  mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
  pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
  column_to_rownames('Var1') %>%
  as.matrix()
}

system.time(custom_summ_2(data, custom_function))
#>    user  system elapsed 
#>  26.479   0.317  27.365

custom_summ_3 <- function(data, f) {
  expand.grid(names(data), names(data)) %>%
    rowwise() %>%
    mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
    pivot_wider(names_from = unique("Var1"), values_from = "Res") %>% 
    column_to_rownames("Var2")
}

system.time(custom_summ_3(data, custom_function))
#>    user  system elapsed 
#>   0.048   0.001   0.049

custom_summ_4 <- function(data, f) {
  sapply(data, function(y) sapply(data, f, y = y))
}

system.time(custom_summ_4(data, custom_function))
#>    user  system elapsed 
#>   0.003   0.000   0.003

custom_summ_5 <- function(data, f) {
  outer(names(data), names(data), 
        FUN = Vectorize(function(x, y) f (data[x], data[y])))
}

system.time(custom_summ_5(data, custom_function))
#>    user  system elapsed 
#>   0.044   0.001   0.045

^{由reprex package (v2.0.0) 于 2021-07-11 创建}

【讨论】：