【发布时间】:2021-10-14 10:04:20
【问题描述】:
我试图编写一个函数来删除重复的列(内容相同的列),具体做法是成对比较名称相同、仅相差一个后缀的列。
例如:比较"col1"和"col1_suffix"是否有相同的内容。
我已经编写了一些代码,但也许有一些替代方案可以使其更具可读性? (考虑未来的读者)
library(tidyverse)
# Example data: current-exam columns plus their "_previous" counterparts.
# `result` / `result_previous` hold identical content; the other pairs differ.
df <- data.frame(
  stringsAsFactors = FALSE,
  id = c("3333", "7658", "7759", "7934", "3327", "4738"),
  turn = c("manana", "tarde", "tarde", "tarde", "tarde", "manana"),
  answ_parte_general = c(78.75, 78.75, 76.75, 76.5, 76.5, 75.25),
  answ_global = c(78.75, 78.75, 76.75, 76.5, 76.5, 75.25),
  answ_r = c(78.75, 78.75, 76.75, 76.5, 76.5, 75.25),
  result = rep("passed", 6),
  points = c(29.574, 29.574, 28.892, 28.807, 28.807, 28.381),
  answ_r_previous = c(76.75, 77.75, 64.75, 74.5, 68.5, 72.25),
  result_previous = rep("passed", 6),
  points_previous = c(28.892, 29.233, 24.801, 28.125, 26.08, 27.358),
  diff_points = c(2, 1, 12, 2, 8, 3)
)
# Show the full example table.
df
#> id turn answ_parte_general answ_global answ_r result points
#> 1 3333 manana 78.75 78.75 78.75 passed 29.574
#> 2 7658 tarde 78.75 78.75 78.75 passed 29.574
#> 3 7759 tarde 76.75 76.75 76.75 passed 28.892
#> 4 7934 tarde 76.50 76.50 76.50 passed 28.807
#> 5 3327 tarde 76.50 76.50 76.50 passed 28.807
#> 6 4738 manana 75.25 75.25 75.25 passed 28.381
#> answ_r_previous result_previous points_previous diff_points
#> 1 76.75 passed 28.892 2
#> 2 77.75 passed 29.233 1
#> 3 64.75 passed 24.801 12
#> 4 74.50 passed 28.125 2
#> 5 68.50 passed 26.080 8
#> 6 72.25 passed 27.358 3
#' Drop column pairs whose suffixed copy duplicates the base column
#'
#' For every column whose name ends in `suffix` (e.g. `"result_previous"`),
#' the column is compared with its base counterpart (`"result"`). When the
#' two are `identical()`, BOTH columns of the pair are removed.
#'
#' @param df A data frame (or tibble).
#' @param suffix A single non-empty string. It is matched literally (not as a
#'   regular expression) and only at the end of a column name.
#' @return `df` without the column pairs that hold identical content. Suffixed
#'   columns that have no base counterpart are left untouched.
drop_repeated_columns <- function(df, suffix = "_previous") {
  stopifnot(
    is.data.frame(df),
    is.character(suffix),
    length(suffix) == 1L,
    !is.na(suffix),
    nzchar(suffix)
  )

  all_names <- colnames(df)

  # Literal, end-anchored match: "x_previous_flag" is not a suffixed column,
  # and regex metacharacters inside `suffix` carry no special meaning.
  suffixed <- all_names[endsWith(all_names, suffix)]
  base_names <- substr(suffixed, 1L, nchar(suffixed) - nchar(suffix))

  # Only pairs whose base column actually exists can be compared.
  has_base <- base_names %in% all_names
  suffixed <- suffixed[has_base]
  base_names <- base_names[has_base]

  # identical() is an exact comparison: values, type and attributes must agree.
  is_duplicate <- vapply(
    seq_along(suffixed),
    function(i) identical(df[[suffixed[i]]], df[[base_names[i]]]),
    logical(1)
  )

  columns_to_drop <- c(suffixed[is_duplicate], base_names[is_duplicate])
  df[, !(all_names %in% columns_to_drop), drop = FALSE]
}
# Apply the function with the default "_previous" suffix; `result` and
# `result_previous` hold identical content, so that pair is removed.
drop_repeated_columns(df)
#> id turn answ_parte_general answ_global answ_r points answ_r_previous
#> 1 3333 manana 78.75 78.75 78.75 29.574 76.75
#> 2 7658 tarde 78.75 78.75 78.75 29.574 77.75
#> 3 7759 tarde 76.75 76.75 76.75 28.892 64.75
#> 4 7934 tarde 76.50 76.50 76.50 28.807 74.50
#> 5 3327 tarde 76.50 76.50 76.50 28.807 68.50
#> 6 4738 manana 75.25 75.25 75.25 28.381 72.25
#> points_previous diff_points
#> 1 28.892 2
#> 2 29.233 1
#> 3 24.801 12
#> 4 28.125 2
#> 5 26.080 8
#> 6 27.358 3
【问题讨论】:
标签: r dataframe tidyverse subset