【问题标题】:Unable to summarize the minimum and maximum while using for loop(使用 for 循环时无法汇总最小值和最大值)
【发布时间】:2018-07-07 11:16:14
【问题描述】:

以下是随机数据。

drop    drop1   drop2   ch
15  14  40  1
20  15  45  1
35  16  90  1
40  17  70  0
25  18  80  0
30  18  90  0
11  20  100 0
13  36  11  0
16  70  220 0
19  40  440 1
25  45  1   1
35  30  70  1
40  40  230 1
17  11  170 1
30  2   160 1

我正在使用下面的代码对 R 中的连续变量进行变量分析。

library(dplyr)

# Split `drop` into two equal-sized bins once; every statistic below reuses
# this binned copy instead of recomputing ntile() each time.
binned <- dt %>%
  mutate(dec = ntile(drop, n = 2))

# Per-bin row count and the smallest / largest `drop` value inside each bin.
bin_stats <- binned %>%
  group_by(dec) %>%
  summarise(N = n(),
            GreaterThan = min(drop),
            LessThan = max(drop))

# Number of rows with ch == 1 in each bin.
datcbld <- binned %>%
  count(ch, dec) %>%
  filter(ch == 1)

# Attach the per-bin statistics by bin id rather than by position, so the
# values stay aligned even if some bin has no ch == 1 rows.
pos <- match(datcbld$dec, bin_stats$dec)
datcbld$N <- bin_stats$N[pos]
datcbld$ch_perc <- datcbld$n / datcbld$N
datcbld$GreaterThan <- bin_stats$GreaterThan[pos]
datcbld$LessThan <- bin_stats$LessThan[pos]
# Label the rows with the analysed variable (the column name, not the name
# of the data frame).
datcbld$Varname <- rep("drop", nrow(datcbld))

下面是代码的输出。

ch  dec n   N   ch_perc GreaterThan LessThan    Varname
1   1   4   8   0.5 11  25  drop
1   2   5   7   0.714285714 25  40  drop

当我将这段代码用于单个变量时,它可以正常工作。

当我尝试使用 for 循环为每一列运行它时,它无法对每个十分位数的最小值和最大值进行汇总。

下面是我使用 for 循环运行的代码。

# Accumulator for the per-variable summaries; one chunk is appended per
# loop iteration.
finaldata <- data.frame()

# Loop over every column except the last one (`ch`).
for(i in 1:(ncol(dt) - 1)){
  # Bin column i into two groups and count the ch == 1 rows per bin.
  # `dt[, colnames(dt[i])]` extracts column i of `dt` as a plain vector.
  dt %>% 
    mutate(dec=ntile(dt[, colnames(dt[i])], n = 2)) %>%
    count(ch,dec) %>%
    filter(ch == 1) -> dat
  # Total number of rows per bin (second column of the count() result).
  dat$N <- unclass(dt %>% 
                     mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
                     count(dec) %>%
                     unname())[[2]]
  # Share of ch == 1 rows within each bin.
  dat$ch_perc <- dat$n / dat$N
  # NOTE: inside summarise() the expression `dt[, colnames(dt[i])]` still
  # refers to the complete column of the original `dt`, not to the rows of
  # the current group, so min() returns the column-wide minimum for every
  # bin instead of one minimum per bin.
  dat$GreaterThan <- unclass(dt %>% 
                               mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
                               group_by(dec) %>%
                               summarise(min(dt[, colnames(dt[i])])))[[2]]
  # NOTE: same issue as above; max() is taken over the whole column.
  dat$LessThan <- unclass(dt %>%
                            mutate(dec=ntile(dt[, colnames(dt[i])], n=2)) %>%
                            group_by(dec) %>% 
                            summarise(max(dt[, colnames(dt[i])])))[[2]]
  # Tag the rows with the name of the analysed column and append them.
  dat$Varname <- rep(colnames(dt[i]), nrow(dat))
  finaldata <- rbind(finaldata, dat)
}

但我无法得到相同的结果。

【问题讨论】:

    标签: r for-loop dplyr


    【解决方案1】:

    我们可以使用 map 遍历列名来做到这一点,这样无需中断管道链 (%>%) 即可完成

    library(tidyverse)

    # Summarise every predictor column (all columns except the `ch` flag).
    # For each one: split it into two equal-sized bins and report, per bin,
    #   N           - number of rows in the bin
    #   GreaterThan - lower bound of the bin (minimum value)
    #   LessThan    - upper bound of the bin (maximum value)
    #   n           - number of rows with ch == 1
    #   ch_perc     - n / N
    # map_df() row-binds the per-variable results into a single tibble.
    setdiff(names(dt), "ch") %>%
      map_df(function(var) {
        # Turn the column name (a string) into a symbol so it can be
        # unquoted with !! inside the dplyr verbs.
        col <- rlang::sym(var)
        dt %>%
          mutate(dec = ntile(!!col, n = 2)) %>%
          group_by(dec) %>%
          # Grouped mutate: n(), min() and max() only see the current bin.
          mutate(N = n(),
                 GreaterThan = min(!!col),
                 LessThan = max(!!col)) %>%
          ungroup() %>%
          count(dec, ch, N, GreaterThan, LessThan) %>%
          filter(ch == 1) %>%
          mutate(ch_perc = n / N,
                 Varname = var)
      })
    # A tibble: 6 x 8
    #    dec    ch     N GreaterThan LessThan     n ch_perc Varname
    #  <int> <int> <int>       <dbl>    <dbl> <int>   <dbl> <chr>
    #1     1     1     8          11       25     4   0.5   drop
    #2     2     1     7          25       40     5   0.714 drop
    #3     1     1     8           2       18     5   0.625 drop1
    #4     2     1     7          20       70     4   0.571 drop1
    #5     1     1     8           1       90     5   0.625 drop2
    #6     2     1     7          90      440     4   0.571 drop2
    

    OP 的for 循环中的问题是使用

    dt[, colnames(dt[i])]
    

    summarise 内。它会对整列的值应用 min/max,而不是按 group_by 的分组结构对各组内的列值分别应用函数

    我们可以像上面那样把列名转换为符号 (sym) 再求值,也可以使用 summarise_at

    # Build one summary chunk per predictor column (every column except the
    # last one, `ch`) and stack the chunks at the end.
    n_vars <- max(ncol(dt) - 1L, 0L)
    chunks <- vector("list", n_vars)
    for (i in seq_len(n_vars)) {
      var <- names(dt)[i]
      # Symbol for the current column; unquoted with !! it is looked up
      # inside the (grouped) data, not in the original `dt`.
      col <- rlang::sym(var)

      # Bin the current column once and reuse the result below.
      binned <- dt %>%
        mutate(dec = ntile(!!col, n = 2))

      # Because !!col is evaluated per group, min()/max() return the bounds
      # of each bin rather than the bounds of the whole column.
      bin_stats <- binned %>%
        group_by(dec) %>%
        summarise(N = n(),
                  GreaterThan = min(!!col),
                  LessThan = max(!!col))

      # Number of rows with ch == 1 in each bin.
      dat <- binned %>%
        count(ch, dec) %>%
        filter(ch == 1)

      # Attach the statistics by bin id so they stay aligned even when a
      # bin has no ch == 1 rows.
      pos <- match(dat$dec, bin_stats$dec)
      dat$N <- bin_stats$N[pos]
      dat$ch_perc <- dat$n / dat$N
      dat$GreaterThan <- bin_stats$GreaterThan[pos]
      dat$LessThan <- bin_stats$LessThan[pos]
      dat$Varname <- rep(var, nrow(dat))
      chunks[[i]] <- dat
    }
    finaldata <- bind_rows(chunks)

    finaldata
    # A tibble: 6 x 8
    #     ch   dec     n     N ch_perc GreaterThan LessThan Varname
    #  <int> <int> <int> <int>   <dbl>       <dbl>    <dbl> <chr>  
    #1     1     1     4     8   0.5            11       25 drop   
    #2     1     2     5     7   0.714          25       40 drop   
    #3     1     1     5     8   0.625           2       18 drop1  
    #4     1     2     4     7   0.571          20       70 drop1  
    #5     1     1     5     8   0.625           1       90 drop2  
    #6     1     2     4     7   0.571          90      440 drop2  
    

    数据

    # Example data: three integer predictors (drop, drop1, drop2) and a 0/1
    # flag `ch`, 15 rows in total.
    dt <- data.frame(
      drop = c(15L, 20L, 35L, 40L, 25L, 30L, 11L, 13L, 16L, 19L, 25L, 35L,
               40L, 17L, 30L),
      drop1 = c(14L, 15L, 16L, 17L, 18L, 18L, 20L, 36L, 70L, 40L, 45L, 30L,
                40L, 11L, 2L),
      drop2 = c(40L, 45L, 90L, 70L, 80L, 90L, 100L, 11L, 220L, 440L, 1L,
                70L, 230L, 170L, 160L),
      ch = c(1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L)
    )
    

    【讨论】:

    • 非常感谢@akrun。真的很有帮助
    猜你喜欢
    • 2018-09-21
    • 1970-01-01
    • 1970-01-01
    • 2018-08-17
    • 2019-09-18
    • 2016-10-06
    • 1970-01-01
    • 1970-01-01
    • 2021-11-27
    相关资源
    最近更新 更多