【问题标题】:Remove outliers for every factor of a column in a dataframe删除数据框中列的每个因子的异常值
【发布时间】:2022-01-22 20:30:13
【问题描述】:

我有下面的数据框,我想查找并删除我的数据框 Area 列的每个因素的异常值。通常因素大于 2。因此,我想要新的数据框,而每个因素都没有异常值。

    subs<-structure(list(Sold_Pr = c(6500, 173000, 60000, 73000, 155000, 
105000, 140000, 39900, 73500, 46000, 99900, 180000, 164000, 120000, 
206000, 160000, 67400, 215000, 145000, 175000, 350000, 425000, 
435000, 490000, 545000, 585000), Area = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("411", "415", "981", 
"8001", "8002", "8003", "8004", "8005", "8006", "8007", "8008", 
"8009", "8010", "8011", "8012", "8013", "8014", "8015", "8016", 
"8017", "8018", "8019", "8020", "8021", "8022", "8023", "8024", 
"8025", "8026", "8027", "8028", "8029", "8030", "8031", "8034", 
"8035", "8037", "8038", "8039", "8040", "8041", "8042", "8043", 
"8044", "8045", "8046", "8047", "8048", "8049", "8050", "8051", 
"8052", "8053", "8055", "8056", "8057", "8058", "8059", "8060", 
"8061", "8062", "8063", "8064", "8065", "8066", "8067", "8068", 
"8069", "8070", "8071", "8072", "8073", "8074", "8075", "8076", 
"8077"), class = "factor"), Closed_Date = structure(c(18668, 
18933, 18716, 18740, 18639, 18845, 18708, 18676, 18733, 18695, 
18715, 18709, 18794, 18803, 18750, 18787, 18906, 18810, 18855, 
18870, 18626, 18786, 18808, 18864, 18961, 18914), class = "Date")), row.names = c(NA, 
-26L), class = c("tbl_df", "tbl", "data.frame"))

remove_outliers <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  y <- x
  y[x < (qnt[1] - H)] <- NA
  y[x > (qnt[2] + H)] <- NA
  y
}

【问题讨论】:

  • 您想删除异常值,还是像您的函数一样将它们替换为NA
  • 完全删除它们

标签: r


【解决方案1】:

您可以更改函数以返回逻辑值并在filtergroup_by 中使用它们-

library(dplyr)

remove_outliers <- function(x, na.rm = TRUE, ...) {
  qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
  H <- 1.5 * IQR(x, na.rm = na.rm)
  !(x < (qnt[1] - H) | x > (qnt[2] + H))
}

subs %>%
  group_by(Area) %>%
  filter(remove_outliers(Sold_Pr)) %>%
  ungroup

【讨论】:

    【解决方案2】:

    您可以使用ave 将分组添加到函数中。对于截止值的计算,您可以使用mapply

    remove_outliers <- \(x, g, na.rm=TRUE, ...) {
      q <- \(z) {
        qnt <- quantile(z, probs=c(.25, .75), na.rm=na.rm, ...)
        H <- 1.5 * IQR(z, na.rm=na.rm)
        r <- mapply(`+`, qnt, c(-H, H))
        z > r[1] & z < r[2]
      }
      return(as.logical(ave(x, as.character(g), FUN=q)))
    }
    
    subs[26, 1] <- 1e9  ## fabricate outlier
    

    给予

    with(subs, remove_outliers(Sold_Pr, Area))
    # [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
    # [14]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
    

    应用于子集

    subs[with(subs, remove_outliers(Sold_Pr, Area)), ]
    #    Sold_Pr Area Closed_Date
    # 1     6500  411  2021-02-10
    # 2   173000  411  2021-11-02
    # 3    60000  411  2021-03-30
    # 4    73000  411  2021-04-23
    # 5   155000  411  2021-01-12
    # 6   105000  411  2021-08-06
    # 7   140000  411  2021-03-22
    # 8    39900  411  2021-02-18
    # 9    73500  411  2021-04-16
    # 10   46000  411  2021-03-09
    # 11   99900  411  2021-03-29
    # 12  180000  411  2021-03-23
    # 13  164000  411  2021-06-16
    # 14  120000  411  2021-06-25
    # 15  206000  411  2021-05-03
    # 16  160000  411  2021-06-09
    # 17   67400  411  2021-10-06
    # 18  215000  411  2021-07-02
    # 19  145000  411  2021-08-16
    # 20  175000  411  2021-08-31
    # 21  350000  415  2020-12-30
    # 22  425000  415  2021-06-08
    # 23  435000  415  2021-06-30
    # 24  490000  415  2021-08-25
    # 25  545000  415  2021-11-30
    

    注意: R 版本 4.1.2 (2021-11-01)

    【讨论】:

      猜你喜欢
      • 2017-11-28
      • 2020-07-26
      • 1970-01-01
      • 2021-03-24
      • 2021-03-26
      • 2022-12-22
      • 2020-08-19
      • 1970-01-01
      • 2020-11-17
      相关资源
      最近更新 更多