【问题标题】:select group before certain observations separated by grouping var in R with NA control在某些观察之前选择组,通过将 R 中的 var 与 NA 控制进行分组来分隔
【发布时间】:2019-01-30 23:24:48
【问题描述】:

我的样本。

 data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", 
"y"), class = "factor"), x1 = c(14L, 15L, 36L, 0L, 0L, 0L, 53L, 
10L, 39L, 27L, 67L, 25L, 19L, 49L, 53L, 64L, 61L, 12L, 75L, 34L, 
88L, 43L, 85L, 93L, 44L, 31L, 37L, 90L, 66L, 39L, 59L, 96L, 41L, 
23L, 20L, 26L, 69L, 28L, 35L, 96L, 87L, 82L, 70L, 68L, 26L, 12L, 
58L, 18L, 76L, 93L, 3L, 31L), group = structure(c(2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("female", "male"), class = "factor")), .Names = c("add", 
"x1", "group"), class = "data.frame", row.names = c(NA, -52L))

在此数据中,有组变量(性别(男性和女性)我需要获得所有男性的统计平均值和 25 个百分位,这些男性在女性之前。男性在女性之后,我不碰。还有女性我不触碰。 这是从添加列按组xy 拆分的分析。 如果男性在女性之后 x1 > 25 个百分位,这是我们在女性之前为男性计算的,那么这个值必须替换为男性在女性之前的平均值“我们不接触的女性类别。

AntoniosK的解决方案非常好

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                           # for each add do the below...
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                            # for each add update x1 values....
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1)) %>%
  ungroup() %>%
  select(-group2) %>%
  data.frame()

但现在我想用 x1 将 0 值替换为 Na。

data$x1[data$x1 == 0] <- NA

之后,当我取消脚本时,我得到 错误

mutate_impl(.data, dots) 中的错误:评估错误:缺失 如果 'na.rm' 为 FALSE,则不允许使用 NaN。

怎么办,该脚本通过了 NA 并且只使用 int 值?

编辑

data=structure(list(add = c(11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 
                       11202L, 11202L, 11202L, 11202L, 11202L, 11202L, 11202L), x1 = c(NA, 
                                                                                       2L, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, 1L, 1L, NA, NA, NA, 
                                                                                       NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, 
                                                                                       NA, NA, NA, NA, NA, NA, NA, 3L, NA, NA, NA, NA, 1L, 1L, NA, NA, 
                                                                                       NA, NA, NA), group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 
                                                                                                                        2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("female", 
                                                                                                                                                                                        "male"), class = "factor")), .Names = c("add", "x1", "group"), class = "data.frame", row.names = c(NA, 
                                                                                                                                                                                                                                                                                           -52L))

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                          
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                           
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1),
         x1 = ifelse(x1==0, NA, x1)) %>%  # new code added
  ungroup() %>%
  select(-group2) %>%
  data.frame()

编辑2

代码结果

add x1  group   MEAN    Q25
x   14.00000    male    23.72727    5.0
x   15.00000    male    23.72727    5.0
x   36.00000    male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   0.00000 male    23.72727    5.0
x   53.00000    male    23.72727    5.0
x   10.00000    male    23.72727    5.0
x   39.00000    male    23.72727    5.0
x   27.00000    male    23.72727    5.0
x   67.00000    male    23.72727    5.0
x   25.00000    female  NaN NA
x   19.00000    female  NaN NA
x   49.00000    female  NaN NA
x   53.00000    female  NaN NA
x   64.00000    female  NaN NA
x   61.00000    female  NaN NA
x   12.00000    female  NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA
x   23.72727    male    NaN NA

之后

add x1     group
x   94.90   male

女后前4男的总和=94.90

【问题讨论】:

  • 您的列名之一是“此分析如何按组拆分?组”。我在data 中看不到任何列ReturnCount。当我运行该代码时,我在任何地方都看不到任何零值。
  • ReturnCount 是 x1。
  • 我认为如果您发布一个 x1 为零的示例会更容易,这样我们就可以检查您收到该错误的原因。
  • @AntoniosK,我用零编辑了帖子。请检查
  • @AntoniosK,我提供了。

标签: r dplyr data.table


【解决方案1】:

我添加了一段可以解决您的问题的代码,并简要说明了错误。

更新代码

data=structure(list(add = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("x", 
"y"), class = "factor"), x1 = c(14L, 15L, 36L, 0L, 0L, 0L, 53L, 
10L, 39L, 27L, 67L, 25L, 19L, 49L, 53L, 64L, 61L, 12L, 75L, 34L, 
88L, 43L, 85L, 93L, 44L, 31L, 37L, 90L, 66L, 39L, 59L, 96L, 41L, 
23L, 20L, 26L, 69L, 28L, 35L, 96L, 87L, 82L, 70L, 68L, 26L, 12L, 
58L, 18L, 76L, 93L, 3L, 31L), group = structure(c(2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("female", "male"), class = "factor")), .Names = c("add", 
"x1", "group"), class = "data.frame", row.names = c(NA, -52L))

library(tidyverse)
library(data.table)

data %>%  
  group_by(add) %>%                                          
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1]),               
         Q25 = quantile(x1[group=="male" & group2==1], 0.25)) %>%
  group_by(add) %>%                                           
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1),
         x1 = ifelse(x1==0, NA, x1)) %>%  # new code added
  ungroup() %>%
  select(-group2) %>%
  data.frame()

错误说明

您必须运行前一部分代码,最后只需更新x1 列。您收到该错误是因为 NA 值破坏了您需要执行的 meanquantile 计算。

另一种方法是在开始时更新x1,然后使用na.rm=T 进行计算。

对于您提到的新案例,您从NA 值开始,x1 试试这个:

data %>%  
  group_by(add) %>%                                          
  mutate(group2 = rleid(group)) %>% 
  group_by(add, group, group2) %>%
  mutate(MEAN = mean(x1[group=="male" & group2==1], na.rm = T),      ## extra code here ##    
         Q25 = quantile(x1[group=="male" & group2==1], 0.25, na.rm = T)) %>%  ## extra code here ##
  group_by(add) %>%                                           
  mutate(x1 = ifelse(group=="male" & group2==3 & x1 > unique(Q25[!is.na(Q25)]), unique(MEAN[!is.na(MEAN)]), x1))%>%
  ungroup() %>%
  select(-group2) %>%
  data.frame()

对于您提到的新案例(编辑2),首先将之前代码的输出保存为data2

data2 = data %>% ...

然后运行这个:

data2 %>%
  group_by(add) %>%                           # for each add value                      
  mutate(group2 = rleid(group)) %>%           # created group2
  filter(group=="male" & group2==3) %>%       # keep only male after female
  summarise(SUM = sum(x1[row_number() <= 4])) # get sum of x1 for first 4 rows

# # A tibble: 2 x 2
#   add     SUM
#   <fct> <dbl>
# 1 x      94.9
# 2 y     107.

【讨论】:

  • AntoniosK,您的代码是否适用于我的数据?我收到评估错误:如果 'na.rm' 为 FALSE,则不允许缺少值和 NaN。
  • 是的,它应该运行,我现在已经发布了整个事情。启动一个新的 R 会话并运行上面的代码。它应该运行。然后检查您的整个数据集。
  • 我用新数据编辑了帖子。我在 mutate_impl(.data, dots) 中收到错误错误:评估错误:如果 'na.rm' 为 FALSE,则不允许缺少值和 NaN。你能检查一下编辑过的数据并解释为什么它不起作用吗?
  • x1 列中没有零,而是 NAs。与您发布的上一个示例相比,此数据集具有完全不同的理念。之前,我们应用了该过程,最后,我们将零更改为NA。现在你从一开始就有NAs。您应该尝试我描述的替代方法。使用na.rm=T。你知道怎么做吗?
  • AntoniosK,不,但如果你告诉我如何使用替代方案,我将不胜感激
猜你喜欢
  • 1970-01-01
  • 1970-01-01
  • 2022-01-05
  • 1970-01-01
  • 2018-03-28
  • 2017-10-10
  • 2017-06-29
  • 2017-05-24
  • 1970-01-01
相关资源
最近更新 更多