【问题标题】：R dplyr mutate error: factor variable with different classes within one variable（R dplyr mutate 错误：一个变量中具有不同类的因子变量）
【发布时间】：2021-08-30 02:27:27
【问题描述】：

问题

我正在尝试在以下代码中使用 ifelse 和 'dplyr' 'mutate' 来改变因子变量：

      sample <- 
    # The pipe after `sample` is required: without it `sample <- sample` ends
    # the statement and the grouped result below is never assigned.
    sample %>%
    # NOTE(review): set.name() is not defined in this snippet; presumably a
    # helper that renames 'fe.anesthesia.type' to 'xyz' -- confirm.
    set.name('fe.anesthesia.type', 'xyz')  %>% #  I am using it in a function
    group_by(xyz) %>% 
    mutate(
      # number of rows in the current xyz group
      count = n()
    ) 
 
 sample %>%  # one way (produces an error)
   mutate(xyz = ifelse(count <= 3, 'Other', xyz))
 
 # Error: Problem with `mutate()` input `xyz`.
 # x could not find function "ifesle"
 # ℹ Input `xyz` is `ifesle(count <= 3, "Other", xyz)`.
 # ℹ The error occurred in group 1: xyz = "General".
 
 sample %>%  # Another way (gives warnings)
    mutate(xyz = if(count <= 3) 'Other' else xyz) 
 
 # Warning messages:
 #   1: Problem with `mutate()` input `xyz`.
 # ℹ the condition has length > 1 and only the first element will be used
 # ℹ Input `xyz` is `if (count <= 3) "Other" else xyz`.
 # ℹ The error occurred in group 1: xyz = "General". 
 # 2: Problem with `mutate()` input `xyz`.
 # ℹ the condition has length > 1 and only the first element will be used
 # ℹ Input `xyz` is `if (count <= 3) "Other" else xyz`.
 # ℹ The error occurred in group 2: xyz = "General Hybrid". 
 # 3: Problem with `mutate()` input `xyz`.
 # ℹ the condition has length > 1 and only the first element will be used
 # ℹ Input `xyz` is `if (count <= 3) "Other" else xyz`.
 # ℹ The error occurred in group 4: xyz = "Regional".

我不确定我是否理解这个错误，因为看起来错误告诉我因子变量在一个变量中具有不同的类。我在这里遗漏了什么吗？

样本数据

sample <- 
structure(list(xyz = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 5L, 1L, 1L, 1L), .Label = c("General", 
"General Hybrid", "None", "Other", "Regional"), class = "factor"), 
    no = c(3836L, 22582L, 3489L, 56783L, 17160L, 59457L, 21015L, 
    73109L, 35800L, 11320L, 17015L, 14639L, 2750L, 36123L, 42128L, 
    20804L, 13297L, 61958L, 75037L, 35601L, 9326L, 8412L, 73942L, 
    32512L, 38306L, 58546L, 51127L, 55525L, 71626L, 61694L, 3156L, 
    70078L, 7481L, 54942L, 38175L, 40982L, 33034L, 77272L, 38722L, 
    4498L, 11944L, 55665L, 9888L, 77044L, 74116L, 4529L, 3065L, 
    65326L, 581L, 5188L, 74358L, 18586L, 59395L, 7695L, 7909L, 
    10592L, 57244L, 59440L, 24212L, 39889L, 48359L, 3453L, 77770L, 
    16089L, 53335L, 17686L, 44352L, 32138L, 33258L, 27062L, 62195L, 
    62217L, 28985L, 66840L, 47078L, 74212L, 708L, 52673L, 5672L, 
    63976L, 68841L, 63008L, 3226L, 64997L, 65305L, 62732L, 67075L, 
    65238L, 58870L, 75497L, 48009L, 74886L, 23146L, 63546L, 21846L, 
    69706L, 48273L, 5171L, 70013L, 40710L), count = c(93L, 93L, 
    93L, 93L, 93L, 93L, 93L, 93L, 2L, 93L, 93L, 93L, 93L, 93L, 
    93L, 93L, 93L, 4L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 
    93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 
    93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 
    1L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 
    93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 93L, 
    4L, 93L, 93L, 93L, 93L, 93L, 4L, 93L, 93L, 93L, 93L, 93L, 
    93L, 93L, 93L, 2L, 93L, 93L, 93L, 93L, 93L, 93L, 4L, 93L, 
    93L, 93L)), row.names = c(NA, -100L), groups = structure(list(
    xyz = structure(c(1L, 2L, 4L, 5L), .Label = c("General", 
    "General Hybrid", "None", "Other", "Regional"), class = "factor"), 
    .rows = structure(list(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
    10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 19L, 20L, 21L, 22L, 
    23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 
    35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 
    47L, 48L, 49L, 50L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 
    60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 
    72L, 73L, 74L, 76L, 77L, 78L, 79L, 80L, 82L, 83L, 84L, 85L, 
    86L, 87L, 88L, 89L, 91L, 92L, 93L, 94L, 95L, 96L, 98L, 99L, 
    100L), c(9L, 90L), 51L, c(18L, 75L, 81L, 97L)), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, 4L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

【问题讨论】：

试试mutate(xyz = if(count <= 30) 'Other' else xyz)
@YBS 它会创建警告信息...
@YBS 警告消息（针对因子变量中的每个组）：1：mutate() 输入问题xyz。 ℹ 条件长度 > 1 且仅使用第一个元素 ℹ 输入 xyz 为 if (count <= 30) "Other" else xyz。 ℹ 错误发生在第 1 组：xyz = "General"。 2：mutate() 输入 xyz 有问题。 ℹ 条件长度 > 1 且仅使用第一个元素 ℹ 输入 xyz 为 if (count <= 30) "Other" else xyz。 ℹ 第二组出现错误：xyz = "General Hybrid"......
尝试在代码的最后一行将mutate(xyz = ifelse(count <= 30, 'Other', xyz))更改为mutate(xyz = ifelse(count <= 30, 'Other', as.character(xyz)))
我会创建一个答案为什么！

标签： r dplyr

【解决方案1】：

如果所有行都满足 count <= 30，则不会出现错误（例如，您可以随意设置为 count <= 10）。

但是，如果需要用到 ifelse 中的 FALSE 参数 xyz（也就是存在 count > 30 的值时），就会出现类型不匹配，因此只需更改：

mutate(xyz = ifelse(count <= 30, 'Other', xyz))

到

mutate(xyz = ifelse(count <= 30, 'Other', as.character(xyz)))

【讨论】：

【解决方案2】：

我用gapminder 来展示它是如何工作的。

   library(dplyr)      # provides %>%, group_by(), mutate(), if_else(), ...
   library(gapminder)
   
   # Stack four row slices of gapminder into one data frame.
   d1 <- gapminder[1:20,]
   d2 <- gapminder[25:45,]
   d3 <- gapminder[49:65,]
   d4 <- gapminder[73:92,]
   d <- rbind(d1,d2,d3,d4)
   d %>% 
      mutate(xyz = country)  %>%  # I am using it in a function
      group_by(xyz) %>% select(country, xyz) %>% 
      mutate(
         count = n()              # rows per xyz group
      ) %>% 
      # Drop the grouping before rewriting xyz, so the grouping column itself
      # is not modified while it is still in use.
      ungroup() %>% 
      distinct() %>% 
      # if_else() is vectorised, so it works for any number of rows, whereas a
      # scalar `if (...) else` needs a length-one condition. as.character()
      # gives both branches the same type when xyz is a factor (as `country`
      # is in gapminder).
      mutate(xyz = if_else(count <= 10, 'Other', as.character(xyz)))

【讨论】：

非常感谢 @YBS 的解决方案，它实际上会给出一些警告，但我将其更改为 mutate(xyz = ifelse(count <= 3, 'Other', as.character(xyz)))（即加上 as.character），然后问题就解决了。我仍然感谢您的帮助。

【解决方案3】：

如果使用 {tidyverse} 中的 {forcats} 包会更容易，这样就不必 group_by(xyz) %>% mutate(count = n())。

library(tidyverse)

# Toy data: a single factor column whose levels occur with unequal
# frequencies (A x 50, B x 31, C x 30, D x 1).
df1 <- tibble(
  group = factor(rep(c("A", "B", "C", "D"), times = c(50, 31, 30, 1)))
)

# Level frequencies before lumping
summary(df1)

# group 
# A:50  
# B:31  
# C:30  
# D: 1 

# Collapse every level with n <= 30 into "Other":
# fct_lump_min() keeps only the levels that occur at least `min` times.
df2 <- mutate(df1, group = forcats::fct_lump_min(group, min = 31))


# Level frequencies after lumping
summary(df2)

#  group   
# A    :50  
# B    :31  
# Other:31  

# C and D have been merged into 'Other'

【讨论】：