在 tidyr::extract 中定义正则表达式参数答案

【问题标题】：Defining regex argument in tidyr::extract在 tidyr::extract 中定义正则表达式参数
【发布时间】：2018-05-25 13:50:08
【问题描述】：

我有一列数据，其中包含四个短语以逗号分隔的各种组合。我需要使用 tidyr 包中的 extract 函数把该列拆分为四个新列。同时，我希望保持列的顺序，让新列紧跟在原始列之后，而不是放在数据集的末尾。每个新列只应包含原始列中的一个特定短语，因此 separate 函数在这里不合适。extract 函数需要一个正则表达式参数，但我不知道该如何定义它。

该列包含以下短语的所有组合：

"Underweight",
"Healthy weight",
"Overweight",
"Very Overweight"

以下是您可以使用上述短语获得的可能组合：

"Underweight",
"Healthy weight",
"Overweight",
"Very Overweight",
"Underweight,Healthy weight",
"Underweight,Overweight",
"Underweight,Very Overweight",
"Healthy weight,Overweight",
"Healthy weight,Very Overweight",
"Overweight,Very Overweight",
"Underweight,Healthy weight,Overweight",
"Underweight,Healthy weight,Very Overweight",
"Underweight,Overweight,Very Overweight",
"Healthy weight,Overweight,Very Overweight",
"Underweight,Healthy weight,Overweight,Very Overweight"

这是从该列中提取的样本（请注意，您可以将变量从因子强制转换为字符）：

# Sample data as a structure() literal: a 50-row tibble with an ordered factor
# column routine_provided_target (15 levels, one per phrase combination; NA
# where nothing was recorded) and a numeric duration column.
structure(list(routine_provided_target = structure(c(15L, 15L, 
15L, NA, 15L, 10L, 15L, 15L, 10L, 15L, NA, NA, NA, NA, 13L, NA, 
10L, 15L, 15L, NA, NA, NA, 15L, NA, NA, NA, 13L, 15L, 9L, 15L, 
15L, NA, NA, NA, NA, 15L, NA, 13L, 4L, 15L, 15L, NA, NA, NA, 
NA, NA, 13L, NA, NA, NA), .Label = c("Underweight", "Healthy weight", 
"Overweight", "Very Overweight", "Underweight,Healthy weight", 
"Underweight,Overweight", "Underweight,Very Overweight", "Healthy weight,Overweight", 
"Healthy weight,Very Overweight", "Overweight,Very Overweight", 
"Underweight,Healthy weight,Overweight", "Underweight,Healthy weight,Very Overweight", 
"Underweight,Overweight,Very Overweight", "Healthy weight,Overweight,Very Overweight", 
"Underweight,Healthy weight,Overweight,Very Overweight"), class = c("ordered", 
"factor")), duration = c(27.6666666666667, 25.45, 16.1166666666667, 
16.85, 17.9333333333333, 34.05, 24.7666666666667, 32.6166666666667, 
21.55, 18.4833333333333, 0.55, 3.23333333333333, 24.3166666666667, 
0.483333333333333, 47.1833333333333, 1.43333333333333, 151.933333333333, 
33.4166666666667, 76.6833333333333, 1.1, 0.65, 1.83333333333333, 
23.1166666666667, 17.8333333333333, 2.61666666666667, 0.683333333333333, 
14.6833333333333, 8.8, 18.8, 12.8833333333333, 17.6833333333333, 
22.8166666666667, 10, 14.2666666666667, 75.7166666666667, 51.4333333333333, 
27.5833333333333, 30.6833333333333, 36.8666666666667, 23.25, 
155.716666666667, 1.73333333333333, 1.41666666666667, 0.233333333333333, 
1.85, 1.35, 25.3666666666667, 0.816666666666667, 6.71666666666667, 
0.75)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))

这是尝试执行我需要的代码；然而，正则表达式参数不能正常工作，它做了一些分离但它是一团糟，它应该包含整个短语：

library(tidyr)
library(dplyr)

# The question's attempt, kept as the non-working version being asked about.
# `[a-zA-Z]+` matches neither the comma separator nor the space inside
# "Healthy weight", so all four capture groups land inside the first word and
# the new columns receive fragments of it instead of whole phrases.
# remove = FALSE keeps routine_provided_target; the new columns are inserted
# directly after it.
sample %>%
  extract(
    col = routine_provided_target,
    into = c(
      "routine_provided_uw",
      "routine_provided_hw",
      "routine_provided_ow",
      "routine_provided_vow"
    ),
    regex = "([a-zA-Z]+)([a-zA-Z]+)([a-zA-Z]+)([a-zA-Z]+)",
    remove = FALSE
  )

所需的输出接近于以下内容：

# First six rows of the extract() result. The column placement is the desired
# one (the four new columns sit between routine_provided_target and duration),
# but the values are fragments ("Underwei", "g", "h", "t") rather than whole
# phrases.
structure(list(routine_provided_target = structure(c(15L, 15L, 
15L, NA, 15L, 10L), .Label = c("Underweight", "Healthy weight", 
"Overweight", "Very Overweight", "Underweight,Healthy weight", 
"Underweight,Overweight", "Underweight,Very Overweight", "Healthy weight,Overweight", 
"Healthy weight,Very Overweight", "Overweight,Very Overweight", 
"Underweight,Healthy weight,Overweight", "Underweight,Healthy weight,Very Overweight", 
"Underweight,Overweight,Very Overweight", "Healthy weight,Overweight,Very Overweight", 
"Underweight,Healthy weight,Overweight,Very Overweight"), class = c("ordered", 
"factor")), routine_provided_uw = c("Underwei", "Underwei", "Underwei", 
NA, "Underwei", "Overwei"), routine_provided_hw = c("g", "g", 
"g", NA, "g", "g"), routine_provided_ow = c("h", "h", "h", NA, 
"h", "h"), routine_provided_vow = c("t", "t", "t", NA, "t", "t"
), duration = c(27.6666666666667, 25.45, 16.1166666666667, 16.85, 
17.9333333333333, 34.05)), row.names = c(NA, -6L), class = c("tbl_df", 
"tbl", "data.frame"))

在输出中，您可以看到提取函数在原始列之后和数据集中已经存在的任何列之前创建新列，列的顺序被保留。

感谢您就如何实现这一目标提出任何建议。除了正则表达式行之外的整个代码都应该可以工作。

【问题讨论】：

它不起作用，是因为正则表达式中没有指定 , 分隔符。另外，短语中的单词之间还有空格。
@akrun 感谢您的评论，是的，我知道这一点，但我不知道正确的正则表达式来考虑这一点。

标签： r regex tidyr

【解决方案1】：

我假设您想将特定条件提取到特定列中（例如：单词Underweight 应该提取到routine_provided_uw 中）。使用带有特定正则表达式的mutate 来捕获每个条件是实现此目的的最佳方法。

首先，让我们在您的数据中添加一个duration 列，以展示如何以您想要的方式获得列的顺序：

# Set `duration` to the row index (1..n) so the column-ordering step can be
# demonstrated, then list the resulting column names.
sample <- mutate(sample, duration = seq_len(nrow(sample)))
colnames(sample)
[1] "routine_provided_target" "duration"

现在我们将routine_provided_target 中的数据提取到新列中，并使用select 删除routine_provided_target 列并将duration 移动到末尾。

# Pull each phrase into its own column: every pattern yields the phrase when it
# occurs in routine_provided_target and NA otherwise.
# The negative lookbehind on "Overweight" stops it from matching the tail of
# "Very Overweight"; without it, rows such as "Very Overweight" or
# "Healthy weight,Very Overweight" would wrongly fill routine_provided_ow.
# str_extract() is called by namespace so the snippet does not depend on
# library(stringr) having been attached.
sample %>%
  mutate(
    routine_provided_uw = stringr::str_extract(
      routine_provided_target, "Underweight"
    ),
    routine_provided_hw = stringr::str_extract(
      routine_provided_target, "Healthy weight"
    ),
    routine_provided_ow = stringr::str_extract(
      routine_provided_target, "(?<!Very )Overweight"
    ),
    routine_provided_vow = stringr::str_extract(
      routine_provided_target, "Very Overweight"
    )
  ) %>%
  # Drop the source column and move duration to the last position.
  select(-c(routine_provided_target, duration), duration)

# A tibble: 50 x 5
   routine_provided_uw routine_provided_hw routine_provided_ow routine_provided_vow duration
   <chr>               <chr>               <chr>               <chr>                   <int>
 1 Underweight         Healthy weight      Overweight          Very Overweight             1
 2 Underweight         Healthy weight      Overweight          Very Overweight             2
 3 Underweight         Healthy weight      Overweight          Very Overweight             3
 4 NA                  NA                  NA                  NA                          4
 5 Underweight         Healthy weight      Overweight          Very Overweight             5
 6 NA                  NA                  Overweight          Very Overweight             6
 7 Underweight         Healthy weight      Overweight          Very Overweight             7
 8 Underweight         Healthy weight      Overweight          Very Overweight             8
 9 NA                  NA                  Overweight          Very Overweight             9
10 Underweight         Healthy weight      Overweight          Very Overweight            10
# ... with 40 more rows

如果您想删除 routine_provided_target 列，您可以使用 transmute 删除除该调用中创建的变量之外的所有其他变量，或使用 select 专门删除该列：

select(-routine_provided_target)

【讨论】：

您好，谢谢。是的，我希望的正是您所说的“将特定标准提取到特定列中”。我了解这里的 mutate 方法，但有意没有使用它，因为我更喜欢 extract 方法：它会把提取出的内容放在紧跟原始列之后的新列中，并且支持 remove = FALSE 参数。遗憾的是，我确实在意顺序，所以不能使用 separate。
你能举一个你想要的输出的例子吗？我不明白您想要的输出与此方法提供的输出有何不同。
谢谢。我添加了一个输出（来自extract 函数）我想在这里实现。完整的数据集很长，我还希望保留列的顺序，我没有选择 mutate 函数，因为它们将新列放在数据集的末尾，而 extract 在原始列之后。
感谢transmute的建议。

【解决方案2】：

您可以使用 tidyr 的三个函数 separate、gather 和 spread，以最少的硬编码完成此操作。先按逗号把 routine_provided_target 拆分成 4 个临时列，再把它们收集（gather）成长格式的数据框。然后创建将作为新列名的标签——我用 forcats::fct_recode 完成了这一步，但您也可以用其他方式方便地重新标记这个向量。最后用这个新列把数据转换回宽格式，缺失的观测值用 NA 填充。

library(tidyverse)

df %>%
  # Split the comma-separated phrases into up to four positional columns.
  # fill = "right" pads rows that hold fewer than four phrases with NA on the
  # right -- the same result as the default, minus the "Expected 4 pieces"
  # warning.
  separate(
    routine_provided_target,
    into = c("w1", "w2", "w3", "w4"),
    sep = ",",
    fill = "right"
  ) %>%
  # Stack w1..w4 into a single `weight` column; the positional label in `key`
  # is not needed afterwards, and the NA padding is dropped.
  gather(key = key, value = weight, -duration) %>%
  select(-key) %>%
  filter(!is.na(weight)) %>%
  # Derive the destination column name for each phrase,
  # e.g. "Underweight" -> "routine_provided_uw".
  mutate(
    provided = as.factor(weight) %>%
      fct_recode(
        uw = "Underweight",
        hw = "Healthy weight",
        ow = "Overweight",
        vow = "Very Overweight"
      ) %>%
      sprintf("routine_provided_%s", .)
  ) %>%
  # Back to wide form: one column per phrase. duration is the only remaining
  # identifier column, so rows are matched up by their duration value.
  spread(key = provided, value = weight)
#> # A tibble: 25 x 5
#>    duration routine_provided_hw routine_provided_ow routine_provided_uw
#>       <dbl> <chr>               <chr>               <chr>              
#>  1      8.8 Healthy weight      Overweight          Underweight        
#>  2     12.9 Healthy weight      Overweight          Underweight        
#>  3     14.7 <NA>                Overweight          Underweight        
#>  4     16.1 Healthy weight      Overweight          Underweight        
#>  5     17.7 Healthy weight      Overweight          Underweight        
#>  6     17.9 Healthy weight      Overweight          Underweight        
#>  7     18.5 Healthy weight      Overweight          Underweight        
#>  8     18.8 Healthy weight      <NA>                <NA>               
#>  9     21.6 <NA>                Overweight          <NA>               
#> 10     23.1 Healthy weight      Overweight          Underweight        
#> # ... with 15 more rows, and 1 more variable: routine_provided_vow <chr>

编辑：关于如何生成 routine_provided_* 列名，下面用一个示例向量 weights 逐步演示。在上面的 mutate 调用中，我对 weight 的因子级别进行了重新编码，得到各级别的缩写——例如 "Underweight" 变为 "uw"。

# `weights` is not defined in this snippet -- presumably an example character
# vector of the four phrases (TODO confirm).
# Recode each full phrase to its abbreviation; the pairs are written as
# new_level = "old_level".
as.factor(weights) %>% 
  fct_recode(uw = "Underweight", hw = "Healthy weight", ow = "Overweight", vow = "Very Overweight")
#>  [1] vow ow  vow hw  hw  hw  uw  ow  uw  uw 
#> Levels: hw ow uw vow

然后我通过管道把它传给 sprintf("routine_provided_%s", .)，其中 . 是占位符，代表正在处理的向量。这样每个级别的字符串都会替换 %s，例如得到 routine_provided_uw。

# Same recoding as before, then each abbreviation is substituted for %s to
# build the final column name; `.` stands for the recoded vector coming down
# the pipe.
as.factor(weights) %>% 
  fct_recode(uw = "Underweight", hw = "Healthy weight", ow = "Overweight", vow = "Very Overweight") %>%
  sprintf("routine_provided_%s", .)
#>  [1] "routine_provided_vow" "routine_provided_ow"  "routine_provided_vow"
#>  [4] "routine_provided_hw"  "routine_provided_hw"  "routine_provided_hw" 
#>  [7] "routine_provided_uw"  "routine_provided_ow"  "routine_provided_uw" 
#> [10] "routine_provided_uw"

【讨论】：

谢谢。我添加了 num_range(prefix = "w", range = 1:4)，因为我的数据中还有许多未包含在示例里的其他列。我之前不知道可以像这样在 mutate 函数内部使用 %>%！能否请您简要解释一下 sprintf 函数在这里是如何工作的？