我们先做一个 join,然后用 `arrange` 按 'id' 以及按指定 `levels` 顺序转换为 `factor` 的 'status' 排序,再按 'id' 分组,用 `slice` 取每组的第一行
library(dplyr)
# Pick one 'name' per id, preferring the status that comes first in
# IN < PENDING < REFER < OUT.
# The join key is spelled out so the join cannot silently widen to any other
# column the two tables happen to share.
left_join(df_2, df_1, by = 'id') %>%
  # Sort by id, then by status in the fixed priority order given by `levels`.
  arrange(id, factor(status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))) %>%
  group_by(id) %>%
  # After the sort, the first row of each id holds the preferred status.
  slice(1) %>%
  ungroup() %>%
  select(-status)
# A tibble: 5 x 2
# id name
# <dbl> <fct>
#1 1 a
#2 2 b
#3 3 c
#4 4 d1
#5 5 e
如果 'df_1' 中的 'id' 有重复,则先用 `distinct` 去重后的行执行 `left_join`,最后再与原始的 'df_1' 执行 `right_join`,以还原重复的行
# Variant for a 'df_1' that contains repeated ids.
df_1 <- data.frame(id = c(1, 1, 2, 3, 4, 5, 5))
# Join against the de-duplicated ids first so the per-id selection below works
# on one copy of each id; both joins name their key explicitly instead of
# relying on the implicit natural join.
left_join(df_2, distinct(df_1), by = 'id') %>%
  # Sort by id, then by status in the fixed priority order given by `levels`.
  arrange(id, factor(status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))) %>%
  group_by(id) %>%
  # After the sort, the first row of each id holds the preferred status.
  slice(1) %>%
  ungroup() %>%
  select(-status) %>%
  # Joining back to the original 'df_1' restores one output row per input row,
  # repeated ids included.
  right_join(df_1, by = 'id')
# A tibble: 7 x 2
# id name
#* <dbl> <fct>
#1 1 a
#2 1 a
#3 2 b
#4 3 c
#5 4 d1
#6 5 e
#7 5 e
或者我们可以先按 'id' 对 'df_1' 做 `nest`,得到一个 list 列,最后再 `unnest`
library(tidyr)
# Variant that folds repeated ids of 'df_1' into a list-column, selects one
# row per id, and then expands the list-column again.
df_1 %>%
  group_by(id) %>%
  # One row per id; the remaining columns are packed into the list-column
  # `data` (nest()'s default name).
  nest() %>%
  # Explicit key instead of relying on the implicit natural join.
  right_join(df_2, by = 'id') %>%
  # Sort by id, then by status in the fixed priority order given by `levels`.
  arrange(id, factor(status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))) %>%
  group_by(id) %>%
  # After the sort, the first row of each id holds the preferred status.
  slice(1) %>%
  ungroup() %>%
  select(-status) %>%
  # Name the list-column to expand; calling unnest() without specifying the
  # column is deprecated in current tidyr and emits a warning.
  unnest(data)
或使用data.table
library(data.table)
# Join 'df_2' to 'df_1' on id and evaluate j once per row of 'df_1'
# (by = .EACHI). Within each group the names are reordered by the position of
# their status in IN, PENDING, REFER, OUT and the first one is kept.
# Note: setDT() converts 'df_2' to a data.table by reference.
setDT(df_2)[
  df_1,
  .(name = name[order(match(status, c('IN', 'PENDING', 'REFER', 'OUT')))][1]),
  on = 'id',
  by = .EACHI
]
# id name
#1: 1 a
#2: 2 b
#3: 3 c
#4: 4 d1
#5: 5 e
或者使用 base R:先按 'id' 和指定的 'status' 顺序对 'df_2' 排序(`order`),然后为每个 'id' 取排序后第一次出现(即非 `duplicated`)的那一行的 'name'
# Sort 'df_2' by id, then by status priority (IN < PENDING < REFER < OUT), so
# that the first row of every id is the preferred one.
df_2n <- df_2[order(df_2$id, factor(df_2$status, levels = c('IN', 'PENDING', 'REFER', 'OUT'))),]
# Look each 'df_1' id up in the *sorted* data: match() returns the position of
# the first occurrence, i.e. the preferred row for that id. This yields exactly
# one value per row of 'df_1' regardless of the row order of 'df_2', repeated
# ids in 'df_1', or ids missing from 'df_2' (those get NA).
df_1$name <- df_2n$name[match(df_1$id, df_2n$id)]