两种可能的解决方案:
1) 使用基础 R:
# For every id, flag each 'base' row whose previous and next rows are both
# 'plan' (group edges are padded with 'base', so edge rows are never flagged).
# ave() hands the flags back as strings here, hence the comparison against
# 'FALSE' to obtain the rows to keep.
idx <- ave(
  df$or,
  df$id,
  FUN = function(v) {
    before <- c('base', head(v, -1))
    after <- c(tail(v, -1), 'base')
    v == 'base' & before == 'plan' & after == 'plan'
  }
) == 'FALSE'
df[idx,]
给出:
id fd rem or
1 1 101 100 base
2 1 102 120 base
3 1 103 120 plan
5 1 105 140 plan
6 1 106 150 base
7 2 101 200 plan
9 2 103 220 plan
11 2 105 300 plan
12 2 106 310 base
2) 使用 data.table 包:
library(data.table)
setDT(df)
# Per id, collect the row numbers (.I) of all rows except the 'base' rows whose
# neighbouring rows on both sides are 'plan'; shift() pads the group edges
# with 'base', so the first and last row of a group are always kept.
idx <- df[, {
  prev_or <- shift(or, fill = 'base')
  next_or <- shift(or, fill = 'base', type = 'lead')
  .I[!(or == 'base' & prev_or == 'plan' & next_or == 'plan')]
}, by = id]$V1
df[idx]
给出:
id fd rem or
1: 1 101 100 base
2: 1 102 120 base
3: 1 103 120 plan
4: 1 105 140 plan
5: 1 106 150 base
6: 2 101 200 plan
7: 2 103 220 plan
8: 2 105 300 plan
9: 2 106 310 base
或者用一条语句完成:
library(data.table)
# Same filter as a single expression: a row is kept when it is not 'base', or
# when at least one of its neighbours within the id is not 'plan'.
setDT(df)[
  df[,
     .I[or != 'base' |
          shift(or, fill = 'base') != 'plan' |
          shift(or, fill = 'base', type = 'lead') != 'plan'],
     id]$V1
]
针对评论:您可以使用 rle 函数来检测位于 'plan' 行之间的多个连续 'base' 行,如下所示(在 base R 中):
# build a second example dataset in which rows 4 and 8 each appear twice
df2 <- df[c(1:4, 4:8, 8:12), ]
# the new example dataset:
> df2
id fd rem or
1 1 101 100 base
2 1 102 120 base
3 1 103 120 plan
4 1 104 140 base
4.1 1 104 140 base
5 1 105 140 plan
6 1 106 150 base
7 2 101 200 plan
8 2 102 220 base
8.1 2 102 220 base
9 2 103 220 plan
10 2 104 250 base
11 2 105 300 plan
12 2 106 310 base
# Run-level filter: returns a logical vector as long as x that is FALSE for
# every element of a run of 'base' values lying directly between a run of
# 'plan' values on each side, and TRUE for everything else.
#   x: vector of labels (here 'base' / 'plan'). The first and last runs are
#      treated as if they were bordered by 'base', so they are always kept.
# NA labels match neither 'base' nor 'plan': NA elements are kept, an NA
# neighbour does not cause a 'base' run to be dropped, and the result never
# contains NA (so it is safe to use as a row index).
f <- function(x) {
  rl <- rle(x)
  prev_run <- c('base', head(rl$values, -1))
  next_run <- c(tail(rl$values, -1), 'base')
  # %in% instead of == so that NA labels give FALSE rather than NA
  enclosed <- rl$values %in% 'base' &
    prev_run %in% 'plan' &
    next_run %in% 'plan'
  rl$values <- !enclosed
  inverse.rle(rl)
}
# evaluate f within each id-group; ave() returns the flags as text here, so
# compare against 'TRUE' to recover the logical index
idx2 <- ave(df2$or, df2$id, FUN = f) == 'TRUE'
# keep only the rows whose flag is TRUE
df2[idx2, ]
给出:
> df2[idx2,]
id fd rem or
1 1 101 100 base
2 1 102 120 base
3 1 103 120 plan
5 1 105 140 plan
6 1 106 150 base
7 2 101 200 plan
9 2 103 220 plan
11 2 105 300 plan
12 2 106 310 base
base R 中的另一个选项(受 @Frank 在评论中给出的 data.table 建议的启发):
# Returns a logical vector as long as x: FALSE for every 'base' element located
# strictly between the first and the last 'plan' element, TRUE otherwise.
#   x: vector of labels (here 'base' / 'plan').
f2 <- function(x) {
  plan_pos <- which(x == 'plan')
  base_pos <- which(x == 'base')
  after_first <- base_pos > head(plan_pos, 1)
  before_last <- base_pos < tail(plan_pos, 1)
  # positions of the 'base' elements enclosed by 'plan' elements
  enclosed <- base_pos[after_first & before_last]
  # TRUE for every position that is not among the enclosed ones
  is.na(match(seq_along(x), enclosed))
}
# Evaluate f2 within each id. ave() writes each group's flags back to the
# positions of that group's rows, so the index lines up with df2 even when the
# rows are not ordered by id (concatenating per-group results with
# unlist(by(...)) is only aligned when df2 is sorted by id). ave() returns the
# flags as text because df2$or is character, hence as.logical().
idx3 <- as.logical(ave(df2$or, df2$id, FUN = f2))
df2[idx3,]
使用data.table,您可以遵循@Frank 的建议:
# Add a per-id logical column `keep`, then return only the rows where it is
# TRUE. Within each id: isp marks the 'plan' rows, wp holds their positions and
# r numbers the rows of the group; a row is kept when it is a 'plan' row, or
# when it lies before the first or after the last 'plan' row of the group.
# NOTE(review): `:=` presumably adds `keep` to df2 by reference, so the column
# remains on df2 afterwards — confirm. A group without any 'plan' row makes wp
# empty; verify the behaviour for that case.
setDT(df2)
df2[, keep := {isp = or == "plan"; wp = which(isp); r = 1:.N; isp | r < first(wp) | r > last(wp)}, by = id
][!!keep]
使用的数据:
# Example data: two ids with six rows each; `or` labels every row as
# "base" or "plan".
df <- data.frame(
  id = rep(c(1, 2), each = 6),
  fd = rep(c(101, 102, 103, 104, 105, 106), times = 2),
  rem = c(100, 120, 120, 140, 140, 150, 200, 220, 220, 250, 300, 310),
  or = c("base", "base", "plan", "base", "plan", "base",
         "plan", "base", "plan", "base", "plan", "base"),
  stringsAsFactors = FALSE
)