下面逐步说明如何计算 output 列。
请注意,我还添加了一个没有任何事件大于 0 的用户(userID 为 101),这会使该用户的最大 rowID 为 NA,这种情况在后面通过一次额外的赋值来处理。
> ## example data: comma-separated text with a header row, one line per observation
> df <- read.csv(text = "rowID, userID, event
+ 1, 999, 0
+ 2, 999, 1
+ 3, 999, 0
+ 4, 100, 0
+ 5, 100, 1
+ 6, 100, 0
+ 7, 100, 1
+ 8, 100, 0
+ 9, 100, 0
+ 10, 101, 0
+ 11, 101, 0
+ 12, 102, 1
+ ")
>
> ## keep only the rows where an event occurred (event > 0)
> df1 <- df[df$event > 0, ]
> ## largest rowID carrying an event, per user; users without any event
> ## do not appear in this table (named max_by_user so that it does not
> ## mask base::max, which is passed as the aggregation function)
> max_by_user <- setNames(
+   aggregate(df1$rowID, by = list(df1$userID), FUN = max),
+   c("userID", "maxRowID")
+ )
> max_by_user
userID maxRowID
1 100 7
2 102 12
3 999 2
>
> ## left-join the per-user maximum onto every row of df;
> ## all.x = TRUE keeps users without events, whose maxRowID becomes NA
> mrg <- merge(x = df, y = max_by_user, by = "userID", all.x = TRUE)
> ## merge() returns the rows sorted by userID, so re-sort them by rowID
> mrg <- mrg[order(mrg$rowID), ]
> mrg
userID rowID event maxRowID
10 999 1 0 2
11 999 2 1 2
12 999 3 0 2
1 100 4 0 7
2 100 5 1 7
5 100 6 0 7
6 100 7 1 7
3 100 8 0 7
4 100 9 0 7
7 101 10 0 NA
8 101 11 0 NA
9 102 12 1 12
>
> ## calculate output: rows after the user's last event are "after",
> ## every other row is "before"
> output <- ifelse(mrg$rowID > mrg$maxRowID, "after", "before")
> ## consider also case with no event > 0: maxRowID is NA there, so the
> ## comparison above yields NA; such rows count as "before"
> output[is.na(output)] <- "before"
>
> ## add the output column to the original dataframe; output follows the
> ## row order of mrg, so align it on rowID instead of relying on df and
> ## mrg having the same row order
> df$output <- output[match(df$rowID, mrg$rowID)]
> df
rowID userID event output
1 1 999 0 before
2 2 999 1 before
3 3 999 0 after
4 4 100 0 before
5 5 100 1 before
6 6 100 0 before
7 7 100 1 before
8 8 100 0 after
9 9 100 0 after
10 10 101 0 before
11 11 101 0 before
12 12 102 1 before
>