我实际上会编写一个如下所示的函数:
# Expand a data frame so that every delimited piece in `splitCols` gets its
# own row, repeating the remaining ("key") columns alongside.
#
# Args:
#   indf:      A data.frame.
#   splitCols: Character vector naming the columns whose values are delimited
#              strings. All of them must split into the same number of pieces
#              on any given row.
#   sep:       Delimiter, matched literally (not as a regular expression).
#
# Returns:
#   A data.frame with the key columns first, followed by the split columns
#   (as character), and fresh sequential row names.
#
# Raises:
#   An error if `splitCols` is empty or if the split columns disagree on the
#   number of pieces in any row.
NewSplit <- function(indf, splitCols, sep = ",") {
  stopifnot(is.data.frame(indf), length(splitCols) > 0L)
  Keys <- setdiff(names(indf), splitCols)

  # as.character() is a no-op on character input, so coerce unconditionally
  # (this also handles factor columns).
  pieces <- lapply(indf[splitCols], function(x) {
    strsplit(as.character(x), split = sep, fixed = TRUE)
  })

  # Number of pieces per row, per split column.
  counts <- lapply(pieces, lengths, use.names = FALSE)
  Rep <- counts[[1L]]

  # Without this check data.frame() would silently recycle a shorter column
  # and misalign values across rows.
  agrees <- vapply(counts, identical, logical(1L), Rep)
  if (!all(agrees)) {
    stop(
      "split columns yield different numbers of pieces per row: ",
      paste(splitCols[!agrees], collapse = ", "),
      " (compared with ", splitCols[1L], ")",
      call. = FALSE
    )
  }

  # as.character() keeps a zero-length column when there is nothing to unlist
  # (unlist(list()) is NULL, which data.frame() would drop).
  flat <- lapply(pieces, function(p) as.character(unlist(p, use.names = FALSE)))

  # Positional indexing avoids row-name matching; drop = FALSE keeps a single
  # key column as a named data-frame column instead of a bare vector.
  cbind(indf[rep(seq_len(nrow(indf)), Rep), Keys, drop = FALSE],
        flat,
        row.names = NULL,
        stringsAsFactors = FALSE)
}
可以这样使用:
# Split the comma-delimited `start` and `end` columns of `dat` into rows.
NewSplit(indf = dat, splitCols = c("start", "end"), sep = ",")
#    header1 header2 id start end
# 1        A       F  1     1  99
# 2        A       F  1   100 199
# 3        A       F  1   200 299
# 4        B       G  1    11  33
# 5        B       G  1   222 444
# 6        C       H  1    10  72
# 7        D       I  1     7  10
# 8        D       I  1     8   9
# 9        D       I  1     9   8
# 10       D       I  1    10   7
# 11       D       I  1    11   6
# 12       E       J  1     1   3
其中“dat”定义为:
# Example input: two header columns plus comma-delimited `start`/`end` strings.
dat <- data.frame(
  header1 = LETTERS[1:5],
  header2 = LETTERS[6:10],
  start = c("1,100,200", "11,222", "10", "7,8,9,10,11", "1"),
  end = c("99,199,299", "33,444", "72", "10,9,8,7,6", "3")
)
# Running row counter within each (header1, header2) combination.
dat[["id"]] <- ave(
  rep(1, nrow(dat)),
  dat[["header1"]], dat[["header2"]],
  FUN = seq_along
)
这个函数实际上非常快，因为它所用的 base R 函数本身就很快。下面是在 5 万行数据上与“data.table”答案的比较。
将原始数据集扩展到 50K 行
# Stack 10,000 copies of `dat` to build the 50,000-row benchmark input.
dat2 <- do.call(rbind, replicate(10000, dat, simplify = FALSE))
# Recompute `id` as the running count within each (header1, header2) pair,
# because the replicated rows now repeat those pairs.
dat2$id <- with(dat2,
                ave(rep(1, nrow(dat2)),
                    header1, header2,
                    FUN = seq_along))
dim(dat2)
# [1] 50000 5
# Namespace-qualified so the conversion works even if data.table has not been
# attached with library().
dt <- data.table::as.data.table(dat2)
创建几个函数来测试(为了方便)
# data.table variant: split `start` and `end` on commas within each
# (header1, header2, id) group.
#
# Args:
#   dt: A data.table with columns header1, header2, id, start and end.
#       There is no default: the previous `dt = dt` default referred to
#       itself and failed with "promise already under evaluation" whenever
#       the argument was omitted.
#
# Returns:
#   The result of the grouped `[` call: the grouping columns plus the split
#   `start` and `end` values.
fun1 <- function(dt) {
  dt[, list(
    # `[[1]]` keeps only the split of the first row in each group, so this
    # relies on every (header1, header2, id) group holding exactly one row.
    start = strsplit(as.character(start), ",", fixed = TRUE)[[1]],
    end = strsplit(as.character(end), ",", fixed = TRUE)[[1]]),
    by = list(header1, header2, id)]
}
# Base R variant: delegate to NewSplit() on the `start` and `end` columns.
#
# Args:
#   df: A data.frame to expand; defaults to `dat2`.
#
# Returns:
#   Whatever NewSplit() returns for `df`.
fun2 <- function(df = dat2) {
  split_columns <- c("start", "end")
  NewSplit(df, split_columns, ",")
}
检查它们是否相等
# Compare the data.table result (converted to a data.frame) with the base R one.
all.equal(target = as.data.frame(fun1(dt)), current = fun2(dat2))
# [1] TRUE
比较时间
# Time the data.table variant on the 50,000-row input.
# The figures below are one recorded run, in seconds; they will vary by machine.
system.time(fun1(dt))
#    user  system elapsed
#   1.953   0.009   1.999
# Time the base R variant on the same data.
system.time(fun2(dat2))
#    user  system elapsed
#   0.286   0.001   0.288