一个基准(将 sub_a 和 sub_b 创建添加到 Sotos 和 Heikki 答案中,这样每个人都从相同的初始向量开始:a 的 800 次观察和 b 的 1000 次观察)。
运行基准测试:
library(dplyr)
library(data.table)
library(microbenchmark)
a <- sample(100:999, 8e3, TRUE)
b <- sample(100:999, 1e4, TRUE)
microbenchmark(Jaap1(a,b), Jaap2(a,b), Tensi(a,b), Heikki(a,b), Sotos(a,b),
Matt_base(a,b), Matt_dplyr(a,b), Docendo(a,b),
zx8754(a,b), zx8754for(a,b), Frank(a,b),
times = 50, unit = 'relative')
给予:
Unit: relative
expr min lq mean median uq max neval cld
Jaap1(a, b) 19.668483 19.316194 17.2373827 18.921573 18.829932 7.8792713 50 d
Jaap2(a, b) 4.253151 4.365420 4.0557281 4.309247 4.398149 2.2149125 50 b
Tensi(a, b) 241.682216 238.197815 212.2844582 233.473689 233.367619 93.3562331 50 h
Heikki(a, b) 114.895836 113.754054 101.2781709 111.637570 110.541708 44.9437229 50 g
Sotos(a, b) 27.598767 28.725937 25.7469518 28.534011 28.638413 11.6995642 50 e
Matt_base(a, b) 19.159883 18.834180 16.8853660 18.513498 18.416194 7.8329323 50 d
Matt_dplyr(a, b) 1.108230 1.106051 1.0203776 1.102078 1.098476 1.0131898 50 a
Docendo(a, b) 1.000000 1.000000 1.0000000 1.000000 1.000000 1.0000000 50 a
zx8754(a, b) 11.601730 12.986763 11.7859245 13.054720 13.234842 5.6944437 50 c
zx8754for(a, b) 90.448168 92.906445 82.4905438 91.092609 90.160010 36.1277145 50 f
Frank(a, b) 1.070775 1.070202 0.9621499 1.063978 1.055540 0.4459918 50 a
用到的功能:
Jaap1 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
df <- merge(stack(a), stack(b), by = 'ind')
paste0(substr(df$values.x,1,1), df$values.y)
}
Jaap2 <- function(a,b) {
a <- setNames(a, substr(a,2,3))
b <- setNames(b, substr(b,1,2))
l <- lapply(names(a), function(x) b[x == names(b)])
paste0(substr(rep(a, lengths(l)),1,1), unlist(l))
}
Tensi <- function(a,b) {
unlist(sapply(a,function(x) {regex <- paste0(substr(x,2,3),'(\\d)'); z <- sub(regex,paste0(x,"\\1"),b); z[!b %in% z] } ))
}
Heikki <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
result <- c()
for (ai in a) {
sub_ai <- substr(ai,2,3)
if (sub_ai %in% sub_a) {
b_match <- (sub_b == sub_ai)
result <- c(result,paste0(ai,substr(b[b_match],3,4)))
}
}
result
}
Sotos <- function(a,b) {
sub_a <- substr(a, 2, 3)
sub_b <- substr(b, 1, 2)
d1 <- expand.grid(a, b, stringsAsFactors = FALSE)
d2 <- expand.grid(sub_a, sub_b, stringsAsFactors = FALSE)
i1 <- d2$Var1 == d2$Var2
d1 <- d1[i1,]
d1$Var1 <- substr(d1$Var1, 1, 1)
do.call(paste0, d1)
}
Matt_base <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1$first_a = substr(a1$a, 1, 1)
a1$last_a = substr(a1$a, 2, 3)
b1$first_b = substr(b1$b, 1, 2)
b1$last_b = substr(b1$b, 3, 3)
c1 <- merge(a1, b1, by.x = "last_a", by.y = "first_b")
results <- paste0(c1$a, c1$last_b)
}
Matt_dplyr <- function(a,b) {
a1 <- data.frame(a)
b1 <- data.frame(b)
a1 <- a1 %>% mutate(first_a = substr(a, 1, 1), last_a = substr(a, 2, 3))
b1 <- b1 %>% mutate(first_b = substr(b, 1, 2), last_b = substr(b, 3, 3))
c1 <- inner_join(a1, b1, by = c("last_a" = "first_b"))
results <- paste0(c1$a, c1$last_b)
}
Docendo <- function(a, b) {
split_a <- split(a, substr(a, 2, 3))
split_b <- split(substr(b, 3, 3), substr(b, 1, 2))
idx <- intersect(names(split_a), names(split_b))
stopifnot(length(idx) > 0)
unlist(Map(function(x,y) outer(x, y, paste0), split_a[idx], split_b[idx]),
use.names = FALSE)
}
zx8754 <- function(a, b) {
unlist(sapply(a, function(i) i * 10 + (b %% 10)[i %% 100 == b %/% 10]))
}
zx8754for <- function(a, b) {
res <- integer()
for(i in a) res <- c(res, i * 10 + (b %% 10)[i %% 100 == b %/% 10])
res
}
Frank <- function(a, b) {
aDT <- as.data.table(tstrsplit(a, ""))
bDT <- setnames(as.data.table(tstrsplit(b, "")), c("V2", "V3", "V4"))
merge(aDT, bDT, allow.cartesian = TRUE)[, paste0(V1, V2, V3, V4)]
}