最简单的解决方案通常是最快的!
这是我的建议:
# Join every search term into one alternation pattern, then keep the ids
# of the rows whose description matches that pattern.
str <- paste(ac, collapse = "|")
df[["id"]][grep(pattern = str, x = df[["description"]])]
但你也可以这样
# Test each term separately with stringr::str_match(), then keep the ids
# of the rows where at least one term produced a non-missing match.
df$id[
  rowSums(
    !is.na(sapply(ac, function(term) stringr::str_match(df$description, term)))
  ) > 0
]
或者这样
# Same lookup with a logical mask and the PCRE engine. `TRUE` is spelled
# out because `T` is an ordinary variable that can be reassigned.
df$id[grepl(str, df$description, perl = TRUE)]
但是,必须进行比较。顺便说一句,我添加了来自@Andre Wildberg 和@Martina C. Arnolda 的建议。
以下是基准。
# Single alternation pattern ("term1|term2|...") shared by the candidates
# that match all terms in one pass.
str <- paste(ac, collapse = "|")
# Ids of the rows whose description matches the combined pattern.
# Reads the globals `df` and `str`; takes no arguments.
fFiolka1 <- function() {
  hits <- grep(str, df$description)
  df$id[hits]
}
# Ids of the rows matched by at least one term, testing each term on its
# own with stringr::str_match(). Reads the globals `df` and `ac`.
fFiolka2 <- function() {
  # One column per term; an entry is NA where that term did not match.
  match_matrix <- sapply(ac, function(term) stringr::str_match(df$description, term))
  matched_terms_per_row <- rowSums(!is.na(match_matrix))
  df$id[matched_terms_per_row > 0]
}
# Ids of the rows whose description matches the combined pattern, using a
# logical mask and the PCRE engine. Reads the globals `df` and `str`.
fFiolka3 <- function() {
  # `TRUE` rather than `T`: `T` is a plain variable and can be reassigned.
  df$id[grepl(str, df$description, perl = TRUE)]
}
# Ids collected term by term: the matching row positions of every term are
# concatenated, so a row matched by several terms appears several times.
# Reads the globals `df` and `ac`.
fWildberg1 <- function() {
  rows_per_term <- lapply(ac, function(term) grep(term, df$description))
  df$id[unlist(rows_per_term, use.names = FALSE)]
}
# Ids of the rows matched by at least one term, testing each term on its
# own with stringi. Reads the globals `df` and `ac`.
fWildberg2 <- function() {
  # Namespace-qualified so the function does not depend on stringi being
  # attached (fArnolda2 already calls it this way).
  hit_matrix <- sapply(ac, function(x) stringi::stri_detect_regex(df$description, x))
  df$id[as.logical(rowSums(hit_matrix))]
}
# One-column data frame (`id`) with the rows whose description matches the
# combined pattern. Reads the globals `df` and `str`.
fArnolda1 <- function() {
  matching_rows <- df[grep(str, df$description), ]
  matching_rows["id"]
}
# One-column data frame (`id`) with the rows whose description matches the
# combined pattern, detected with stringi. Reads the globals `df` and `str`.
fArnolda2 <- function() {
  is_match <- stringi::stri_detect_regex(df$description, str)
  df[is_match, ]["id"]
}
# One-column result (`id`) with the rows whose description matches the
# combined pattern, via dplyr/stringr. Reads the globals `df` and `str`.
fArnolda3 <- function() {
  # Namespace-qualified: tidyverse is not attached at this point, and an
  # unqualified `filter` would resolve to stats::filter.
  matched <- dplyr::filter(df, stringr::str_detect(description, str))
  dplyr::select(matched, id)
}
library(microbenchmark)
# Time each candidate call 100 times (times=100) and hand the resulting
# microbenchmark object to ggplot2::autoplot() for plotting.
ggplot2::autoplot(microbenchmark(
fFiolka1(), fFiolka2(), fFiolka3(),
fWildberg1(), fWildberg2(),
fArnolda1(), fArnolda2(), fArnolda3(),
times=100))
请注意,为简单起见,我将 ac 保留为向量。
# Search terms, kept as a plain character vector.
ac <- c(
  "san francisco ca",
  "pittsburgh pa",
  "philadelphia pa",
  "washington dc",
  "new york ny",
  "aliquippa pa",
  "gainesville fl",
  "manhattan ks"
)
@jvalenti 的特别更新
好的。现在我更好地理解了您想要达到的目标。但是,为了充分展示最佳解决方案,我稍微修改了您的数据。数据如下:
library(tidyverse)
# Search terms as a one-column tibble (column `ac`).
ac <- tibble(
  ac = c(
    "san francisco ca", "pittsburgh pa", "philadelphia pa", "washington dc",
    "new york ny", "aliquippa pa", "gainesville fl", "manhattan ks"
  )
)
# Example data: six rows with columns `month`, `id` and `description`,
# carrying non-consecutive integer row names.
df <- structure(
  list(
    month = c(202110L, 201910L, 202005L, 201703L, 201208L, 201502L),
    id = c(100559687L, 100558763L, 100558934L, 100558946L, 100543422L, 100547618L),
    description = c(
      "residential local telephone pittsburgh pa local with more san francisco ca flat rate with eas philadelphia pa plan includes voicemail call forwarding call waiting caller id call restriction three way calling id block speed dialing call return call screening modem rental voip transmission telephone access line 34 95 modem rental 7 00 total 41 95",
      "digital video san francisco ca pittsburgh pa multilatino ultra bensalem pa service includes digital economy multilatino digital preferred tier and certain additonal digital channels coaxial cable transmission",
      "residential all distance telephone pittsburgh pa unlimited voice only harrisburg pa flat rate with eas only features call waiting caller id caller id with call waiting call screening call forwarding call forwarding selective call return 69 3 way calling anonymous call rejection repeat dialing speed dial caller id blocking coaxial cable transmission",
      "residential all distance telephone pittsburgh pa unlimited voice philadelphia pa san francisco ca pa flat rate with eas only features call waiting caller id caller id with call waiting call screening call forwarding call forwarding selective call return 69 3 way calling anonymous call rejection repeat dialing speed dial caller id blocking",
      "local spot advertising 30 second advertisement austin tx weekday 6 am 6 pm other audience demographic w18 49 number of rating points for daypart 0 29 average cpp 125",
      "residential public switched toll pittsburgh pa manhattan ks ks plan area residence switched toll base san philadelphia pa ca average revenue per minute 0 18 minute online"
    )
  ),
  row.names = c(1L, 1245L, 3800L, 10538L, 20362L, 50000L),
  class = "data.frame"
)
您将在下面找到四种不同的解决方案:一种基于 for 循环,两种基于 dplyr 包中的函数,还有一种基于 collapse 包中的函数。
# For-loop variant: collect the matching ids of every search term into a
# preallocated list, attach it to `ac` as a list-column, then expand it to
# one row per (term, id) pair. Reads the globals `ac` and `df`.
fSolition1 <- function() {
  matches <- vector("list", nrow(ac))
  for (k in seq_along(ac$ac)) {
    matches[[k]] <- df$id[grep(ac$ac[k], df$description)]
  }
  ac %>%
    mutate(id = matches) %>%
    unnest(id)
}
fSolition1()
# dplyr variant using one group per search term: each group stores its
# matching ids as a list-column, which is then expanded to one row per
# (term, id) pair. Reads the globals `ac` and `df`.
fSolition2 <- function() {
  ac %>%
    group_by(ac) %>%
    mutate(id = list(df$id[grep(ac, df$description)])) %>%
    # Drop the grouping so the result is a plain tibble, like the other
    # variants, instead of leaking a grouped one to the caller.
    ungroup() %>%
    unnest(id)
}
fSolition2()
# dplyr variant processing `ac` row by row: each row stores its matching
# ids as a list-column, which is then expanded to one row per (term, id)
# pair. Reads the globals `ac` and `df`.
fSolition3 <- function() {
  ac %>%
    rowwise(ac) %>%
    mutate(id = list(df$id[grep(ac, df$description)])) %>%
    # Remove the row-wise grouping so the result is a plain tibble, like
    # the other variants.
    ungroup() %>%
    unnest(id)
}
fSolition3()
# collapse variant: ftransform() adds a list-column with the matching ids
# of every search term, which is then expanded to one row per (term, id)
# pair. Reads the globals `ac` and `df`.
fSolition4 <- function() {
  with_ids <- collapse::ftransform(
    ac,
    id = lapply(ac, function(term) df$id[grep(term, df$description)])
  )
  unnest(with_ids, id)
}
fSolition4()
请注意,对于给定的数据,所有函数都返回下表作为结果:
# A tibble: 12 x 2
ac id
<chr> <int>
1 san francisco ca 100559687
2 san francisco ca 100558763
3 san francisco ca 100558946
4 pittsburgh pa 100559687
5 pittsburgh pa 100558763
6 pittsburgh pa 100558934
7 pittsburgh pa 100558946
8 pittsburgh pa 100547618
9 philadelphia pa 100559687
10 philadelphia pa 100558946
11 philadelphia pa 100547618
12 manhattan ks 100547618
是时候进行基准测试了
library(microbenchmark)
# Time the four solutions 100 times each (times=100) and plot the result
# with ggplot2::autoplot().
ggplot2::autoplot(microbenchmark(
fSolition1(), fSolition2(), fSolition3(), fSolition4(), times=100))
基于 collapse 的解决方案是最快的,这对任何人来说可能都不奇怪。然而,第二名可能会让人大吃一惊:基于 for 循环的旧式解决方案排在第二位!还有人想说 for 很慢吗?
@Gwang-Jin Kim 的特别更新
改为直接对向量进行操作并没有带来太大变化。请看下面。
# Columns pulled out into plain vectors once, ahead of the loop.
# NOTE: the name `df_decription` (sic) is kept exactly as spelled.
df_ac <- ac$ac
df_decription <- df$description
df_id <- df$id

# For-loop variant working on the pre-extracted vectors rather than on
# data-frame columns; the result has the same shape as fSolition1().
fSolition5 <- function() {
  matches <- vector("list", length = length(df_ac))
  for (k in seq_along(df_ac)) {
    matches[[k]] <- df_id[grep(df_ac[k], df_decription)]
  }
  ac %>%
    mutate(id = matches) %>%
    unnest(id)
}
fSolition5()
library(microbenchmark)
# Time solutions 1-5, 100 runs each (times=100), and plot the result with
# ggplot2::autoplot().
ggplot2::autoplot(microbenchmark(
fSolition1(), fSolition2(), fSolition3(), fSolition4(), fSolition5(), times=100))
但是for 和ftransform 的组合可能会令人惊讶!!!
# For-loop variant that attaches the collected ids with
# collapse::ftransform() instead of dplyr::mutate(). Reads the globals
# `ac` and `df`.
fSolition6 <- function() {
  matches <- vector("list", nrow(ac))
  for (k in seq_along(ac$ac)) {
    matches[[k]] <- df$id[grep(ac$ac[k], df$description)]
  }
  ac %>%
    collapse::ftransform(id = matches) %>%
    unnest(id)
}
fSolition6()
library(microbenchmark)
# Time solutions 1-6, 100 runs each (times=100), and plot the result with
# ggplot2::autoplot().
ggplot2::autoplot(microbenchmark(
fSolition1(), fSolition2(), fSolition3(), fSolition4(), fSolition5(), fSolition6(), times=100))
@jvalenti 的最新更新
亲爱的 jvalenti,在您的问题中,您写道:“我在一个数据框中有一列包含城市和州名”,以及“我将使用超过 10 万行”。我的结论是,同一个城市很可能会在您的变量 description 中出现多次。
但是,您在评论中写道:“我不想更改 ac 中的行数”。
那么你期待什么样的结果呢?让我们看看可以用它做什么。
解决方案 1 - 我们将所有 id 作为向量列表返回
# Keep one row per search term; `id` becomes a list-column holding every
# matching id (an empty integer vector when nothing matches).
ac %>%
  collapse::ftransform(
    id = map(ac, function(term) df$id[grep(term, df$description)])
  )
# # A tibble: 8 x 2
# ac id
# * <chr> <list>
# 1 san francisco ca <int [3]>
# 2 pittsburgh pa <int [5]>
# 3 philadelphia pa <int [3]>
# 4 washington dc <int [0]>
# 5 new york ny <int [0]>
# 6 aliquippa pa <int [0]>
# 7 gainesville fl <int [0]>
# 8 manhattan ks <int [1]>
解决方案 2 - 我们只返回第一个 id
# Keep one row per search term with only its first matching id; indexing
# an empty match set with [1] yields NA for terms without a match.
ac %>%
  collapse::ftransform(
    id = map_int(ac, function(term) {
      hits <- grep(term, df$description)
      df$id[hits][1]
    })
  )
# # A tibble: 8 x 2
# ac id
# * <chr> <int>
# 1 san francisco ca 100559687
# 2 pittsburgh pa 100559687
# 3 philadelphia pa 100559687
# 4 washington dc NA
# 5 new york ny NA
# 6 aliquippa pa NA
# 7 gainesville fl NA
# 8 manhattan ks 100547618
解决方案 3 - 我们只返回最后一个 id
# Keep one row per search term with only its last matching id (NA when the
# term matches nothing).
ac %>%
  collapse::ftransform(id = map_int(ac, function(x) {
    idx <- grep(x, df$description)
    # Plain if/else because the condition is a scalar; NA_integer_ keeps
    # the result an integer, as map_int() expects.
    if (length(idx) > 0) df$id[idx[length(idx)]] else NA_integer_
  }))
# # A tibble: 8 x 2
# ac id
# * <chr> <int>
# 1 san francisco ca 100558946
# 2 pittsburgh pa 100547618
# 3 philadelphia pa 100547618
# 4 washington dc NA
# 5 new york ny NA
# 6 aliquippa pa NA
# 7 gainesville fl NA
# 8 manhattan ks 100547618
解决方案 4 - 或者您可能想从所有匹配的 id 中随机选择一个
# Keep one row per search term with one randomly chosen matching id (NA
# when the term matches nothing).
ac %>%
  collapse::ftransform(id = map_int(ac, function(x) {
    idx <- grep(x, df$description)
    if (length(idx) == 0) {
      NA_integer_
    } else if (length(idx) == 1) {
      # A lone match is returned directly: sample(n, 1) with a single
      # number n would draw from 1:n instead of returning n.
      df$id[idx]
    } else {
      df$id[sample(idx, 1)]
    }
  }))
# # A tibble: 8 x 2
# ac id
# * <chr> <int>
# 1 san francisco ca 100558763
# 2 pittsburgh pa 100559687
# 3 philadelphia pa 100547618
# 4 washington dc NA
# 5 new york ny NA
# 6 aliquippa pa NA
# 7 gainesville fl NA
# 8 manhattan ks 100547618
解决方案 5 - 如果您碰巧想查看所有 id,同时又希望保持 ac 的行数不变
# Keep one row per search term and spread all of its matching ids over the
# columns id1, id2, ...; terms without a match get NA in id1.
ac %>%
  collapse::ftransform(id = map(ac, function(x) {
    idx <- grep(x, df$description)
    if (length(idx) == 0) {
      # Placeholder row so the term survives unnest(); typed NA keeps the
      # id column an integer.
      tibble(id = NA_integer_, idn = "id1")
    } else {
      tibble(id = df$id[idx], idn = paste0("id", seq_along(idx)))
    }
  })) %>%
  unnest(id) %>%
  # `id_cols` is named explicitly: passing it by position is deprecated in
  # tidyr 1.3.0.
  pivot_wider(id_cols = ac, names_from = idn, values_from = id)
# # A tibble: 8 x 6
# ac id1 id2 id3 id4 id5
# <chr> <int> <int> <int> <int> <int>
# 1 san francisco ca 100559687 100558763 100558946 NA NA
# 2 pittsburgh pa 100559687 100558763 100558934 100558946 100547618
# 3 philadelphia pa 100559687 100558946 100547618 NA NA
# 4 washington dc NA NA NA NA NA
# 5 new york ny NA NA NA NA NA
# 6 aliquippa pa NA NA NA NA NA
# 7 gainesville fl NA NA NA NA NA
# 8 manhattan ks 100547618 NA NA NA NA
很遗憾,您提供的描述并未表明上述五种解决方案中的哪一种是您可以接受的解决方案。您必须自己决定。