【发布时间】:2023-03-23 23:27:01
【问题描述】:
我们有一组 5000 个 txt 格式的 KYC 文件。我需要对它们执行 NER(命名实体识别),并在 Shiny 应用程序中以表格形式汇总报告,供我们的网站使用。但是解析这些文件耗时很长,超过 30 分钟 :(,需要进行优化。有人能建议一些我可以采用的方法吗?文本文件的格式如下。
名称 - XYZ 父亲姓名 - abc 地址 - 印度西姆拉商城路 婚姻状况:已婚 年收入 - 750000 卢比 受雇 - 是 担保人 - def 先生 信用分析师评论 - XYZ 没有信用记录。将来可能会成为 NPA ...... 和其他细节
涉及的步骤: 1.预处理文件名。(去除数字,空格,因为它是作为pdf上传然后从网站转换为文本)
创建所有列的索引(姓名、父亲姓名、地址、婚姻状况、年收入、年龄、信用分析师评论)
解析每个文件的函数。使用命名实体识别和其他技术来获取关键字并忽略其他单词并将其映射到相应的列。函数的名称是 parseAKYC(file)。
在另一个函数 parseAllKYC 中调用了这个函数。
当文件数量很多时,函数 parseAllKYC(files_path) 需要花费太多时间才能完成;只有六个文件时,几秒钟内就能得到结果。我想使用 parallel 包。有人能帮我吗?我看到的大多数示例都是针对 sapply、lapply 的。我们可以使用 parallel 包来并行执行我定义的函数 parseAllKYC 吗?
-
这是最终函数 parseAllKYC 的代码,如下所示。
`
# code for parallel parsing ----
# NOTE: parseAKYC() and its helpers (defined in the next section) must already
# be defined/sourced in the session before this section is run.
library(foreach)
library(iterators)
library(doParallel)

fileloc <- "location of 5000 KYC files"

# full.names = TRUE yields ready-to-use paths and returns character(0) for an
# empty directory; the anchored pattern only matches a real ".txt" extension.
files <- list.files(path = fileloc, pattern = "\\.txt$", full.names = TRUE)

# Leave one core free, but never ask for fewer than one worker
# (detectCores() may return 1 or NA).
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(no_cores)
registerDoParallel(cl)

# Workers are fresh R sessions: the NLP packages and the helper functions that
# parseAKYC() calls must be shipped to them explicitly. The cluster is stopped
# even when parsing fails.
KYCTable <- tryCatch(
  foreach(
    i = iter(files),
    .combine = rbind,
    .packages = c("NLP", "openNLP", "openNLPmodels.en", "tm"),
    .export = c(
      "parseAKYC", "preprocessFile", "extract_People_Location_Org",
      "extract_entities", "extractCreditAnalystComments", "makeIndex",
      "split_comma_list"
    )
  ) %dopar% {
    parseAKYC(i)
  },
  finally = stopCluster(cl)
)

# code for parseAKYC function ----
library(NLP)
library(openNLPmodels.en)
library(openNLP)
library(tm)
library(DT)

# Clean the raw lines of one KYC text file.
#
# Drops duplicated lines, removes form feeds, doubled double-quotes,
# "Page<number>" markers and runs of hyphens, then drops lines that ended up
# empty.
#
# file: character vector, one element per line.
# Returns the cleaned character vector.
preprocessFile <- function(file) {
  file <- file[!duplicated(file)]
  file <- gsub("\\f", "", file)
  file <- gsub('""', "", file)
  file <- gsub("Page\\d+", "", file)
  file <- gsub("-+", "", file)
  file[file != ""]
}

# Run one entity annotator and return its distinct hits as a single
# comma-separated string ("" when nothing is found).
#
# s:                NLP String holding the document text.
# annotator:        an entity annotator created by Maxent_Entity_Annotator().
# base_annotations: sentence/word annotations the entity annotator builds on.
extract_entities <- function(s, annotator, base_annotations) {
  # The annotator is invoked exactly once per kind; model inference is the
  # expensive part of parsing a file.
  spans <- annotator(s, base_annotations)
  if (length(spans) == 0) {
    return("")
  }
  paste(unique(s[spans]), collapse = ", ")
}

# Named-entity recognition over the lines of one file.
#
# file: character vector of (preprocessed) lines.
# Returns a character vector of length 3: people, locations, organizations,
# each a comma-separated string of distinct entities.
extract_People_Location_Org <- function(file) {
  if (length(file) == 0L) {
    return(c("", "", ""))
  }
  file <- removePunctuation(file)
  s <- as.String(file)

  sent_token_annotator <- Maxent_Sent_Token_Annotator()
  gc() # kept from the original code; presumably eases memory pressure
  word_token_annotator <- Maxent_Word_Token_Annotator()
  a2 <- annotate(s, list(sent_token_annotator, word_token_annotator))

  c(
    extract_entities(s, Maxent_Entity_Annotator(), a2),
    extract_entities(s, Maxent_Entity_Annotator(kind = "location"), a2),
    extract_entities(s, Maxent_Entity_Annotator(kind = "organization"), a2)
  )
}

# Collect the lines of the section with index code 6, joined by ", ".
#
# NOTE(review): makeIndex() assigns code 6 to the "Employed" heading and code 8
# to "CreditAnalystComments". Code 6 is kept here to preserve the existing
# behavior -- confirm which section is actually intended.
#
# file: character vector of (preprocessed) lines.
# Returns a single string; "" when the section is absent or has no body lines.
extractCreditAnalystComments <- function(file) {
  index <- makeIndex(file)
  rows <- which(index == 6)
  # With fewer than two rows there is no body after the heading; the range
  # (first + 1):last would otherwise run backwards.
  if (length(rows) < 2L) {
    return("")
  }
  paste(file[(rows[1] + 1L):rows[length(rows)]], collapse = ", ")
}

# Label every line with the numeric code of the section it belongs to.
#
# A line that exactly equals a known heading gets that heading's code; every
# other line inherits the code of the line before it (0 until the first
# heading is seen).
#
# file: character vector of lines.
# Returns a numeric vector of the same length as `file`.
makeIndex <- function(file) {
  heading_codes <- c(
    "Name" = 1,
    "Address" = 2,
    "Marital Status" = 4,
    "Annual Income" = 5,
    "Employed" = 6,
    "Guaranter" = 7,
    "CreditAnalystComments" = 8,
    "Interests" = 9,
    "Credit History" = 10
  )

  index <- rep(0, length(file))
  hit <- match(file, names(heading_codes))
  is_heading <- !is.na(hit)
  index[is_heading] <- unname(heading_codes[hit[is_heading]])

  # Fill forward; seq_len() keeps the loop empty for 0- or 1-line input.
  for (i in seq_len(max(length(index) - 1L, 0L))) {
    if (index[i + 1] == 0) {
      index[i + 1] <- index[i]
    }
  }
  index
}

# Split a comma-separated string into items with leading whitespace removed.
split_comma_list <- function(x) {
  gsub("^\\s+", "", unlist(strsplit(x, split = ",")))
}

# Parse a single KYC file into a one-row data frame.
#
# file_name: path to a KYC file in *.txt format.
# Returns a data frame with columns Name, CreditAnalystComments, Employed,
# Address and Guaranter.
#
# Remember to raise the Java heap size to at least 2GB before loading the
# openNLP models.
parseAKYC <- function(file_name) {
  file <- preprocessFile(readLines(file_name, warn = FALSE))

  comments <- extractCreditAnalystComments(file)
  entities <- extract_People_Location_Org(file) # people, locations, orgs
  comment_items <- split_comma_list(comments)

  # Logical negation instead of x[-which(...)]: the latter returns an empty
  # vector when nothing matches, which silently dropped every item.
  org_items <- split_comma_list(entities[3])
  employed <- paste0(
    org_items[!(org_items %in% comment_items)],
    collapse = ", "
  )

  people_items <- split_comma_list(entities[1])
  guaranter <- paste0(
    people_items[!(people_items %in% comment_items)],
    collapse = ", "
  )

  data.frame(
    Name = file[1],
    CreditAnalystComments = comments,
    Employed = employed,
    Address = entities[2],
    Guaranter = guaranter,
    stringsAsFactors = FALSE
  )
}

# Parse every KYC file sequentially and stack the rows into one data frame.
#
# files_path: character vector of file paths.
# Returns a data frame with one row per file and the columns produced by
# parseAKYC(); a zero-row data frame with those columns for empty input.
parseAllKYC <- function(files_path) {
  if (length(files_path) == 0L) {
    return(data.frame(
      Name = character(),
      CreditAnalystComments = character(),
      Employed = character(),
      Address = character(),
      Guaranter = character(),
      stringsAsFactors = FALSE
    ))
  }
  # Bind once at the end instead of growing a data frame inside a loop.
  do.call(rbind, lapply(files_path, parseAKYC))
}

# ui.R ----
fluidPage(fluidRow(column(12, DT::dataTableOutput("tbl"))))

# server.R ----
library(shiny)
library(DT)
source("getKYCTable.R")

function(input, output, session) {
  output$tbl <- DT::renderDataTable(
    KYCTable,
    filter = "top",
    options = list(lengthChange = FALSE)
  )
}
`
【问题讨论】:
-
你想在多核上运行代码吗?
-
是的..试过了。收到错误。这是使用的代码。
-
您能否确保正确缩进您的代码 (parseAllKYC)。另外,我没有在您的 parseAllKYC 函数中看到 parseAResume。
-
我确实试过了。使用 CTRL+K 不工作.....对不起。它只是 parseAKYC 函数。感谢您指出。
-
@Tushar ,要我粘贴 parseAKYC 函数吗? .