R代码中的性能问题答案

【问题标题】：Performance issue in R Code（R代码中的性能问题）
【发布时间】：2023-03-23 23:27:01
【问题描述】：

我们有一组 5000 个 txt 格式的 KYC 文件。我需要对它们执行 NER，并在 Shiny 应用程序中以表格的形式汇总报告，以便在我们的网站上使用。但是解析这些文件需要花费很长时间，超过 30 分钟 :(。需要进行优化。有人可以建议一些我可以采用的方法吗？文本文件是这种格式。

名称 - XYZ 父亲姓名 - abc 地址 - 印度西姆拉商城路婚姻状况：已婚年收入 - 750000 卢比受雇 - 是担保人 - def 先生信用分析师评论 - XYZ 没有信用记录。将来可能会成为 NPA ...... 和其他细节

涉及的步骤： 1.预处理文件名。（去除数字，空格，因为它是作为pdf上传然后从网站转换为文本）

创建所有列的索引（姓名、父亲姓名、地址、婚姻状况、年收入、年龄、信用分析师评论）
解析每个文件的函数。使用命名实体识别和其他技术来获取关键字并忽略其他单词并将其映射到相应的列。函数的名称是 parseAKYC(file)。
在另一个函数 parseallKYC 中调用了这个函数。
当有大量文件时，函数 parseallKYC(files_path) 需要花费太多时间才能完成。只有六个文件时，它可以在几秒钟内给出结果。我想使用 parallel 包。谁能帮我吗？显示的大多数示例都是针对 sapply、lapply 的。我们可以使用 parallel 包来并行运行我定义的函数 parseAllKYC 吗？

这是最终函数 parseallKYC 的代码，如下所示。

#code for parallel parsing 
library(foreach) 
library(iterators)
library(doParallel)
# Directory that holds the KYC text files.
fileloc <- "location of 5000 KYC files"

# full.names = TRUE returns ready-to-use paths (no manual paste loop, and no
# bogus "<dir>/NA" entry when the directory is empty). The anchored pattern
# only matches a real ".txt" extension instead of any name containing "txt".
files <- list.files(path = fileloc, pattern = "\\.txt$", full.names = TRUE)

# Leave one core free, but never ask for fewer than one worker
# (detectCores() may return 1 or NA).
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(no_cores)
registerDoParallel(cl)

# Each worker is a fresh R session: it has to load the NLP packages itself
# (.packages) and receive the helper functions that parseAKYC calls (.export),
# otherwise the workers fail with errors such as
# 'could not find function "as.String"'.
# tryCatch(finally = ) guarantees the cluster is shut down even if a file fails.
KYCTable <- tryCatch(
  foreach(
    i = iter(files),
    .combine = rbind,
    .packages = c("NLP", "openNLPmodels.en", "openNLP", "tm"),
    .export = c(
      "preprocessFile", "extract_People_Location_Org",
      "extractCreditAnalystComments", "makeIndex"
    )
  ) %dopar% {
    parseAKYC(i)
  },
  finally = stopCluster(cl)
)

#code for parseAKYC function
require("NLP")
require("openNLPmodels.en")
require("openNLP")
library(tm)
library(DT)

# Clean the raw lines of a KYC text file.
#
# Drops repeated lines, then strips form feeds, doubled double-quotes,
# "Page<number>" markers and runs of hyphens from every line, and finally
# discards the lines that ended up empty.
#
# file: character vector of lines.
# Returns the cleaned character vector.
preprocessFile <- function(file) {
  cleaned <- file[!duplicated(file)]
  # Patterns are removed in this fixed order.
  for (pattern in c("\\f", '""', "Page\\d+", "-+")) {
    cleaned <- gsub(pattern, "", cleaned)
  }
  cleaned[cleaned != ""]
}
# Run named-entity recognition over the text of one KYC file.
#
# file: character vector of lines.
# Returns a character vector of length 3: people, locations, organizations.
# Each element is the ", "-joined set of unique entities of that kind, or ""
# when none were found.
extract_People_Location_Org <- function(file) {
  s <- as.String(removePunctuation(file))

  # Sentence and word annotations are computed once and shared by all
  # three entity annotators.
  a2 <- annotate(
    s,
    list(Maxent_Sent_Token_Annotator(), Maxent_Word_Token_Annotator())
  )

  # Run the entity annotator for one kind exactly once. The original ran every
  # annotator three times per file (a discarded annotate() call, a length
  # check, then the extraction), tripling the most expensive step.
  entities_of <- function(kind) {
    spans <- Maxent_Entity_Annotator(kind = kind)(s, a2)
    if (length(spans) == 0) {
      return("")
    }
    paste(unique(s[spans]), collapse = ", ")
  }

  c(
    entities_of("person"),
    entities_of("location"),
    entities_of("organization")
  )
}
# Extract the lines that follow the "CreditAnalystComments" header.
#
# file: character vector of preprocessed lines.
# Returns a single string with the section lines joined by ", ", or "" when
# the header is missing or has no lines after it.
extractCreditAnalystComments <- function(file) {
  index <- makeIndex(file)
  # makeIndex() tags the "CreditAnalystComments" section with code 8
  # (code 6 is the "Employed" section).
  section <- which(index == 8)
  # Fewer than two tagged lines means there is no content after the header;
  # building (first + 1):last would then produce a descending range.
  if (length(section) < 2) {
    return("")
  }
  paste(file[(section[1] + 1):tail(section, 1)], collapse = ", ")
}
# Tag every line of a KYC file with the code of the section it belongs to.
#
# Lines that exactly equal a known header receive that header's code; every
# other line inherits the code of the line before it. Lines before the first
# header stay 0.
#
# CODE: 1-Name 2-Address 4-Marital Status 5-Annual Income 6-Employed
#       7-Guaranter 8-CreditAnalystComments 9-Interests 10-Credit History
#       (3 is unused: the e-mail rule is disabled)
#
# file: character vector of lines.
# Returns a numeric vector of the same length as `file`.
makeIndex <- function(file) {
  header_codes <- c(
    "Name" = 1,
    "Address" = 2,
    "Marital Status" = 4,
    "Annual Income" = 5,
    "Employed" = 6,
    "Guaranter" = 7,
    "CreditAnalystComments" = 8,
    "Interests" = 9,
    "Credit History" = 10
  )

  index <- unname(header_codes[match(file, names(header_codes))])
  index[is.na(index)] <- 0

  # Forward-fill section codes. seq_along(index)[-1] is empty for inputs of
  # length 0 or 1, where 1:(length(index) - 1) would count backwards and fail.
  for (i in seq_along(index)[-1]) {
    if (index[i] == 0) {
      index[i] <- index[i - 1]
    }
  }
  index
}
# Parse a single KYC text file into a one-row data frame.
#
# file_name: path to a KYC file in *.txt format.
# Returns a data frame with the columns Name, CreditAnalystComments,
# Employed, Address and Guaranter.
#
# remember to change Java heap size memory to at least 2GB
parseAKYC <- function(file_name) {
  # read and preprocess the file text
  file <- preprocessFile(readLines(file_name, warn = FALSE))

  comments <- extractCreditAnalystComments(file)
  # entities[1] = people, entities[2] = locations, entities[3] = organizations
  entities <- extract_People_Location_Org(file)

  # Split a ", "-joined string into items without leading whitespace.
  split_items <- function(text) {
    gsub("^\\s+", "", unlist(strsplit(text, split = ",")))
  }
  comment_items <- split_items(comments)

  # Remove the items that also occur in the analyst comments. A logical mask
  # is used because x[-which(cond)] drops every element when nothing matches.
  drop_comment_items <- function(text) {
    items <- split_items(text)
    paste0(items[!(items %in% comment_items)], collapse = ", ")
  }

  KYC <- list(
    Name = file[1],
    CreditAnalystComments = comments,
    Employed = drop_comment_items(entities[3]),
    Address = entities[2],
    Guaranter = drop_comment_items(entities[1])
  )
  as.data.frame(KYC, stringsAsFactors = FALSE)
}
    # Parse every KYC file in `files_path` and stack the results.
    #
    # files_path: character vector of complete file paths.
    # Returns a data frame with one row per file, as produced by parseAKYC().
    parseAllKYC <- function(files_path) {
      if (length(files_path) == 0) {
        # NOTE(review): this empty frame keeps the columns declared by the
        # original code; they differ from the columns parseAKYC() returns.
        return(data.frame(
          Name = character(), FatherName = character(),
          Address = character(), maritalstatus = character(),
          Annualincome = character(),
          CreditAnalystComments = character(), stringsAsFactors = FALSE
        ))
      }

      # Parse first, bind once: calling rbind() inside the loop copies the
      # accumulated frame on every iteration.
      rows <- lapply(files_path, parseAKYC)
      do.call(rbind, unname(rows))
    }





         #ui.R

        # Page layout: one row with a 12-unit column holding the 'tbl' table output.
        fluidPage(
          fluidRow(
            column(12, DT::dataTableOutput('tbl'))
          )
        )




             #server.R
                library(shiny)
                library(DT)
                source("getKYCTable.R")
                # Server function: renders the KYCTable object as the 'tbl' output,
                # with column filters on top and the page-length control disabled.
                function(input, output, session) {
                  output$tbl <- DT::renderDataTable(
                    KYCTable,
                    filter = 'top',
                    options = list(lengthChange = FALSE)
                  )
                }




    `

【问题讨论】：

你想在多核上运行代码吗？
是的..试过了。收到错误。这是使用的代码。
您能否确保正确缩进您的代码 (parseAllKYC)。另外，我没有在您的 parseAllKYC 函数中看到 parseAResume。
我确实试过了。使用 CTRL+K 不工作.....对不起。它只是 parseAKYC 函数。感谢您指出。
@Tushar ，要我粘贴 parseAKYC 函数吗？ .

标签： r shiny

【解决方案1】：

试试这个：虽然我认为您的 parseAKYC 函数中可能存在优化问题，但为了让您继续前进，您可以通过以下方式并行化。这是用于 ubuntu 设置。对于 Windows，您可以查看 doSNOW 包。

使用以下包：

require(doParallel)
require(foreach)
require(doMC) #for ubuntu
require(doSNOW) #for windows

registerDoMC(cores = 4) # set value based on the number of cores
                        # on your machine
# assuming files_path is a character vector of complete file paths.
# seq_along() yields no iterations for an empty vector, whereas
# 1:length(files_path) would iterate over c(1, 0).
parsed.output <- foreach(i = seq_along(files_path)) %dopar% parseAKYC(files_path[i])

【讨论】：

我添加了 parseAKYC 函数。实现了并行代码，但抛出错误“找不到函数“as.String””
在函数“extract_People_Location_Org”中，您使用的是 as.String 函数。第三行：s <- as.String(file)
这是 NLP 中的一个函数。我转换为 as.character。它不允许许多函数，如 lapply(file, removePunctuation)、unlist(file)。我不能删除那些。你能推荐一些用于 NLP 并行处理的好包吗？提前致谢。你帮了大忙。
我认为您需要更改代码/设计函数，以便能够运行整个 parseAKYC，对每个文件使用 for 循环，然后用 foreach 替换 for。此外， lapply 和 unlist 不起作用很奇怪......可能是输入不是预期的。你需要检查一下。
当我不应用任何并行函数时，这段代码运行良好。所以这不是问题。看到另一个帖子，其中 NLP 有自己的并行处理版本。感谢您的帮助。 :)