【问题标题】:R function is looping over the same data in webscraper(R 函数在网络爬虫中循环抓取相同的数据)
【发布时间】:2021-06-19 18:01:29
【问题描述】:

这是我写的程序

    library(rvest)
    library(RCurl)
    library(XML)
    library(stringr)


    #Getting the number of Page
    # Return the highest page number shown in the pager
    # ("pageNumbers al-pageNumbers" div) of a search-results page,
    # or 0 when no pager is present.
    getPageNumber <- function(URL) {
      doc <- read_html(URL)
      all_divs <- html_nodes(doc, "div")
      pager <- all_divs[html_attr(all_divs, "class") %in% "pageNumbers al-pageNumbers"]
      counts <- str_count(html_text(pager), pattern = " \\d+\r\n")
      if (length(counts) == 0) 0 else max(counts)
    }


    #Getting all articles based off of their DOI
    getAllArticles <-function(URL){
      parsedDocument = read_html(URL)
      Sort1 <- html_nodes(parsedDocument,'div')
      Sort2 <-  Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
      ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(Sort2)))
      URL3 <- "https://doi.org/10.1093/dnares/"
      URL4 <- paste(URL3, ArticleDOInumber, sep = "")
      return(URL4)
    }


    # Extract the article's title text from a parsed article page.
    #
    # Bug fix: the old version gsub'ed over EVERY <h1> node, returning raw
    # markup and picking up unrelated headings, which produced the repeated
    # rows seen in the output. Matching the single node with class
    # "article-title-main" yields the actual title only.
    Title <- function(parsedDocument) {
      node <- html_node(parsedDocument, ".article-title-main")
      Title <- trimws(gsub("\\r\\n\\s+", "", html_text(node)))
      return(Title)
    }


    #main function with input as parameter year
    findURL <- function(year_chosen){
      if(year_chosen >= 1994){
      noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
      pagesURl = "&fl_SiteID=5275&startpage="
      URL = paste(noYearURL, pagesURl, sep = "")
      #URL is working with parameter year_chosen
      Page <- getPageNumber(URL)
      

      Page2 <- 0
      while(Page < Page2 | Page != Page2){
        Page <- Page2
        URL3 <- paste(URL, Page-1, sep = "")
        Page2 <- getPageNumber(URL3)    
      }
      R_Data <- data.frame()
      for(i in 1:Page){ #0:Page-1
        URL2 <- getAllArticles(paste(URL, i, sep = ""))
        for(j in 1:(length(URL2))){
          parsedDocument <- read_html(URL2[j])
          print(URL2[j])
          R <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
          #R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
          R_Data <- rbind(R_Data, R)
        } 
      }
      paste(URL2)
      suppressWarnings(write.csv(R_Data, "DNAresearch.csv", row.names = FALSE, sep = "\t"))
      #return(R_Data)
      } else {
        print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
      }
    }

    # Example run: scrape all article titles from 2003.
    findURL(2003)

我的代码输出如下:

[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
[1] "https://doi.org/10.1093/dnares/10.6.277"
[1] "https://doi.org/10.1093/dnares/10.6.229"
[1] "https://doi.org/10.1093/dnares/10.6.239"
[1] "https://doi.org/10.1093/dnares/10.6.287"
[1] "https://doi.org/10.1093/dnares/10.5.221"
[1] "https://doi.org/10.1093/dnares/10.5.203"
[1] "https://doi.org/10.1093/dnares/10.5.213"
[1] "https://doi.org/10.1093/dnares/10.4.137"
[1] "https://doi.org/10.1093/dnares/10.4.147"
[1] "https://doi.org/10.1093/dnares/10.4.167"
[1] "https://doi.org/10.1093/dnares/10.4.181"
[1] "https://doi.org/10.1093/dnares/10.4.155"
[1] "https://doi.org/10.1093/dnares/10.3.115"
[1] "https://doi.org/10.1093/dnares/10.3.85"
[1] "https://doi.org/10.1093/dnares/10.3.123"
[1] "https://doi.org/10.1093/dnares/10.3.129"
[1] "https://doi.org/10.1093/dnares/10.3.97"
[1] "https://doi.org/10.1093/dnares/10.2.59"
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"

我正在尝试以年份作为参数来抓取期刊。我已经刮掉了一页,但是当我应该更改页面时,我的循环只是回到页面顶部并循环遍历相同的数据。我的代码应该是正确的,但我不明白为什么会这样。提前谢谢你

【问题讨论】:

    标签: r loops for-loop web-scraping iterator


    【解决方案1】:

    问题并不在于重复读取同一个 url,而是因为您选择了错误的节点,而该节点恰好会产生重复的信息。正如我在你上一个问题中提到的,你需要重新设计你的 Title 函数。下面重写的 Title 会根据类名和单节点匹配来提取实际的文章标题。

    请注意,您的 sep 参数已被删除。代码的其他一些区域在逻辑方面看起来还可以进一步简化。


    标题功能:

    # Extract the article title: single-node match on the
    # "article-title-main" class, with "\r\n<spaces>" runs removed and
    # outer whitespace trimmed.
    Title <- function(parsedDocument) {
      node <- html_node(parsedDocument, ".article-title-main")
      raw <- html_text(node)
      trimws(gsub("\\r\\n\\s+", "", raw))
    }
    

    R:

    library(rvest)
    library(XML)
    library(stringr)
    
    
    # Getting the number of Page
    getPageNumber <- function(URL) {
      # print(URL)
      parsedDocument <- read_html(URL)
      Sort1 <- html_nodes(parsedDocument, "div")
      Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")]
      P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
      return(ifelse(length(P) == 0, 0, max(P)))
    }
    
    # Getting all articles based off of their DOI
    getAllArticles <- function(URL) {
      print(URL)
      parsedDocument <- read_html(URL)
      Sort1 <- html_nodes(parsedDocument, "div")
      Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
      ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
      URL3 <- "https://doi.org/10.1093/dnares/"
      URL4 <- paste(URL3, ArticleDOInumber, sep = "")
      return(URL4)
    }
    
    
    # Pull the article's title from a parsed document: keep only the node
    # with class "article-title-main", collapse "\r\n<spaces>" runs, and
    # strip surrounding whitespace.
    Title <- function(parsedDocument) {
      cleaned <- gsub("\\r\\n\\s+", "",
                      html_text(html_node(parsedDocument, ".article-title-main")))
      trimws(cleaned)
    }
    
    
    # main function with input as parameter year
    findURL <- function(year_chosen) {
      if (year_chosen >= 1994) {
        noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
        pagesURl <- "&fl_SiteID=5275&page="
        URL <- paste(noYearURL, pagesURl, sep = "")
        # URL is working with parameter year_chosen
        Page <- getPageNumber(URL)
    
    
        if (Page == 5) {
          Page2 <- 0
          while (Page < Page2 | Page != Page2) {
            Page <- Page2
            URL3 <- paste(URL, Page - 1, sep = "")
            Page2 <- getPageNumber(URL3)
          }
        }
        R_Data <- data.frame()
        for (i in 1:Page) {
          URL2 <- getAllArticles(paste(URL, i, sep = ""))
          for (j in 1:(length(URL2))) {
            parsedDocument <- read_html(URL2[j])
            #print(URL2[j])
            #print(Title(parsedDocument))
            R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
            #print(R)
            R_Data <- rbind(R_Data, R)
          }
        }
        write.csv(R_Data, "Group4.csv", row.names = FALSE)
      } else {
        print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
      }
    }
    
    # Example run: scrape all article titles from 2003.
    findURL(2003)
    

    【讨论】:

    • 是的,谢谢你的清晰解释我明白你的标题功能是什么意思了!最佳
    • 我还有一个问题,这将是关于使用与 Title 类似的函数来获取文章的整个文本。该函数将被称为全文。唯一的问题是,在期刊中,文章全文都在 pdf 文件中,所以我认为它不能被刮掉。谢谢大家的帮助。
    • stackoverflow.com/questions/38592600/how-to-read-pdf-file-in-r 所以你只需要提取 pdf 链接并从该包中传递给函数。
    • 它将获得全文的每个链接。我知道在 R 中有一个读取 pdf 的功能,但是从期刊中我相信没有办法获得需要抓取的 pdf 的链接。
    • 那么你在哪里可以找到它们?你提到了pdf文件。
    猜你喜欢
    • 2021-11-07
    • 2020-03-28
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2019-09-15
    • 2021-09-20
    相关资源
    最近更新 更多