使用 R, 'rvest' 包抓取在线报纸数据答案

【问题标题】：Online newspaper data scraping with R, 'rvest' package使用 R, 'rvest' 包抓取在线报纸数据
【发布时间】：2021-12-31 11:59:06
【问题描述】：

我的课程任务是从新闻媒体中抓取数据并进行分析。这是我第一次使用 R 进行抓取，几周来我一直卡在获取数据这一步：尝试了各种指南，结果要么输出有限，要么报错。

首先，我尝试了Analyticsvidhya 的指南，这是我获得的最清晰的代码。我开始时只从报纸的档案中抓取一页：

# Packages used by the snippets below.
library(rvest)
library(xml2)
library(dplyr)

# One day of the newspaper archive.
url <- "https://en.trend.az/archive/2021-11-03"
html <- read_html(url)

# Headline nodes; SelectorGadget reports 144 articles for this selector.
headline_html <- html_nodes(html, ".category-article .article-title")
headline <- html_text(headline_html)
# print(headline)
length(headline)

我已经为其他 CSS 选择器尝试过类似的代码，但我无法获得超过 9 个结果。

我认为问题可能出在 URL 上，因此决定从存档中涵盖几天的一组子页面中抓取。

下面是根据 StackOverflow 上的一个回答改写的代码：

# Accumulator for the per-day data frames.
all_df <- list()
# Eleven consecutive archive dates (2021-11-03 .. 2021-11-13).
arch_date <- seq(as.Date("2021-11-03"), as.Date("2021-11-13"), by="days")

# NOTE(review): this iterates once over the literal string 'rchdate', not over
# arch_date, and `i` is never used in the body.
for(i in 'rchdate'){

  # NOTE(review): arch_date is the whole date vector, so url_fonq holds one URL
  # per date; read_html() is then handed a vector rather than a single string,
  # which is consistent with the "length 1" error reported for read_html(url_fonq).
  url_fonq <- str_c ('https://en.trend.az', "/archive/", arch_date)
  webpage_fonq <- read_html(url_fonq)
  head(webpage_fonq)

  # Headline nodes -> text -> whitespace-trimmed text.
  headline_html <- html_nodes(webpage_fonq,'.category-article .article-title')
  headline <- html_text(headline_html)
  head(headline)
  headline <- str_trim(headline)
  head(headline)
  length(headline)

...（此处省略其他节点的类似命令）

  # NOTE(review): row_number, date, time and cat are not assigned in the visible
  # code; presumably they come from the omitted section above - confirm.
  fonq.df <- data.frame( Num = row_number,
                    Date = date,
                    Time = time,
                    Title = headline,
                    Category = cat)

  # Append this iteration's rows to the accumulated result.
  all_df <-bind_rows(all_df, fonq.df)
}

这是一个我无法修复的错误：

错误：`x` 必须是长度为 1 的字符串
7. stop("`x` must be a string of length 1", call. = FALSE)
6. read_xml.character(x, encoding = encoding, ..., as_html = TRUE, options = options)
5. read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options)
4. withCallingHandlers(expr, warning = function(w) if (inherits(w, classes)) tryInvokeRestart("muffleWarning"))
3. suppressWarnings(read_xml(x, encoding = encoding, ..., as_html = TRUE, options = options))
2. read_html.default(url_fonq)
1. read_html(url_fonq)

在此之前，我还尝试过 DataCamp 上一篇面向初学者、更详细但有些含糊的指南，同样遇到了一个未能解决的错误。

# Archive page for a single day, downloaded and parsed once.
url <- "https://en.trend.az/archive/2021-11-03"
headline_html <- read_html(url)

#' Extract the article headlines from a parsed archive page.
#'
#' @param html Parsed HTML document, as returned by `read_html()`.
#' @return The headline texts with surrounding whitespace trimmed.
get_headline <- function(html) {
  # Nodes that carry the article titles
  title_nodes <- html_nodes(html, ".category-article .article-title")
  # Node text with additional white space trimmed
  titles <- str_trim(html_text(title_nodes))
  # Flatten the result into a plain vector
  unlist(titles)
}

...（此处省略其他节点的类似命令）

#' Combine the fields scraped from one archive page into a table.
#'
#' @param html Parsed HTML document, as returned by `read_html()`.
#' @param company_name Value stored in the `Trend.AZ` column of every row.
#' @return A tibble with the columns `Trend.AZ`, `Abstract` and `Date`.
get_data_table <- function(html, company_name) {
  headline <- get_headline(html)
  # NOTE(review): get_time() is not defined in the visible code; it is
  # presumably one of the omitted per-node helpers - confirm.
  time <- get_time(html)

  # The original assigned this to `combine_data` but then piped
  # `combined_data`, which fails with "object not found".
  combined_data <- tibble(
    Abstract = headline,
    Date = time
  )

  combined_data %>%
    mutate(Trend.AZ = company_name) %>%
    select(Trend.AZ, Abstract, Date)
}

#' Download one archive page and turn it into a data table.
#'
#' @param url Address of the page to fetch.
#' @param company_name Passed through unchanged to `get_data_table()`.
#' @return Whatever `get_data_table()` returns for the downloaded page.
get_data_from_url <- function(url, company_name) {
  page <- read_html(url)
  get_data_table(page, company_name)
}

#' Scrape a range of daily archive pages and write them to a TSV file.
#'
#' @param url Kept for backward compatibility only; it is not used. The
#'   original implementation overwrote it with the site root, so existing
#'   callers may pass anything here. Use `base_url` to target another site.
#' @param company_name Forwarded to `get_data_from_url()`; also the stem of
#'   the output file name (`<company_name>.tsv`).
#' @param base_url Site root to which "/archive/<date>" is appended.
#' @param start_date,end_date First and last archive day (inclusive), given as
#'   `Date` or as anything `as.Date()` accepts.
#' @return The result of `write_tsv()`.
scrape_write_table <- function(url, company_name,
                               base_url = "https://en.trend.az",
                               start_date = "2021-10-01",
                               end_date = "2021-11-01") {
  arch_date <- seq(as.Date(start_date), as.Date(end_date), by = "days")
  # Format the dates explicitly so the URLs do not rely on implicit
  # Date-to-character coercion.
  list_of_url <- str_c(base_url, "/archive/", format(arch_date, "%Y-%m-%d"))

  list_of_url %>%
    map(get_data_from_url, company_name) %>%
    bind_rows() %>%
    write_tsv(str_c(company_name, ".tsv"))
}

# Scrape the archive pages and write the result to "Trend.AZ.tsv".
scrape_write_table(url, "Trend.AZ")
# !!!The error was after here!!!

# Read back the file written above: scrape_write_table() appends ".tsv" to
# company_name, so the file name must include that suffix.
trend_az_tbl <- read_tsv("Trend.AZ.tsv")
# Show the last 11 rows of the table that was just read.
tail(trend_az_tbl, 11)

错误：

html_elements(...) 中的错误：找不到对象“tmp”
15. html_elements(...)
14. html_nodes(., ".category-article .article-date")
13. tmp %>% html_nodes(".category-article .article-date")
12. get_time(html)
11. get_data_table(html, company_name)
10. .f(.x[[i]], ...)
9. map(., get_data_from_url, company_name)
8. list2(...)
7. bind_rows(.)
6. is.data.frame(x)
5. stopifnot(is.data.frame(x))
4. write_delim(x, file, delim = "\t", na = na, append = append, col_names = col_names, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress)
3. write_tsv(., str_c(company_name, ".tsv"))
2. list_of_url %>% map(get_data_from_url, company_name) %>% bind_rows() %>% write_tsv(str_c(company_name, ".tsv"))
1. scrape_write_table(url, "Trend.AZ")

如果您对这 3 个代码中的任何一个提出任何意见或建议，我将不胜感激。我真的很急于转移到项目的分析部分，以便能够在课程结束时生成报告。

【问题讨论】：

你想只得到文章标题吗？
您没有指定要抓取的内容。
@NadPat 我抓取了所有可用的信息：标题、日期和时间、类别，每项使用不同的 CSS 选择器。但每次用 print(headline) 或 print(time) 查看时，最多只得到 9 条结果（[9]）。会不会只是终端窗口的显示限制，而实际上我已经抓取了整个页面？
@Bloxx 一个月内每条新闻的所有可用信息（标题、日期、时间、类别）。为节省篇幅，我只保留了 headline 的例子。我的想法是之后做模式分析，并按时间绘图。

标签： r web-scraping rvest

【解决方案1】：

网页是动态加载的，向下滚动时会加载新文章。因此您需要RSelenium 和rvest 来提取所需的数据。

启动浏览器

library(rvest)
library(RSelenium)

# Archive page to open in the automated browser.
url <- "https://en.trend.az/archive/2021-11-02"

# Start a Firefox session through RSelenium and load the page.
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]
remDr$navigate(url)

# Click outside, in an empty space
remDr$findElement(
  using = "xpath",
  value = "/html/body/div[1]/div/div[1]/h1"
)$clickElement()

webElem <- remDr$findElement("css", "body")
# Scroll to the end of the web page repeatedly so that all articles get loaded
for (i in seq_len(17)) {
  Sys.sleep(2)
  webElem$sendKeysToElement(list(key = "end"))
}

获取文章标题

# Parse the page source currently held by the browser and print the text of
# every article title.
remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-article") %>%
  html_nodes(".article-title") %>%
  html_text()
[1] "Chelsea defeats Malmö with minimum score"                                                                                                 
 [2] "Iran’s import of COVID-19 vaccine exceeds 146mn doses: IRICA"                                                                             
 [3] "Sadyr Zhaparov, Fumio Kishida discuss topical issues of Kyrgyz-Japanese relations"                                                        
 [4] "We will definitely see new names at World Championships and World Age Group Competitions in Trampoline Gymnastics in Baku - Farid Gayibov"
 [5] "Declaration on forest protection, land use adopted by 105 countries"                                                                      
 [6] "Russian Security Council's chief, CIA director meet in Moscow"                                                                            
 [7] "Israel to exhibit for 1st time at Dubai Airshow"                                                                                          
 [8] "Azerbaijan's General Prosecutor's Office continues to take measures on appeal against Armenia"                                            
 [9] "Azerbaijani, Russian FMs discuss activity of working group for restoration of communications in South Caucasus"                           
[10] "Russia holds tenth meeting of joint Azerbaijani-Russian Demarcation Commission"                                                           
[11] "Only external reasons cause inflation in Azerbaijan - Gazprombank"                                                                        
[12] "State Oil Fund of Azerbaijan launches tender for technical vendor support"

获取文章链接

# Select the article link nodes from the page source held by the browser.
# NOTE(review): this keeps the nodes themselves; extracting the URLs would
# presumably need html_attr("href") - confirm against the page markup.
lin <- remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-news-wrapper") %>%
  html_nodes(".article-link")

获取文章类别、日期和时间

# Print the raw meta text (category plus date and time) of every article.
remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_nodes(".category-article") %>%
  html_nodes(".article-meta") %>%
  html_text()
 [1] "\n                Other News\n                2 November 23:55\n            "
 [2] "\n                Society\n                2 November 23:14\n            "   
 [3] "\n                Kyrgyzstan\n                2 November 22:55\n            "
 [4] "\n                Society\n                2 November 22:51\n            "   
 [5] "\n                Other News\n                2 November 22:26\n            "
 [6] "\n                Russia\n                2 November 21:50\n            "    
 [7] "\n                Israel\n                2 November 21:24\n            "    
 [8] "\n                Politics\n                2 November 20:50\n            "  
 [9] "\n                Politics\n                2 November 20:25\n            "  
[10] "\n                Politics\n                2 November 20:16\n            "

【讨论】：

抱歉这么晚才更新，您的回答非常有帮助。一开始它无法运行，直到我把 Firefox 降级（到 93.0 版，64 位）才可以。另外，由于我没能让选择器 '.category-article .article-meta' 起作用，我改用另一种方式组合抓取到的数据：

all_df <- list()
trend_az.df <- data.frame( Number = c(get_number), Date = c(get_date), Time = c(get_time), Category = c(get_cat), Title = c(get_headline), Link = c(get_links))
all_df <- list(trend_az.df)
print(trend_az.df)