【问题标题】:R: Scraping the hover text title of every cell in a table using rvestR:使用 rvest 抓取表格中每个单元格的悬停文本标题
【发布时间】:2015-11-08 13:19:49
【问题描述】:

我正在使用 rvest 从一些 javascript 表中抓取数据,例如此处的表。 https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd

如您所见,在此表中,每个单元格都有一个值,并且当您将鼠标悬停在上方时,还有另一个附加值。

我没有问题像这样使用 rvest 刮桌子:

tips <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd") %>%
   html_table(header=TRUE)

但我不确定如何抓取悬停值。我可以用 rvest 做到这一点吗?

【问题讨论】:

    标签: r


    【解决方案1】:

    您可以按如下方式扩展/修改rvest:::html_table.xml_node 方法:
    见 R cmets

    my_html_table <- function(x, header = NA, trim = TRUE, fill = FALSE, dec = ".", attr_name = ""){
      rows <- html_nodes(x, "tr")
      n <- length(rows)
      cells <- lapply(rows, "html_nodes", xpath = ".//td|.//th")
      ncols <- lapply(cells, html_attr, "colspan", default = "1")
      ncols <- lapply(ncols, as.integer)
      p <- unique(vapply(ncols, sum, integer(1)))
      if (length(p) > 1) {
        if (!fill) {
          stop("Table has inconsistent number of columns. ", 
               "Do you want fill = TRUE?", call. = FALSE)
        }
        else {
          p <- max(p)
        }
      }
    ############################# 
    ## The following line is the only one that was changed
    ############################# 
      values <- lapply(cells, html_attr, attr_name)
    # insted of
    # values <- lapply(cells, html_text, trim = trim)
      out <- matrix(NA_character_, nrow = n, ncol = p)
      for (i in seq_len(n)) {
        row <- values[[i]]
        ncol <- ncols[[i]]
        col <- 1
        for (j in seq_len(p)) {
          if (j > length(row)) 
            next
          out[i, col] <- row[[j]]
          col <- col + ncol[j]
        }
      }
      if (is.na(header)) {
        header <- all(html_name(cells[[1]]) == "th")
      }
      if (header) {
        col_names <- out[1, , drop = FALSE]
        out <- out[-1, , drop = FALSE]
      } else {
        col_names <- paste0("X", seq_len(ncol(out)))
      }
      df <- lapply(seq_len(p), function(i) {
        utils::type.convert(out[, i], as.is = TRUE, dec = dec)
      })
      names(df) <- col_names
      class(df) <- "data.frame"
      attr(df, "row.names") <- .set_row_names(length(df[[1]]))
      df
    }
    

    现在您可以执行以下操作将原始数据与工具提示数据“合并”

    require(rvest)
    doc <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
    x <- doc %>% html_node("table")
    tips <- x %>% my_html_table(attr_name = "title")
    dat <- doc %>% html_table() %>% `[[`(1)
    
    tips[,c(1,2,28,29)] <- dat[,c(1,2,28,29)]
    tips[1:2,] <- dat[1:2,]
    

    给你

    > head(tips)
    
                             X1          X2 X3      X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 X20 X21
    1 Leaderboard Select Round:        <NA> NA Overall  1  2  3  4  5   6   7   8   9  10  11  12  13  14  15  16  17
    2                      Rank        Name NA         NA NA NA NA NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA  NA
    3                         1   Gilly2311 NA    <NA> 11 53 42 72 51  41   5  45  20  75  39   3  41  18  13   7  58
    4                         2       Harts NA    <NA> 27 50 56 57 53  28  15  32   8  63  51   1  26  23  21   6  87
    5                         3 mygypsyrose NA    <NA>  3 49 64 62 35  61   1  37  20  47  53   9  23   0   7  15  38
    6                         4 Scraggie_93 NA    <NA> 19 58 81 32 39  31  12  54  35  44  44  17  25  15   7  16  38
      X22 X23 X24 X25 X26 X27    X28   X29
    1  18  19  20  21  22  23             
    2  NA  NA  NA  NA  NA  NA Margin Score
    3  74  52  11  62  80  15    888   146
    4  46  63  18  51  79   4    865   145
    5  48  56  19  12  78  11    748   142
    6  48  53   7  21  75  11    782   142
    

    看看 tipsdat 看看他们的结果

    【讨论】:

      【解决方案2】:

      或者用更少的努力去做:

      library(rvest)
      library(dplyr)
      
      pg <- read_html("https://tipping.portadelaidefc.com.au/comp/the-alberton-crowd")
      tips <- html_table(pg, header=TRUE)[[1]]
      
      bind_rows(lapply(html_nodes(pg, "tbody > tr"), function(x) {
        cbind.data.frame(t(c(html_text(html_nodes(x, "td"))[2], 
                             html_attr(html_nodes(x, "td.tooltip"), "title"))))
      }))
      
      ##                 1     2     3     4     5     6     7     8     9    10    11    12
      ##             (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr) (chr)
      ## 1       Gilly2311    11    53    42    72    51    41     5    45    20    75    39
      ## 2           Harts    27    50    56    57    53    28    15    32     8    63    51
      ## 3     mygypsyrose     3    49    64    62    35    61     1    37    20    47    53
      ## 4     Scraggie_93    19    58    81    32    39    31    12    54    35    44    44
      ## 5         Deb1967     4    33    54    60    35    72    21    53    20   118    66
      ## 6        svolaris     6    52    45    76    50    24    19    45    19    64    58
      ## 7     dazza power    14    56    61    45    62    54     2    64    60    40    36
      ## 8  Flamingoflames    28    33    35    83    34    76     1    17     9    83    46
      ## 9    FEARTHEBEARD    27    34    47    59    50    59     6    50     5    54    38
      ## 10        Jules23    11    35    57    47    42    65    34    38     4    61    37
      ## ..            ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...
      ## Variables not shown: 13 (chr), 14 (chr), 15 (chr), 16 (chr), 17 (chr), 18 (chr), 19
      ##   (chr), 20 (chr), 21 (chr), 22 (chr), 23 (chr), 24 (chr)
      

      重命名列、更改列类型并将其cbinding 到主表也非常简单。

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2018-05-05
        • 2018-06-27
        • 1970-01-01
        • 2017-03-01
        • 2017-03-06
        • 2015-09-08
        相关资源
        最近更新 更多