如果您浏览网页https://www.rosterresource.com/mlb-arizona-diamondbacks 并从表格的上下文菜单中选择检查元素,您将在浏览器开发人员工具中看到整个表格位于一个框架内:
<iframe id="pageswitcher-content" frameborder="0" marginheight="0" marginwidth="0" src="https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml/sheet?headers=false&gid=1569103012" style="display: block; width: 100%; height: 100%;"></iframe>
所以实际上您需要从该 Google 电子表格文档中检索数据。这可以通过 XHR 和 Regex 来完成,如下代码所示:
Option Explicit
Sub Test()
Dim sContent As String
Dim i As Long
Dim j As Long
Dim k As Long
Dim aTables()
Dim aHeader() As String
Dim aRows() As String
' Retrieve HTML content via XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", "https://www.rosterresource.com/mlb-arizona-diamondbacks", False
.Send
sContent = .ResponseText
End With
' Cut all before iframe URL
sContent = Split(sContent, "<iframe src=""", 2)(1)
' Cut all after ? sign within URL
sContent = Split(sContent, "?", 2)(0)
' Download google spreadsheet by extracted URL
' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vQngsjnOpqkD8FQIOLn4cFayZTe4dl5VJZLNjMzji2Iq0dVXan7nj20Pq6oKnVS_HFla9e5GUtCyYl_/pubhtml
' e. g. https://docs.google.com/spreadsheets/d/e/2PACX-1vSe6YBd7UW_ijhVHdRsM132Z3aUXUIzGuHcoqqdsr_nUXIYHbvRDFY0XCwGVndXJnWRaWVYhbeDbo5W/pubhtml
' Retrieve HTML content via XHR
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sContent, False
.Send
sContent = .ResponseText
End With
' Parse with RegEx
With CreateObject("VBScript.RegExp")
.Global = True
.MultiLine = True
.IgnoreCase = True
' Process all tables within iframe content
.Pattern = "<table\b[\s\S]*?>([\s\S]*?)</table>"
With .Execute(sContent)
ReDim aTables(0 To .Count - 1)
For i = 0 To .Count - 1
aTables(i) = .Item(i).SubMatches(0)
Next
End With
For k = 0 To UBound(aTables)
' Minor HTML simplification
sContent = aTables(k)
' Remove all tags except table formatting
.Pattern = "<(?!/td|/tr|/th|(?:td|tr|th)\b)[^>]*>|\r|\n|\t"
sContent = .Replace(sContent, "")
' Remove tags attributes
.Pattern = "<(\w+)\b[^>]+>"
sContent = .Replace(sContent, "<$1>")
' Replace th with td
.Pattern = "<(/?)th>"
sContent = .Replace(sContent, "<$1td>")
' Replace HTML entities &name; &#number; with chars
.Pattern = "&(?:\w+|#\d+);"
.Global = False
Do
With .Execute(sContent)
If .Count = 0 Then Exit Do
sContent = Replace(sContent, .Item(0), DecodeHTMLEntities(.Item(0)))
End With
Loop
.Global = True
' Extract rows
.Pattern = "<tr>((?:<td>.*?</td>)+)</tr>"
With .Execute(sContent)
ReDim aRows(0 To .Count - 1, 0)
For i = 0 To .Count - 1
aRows(i, 0) = .Item(i).SubMatches(0)
Next
End With
' Extract cells
.Pattern = "<td>(.*?)</td>"
For i = 0 To UBound(aRows, 1)
With .Execute(aRows(i, 0))
For j = 0 To .Count - 1
If UBound(aRows, 2) < j Then ReDim Preserve aRows(UBound(aRows, 1), j)
aRows(i, j) = Trim(.Item(j).SubMatches(0))
DoEvents
Next
End With
Next
aTables(k) = aRows
Next
End With
' Output
With ThisWorkbook
' Remove all existing worksheets
Application.DisplayAlerts = False
.Sheets.Add , .Sheets(.Sheets.Count)
Do While .Sheets.Count > 1
.Sheets(1).Delete
Loop
Application.DisplayAlerts = True
' Output each table to separate worksheet
For k = 0 To UBound(aTables)
If .Sheets.Count < (k + 1) Then .Sheets.Add , .Sheets(.Sheets.Count)
With .Sheets(k + 1)
.Cells.Delete
Output2DArray .Cells(1, 1), aTables(k)
.Columns.AutoFit
End With
Next
End With
End Sub
Function DecodeHTMLEntities(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
DecodeHTMLEntities = oDiv.innerText
End Function
Sub Output2DArray(oDstRng As Range, aCells As Variant)
With oDstRng
.Parent.Select
With .Resize( _
UBound(aCells, 1) - LBound(aCells, 1) + 1, _
UBound(aCells, 2) - LBound(aCells, 2) + 1)
.NumberFormat = "@"
.Value = aCells
End With
End With
End Sub
通常不建议将 RegEx 用于 HTML 解析,因此 there is disclaimer。在这种情况下处理的数据非常简单,这就是使用 RegEx 对其进行解析的原因。关于正则表达式:introduction(尤其是syntax)、introduction JS、VB flavor。简化使 HTML 代码在某种程度上适合解析。 BTW there is one more answer 使用相同的方法。