看看下面的例子:
Option Explicit
Sub Test()
Dim sResponse As String
Dim oOptions As Object
Dim i As Long
Dim vOption
' Retrieve search page web form HTML content
XmlHttpRequest "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results", sResponse
' Extract options
ExtractOptions sResponse, "sectorid", oOptions
' Prepare for output to first worksheet
With ThisWorkbook.Sheets(1)
.Cells.Delete
' Loop through each option
i = 1
For Each vOption In oOptions
Do
' Retrieve search results for the option
XmlHttpRequest "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results?sectorid=" & oOptions(vOption) & "&lo=2&filters=0%2C1%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C&page=1&tab=prices&dummy=" & Round(Rnd * 10000000000000#), sResponse
DoEvents
Loop Until InStr(sResponse, """totalResults"":""") > 0
' Extract total and output
.Cells(i, 1) = oOptions(vOption)
.Cells(i, 2) = vOption
.Cells(i, 3) = Split(Split(sResponse, """totalResults"":""", 2)(1), """", 2)(0)
.Columns.AutoFit
i = i + 1
DoEvents
Next
End With
MsgBox "Completed"
End Sub
Sub XmlHttpRequest(sURL As String, sResp As String)
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", sURL, True
.send
Do While .readyState <> 4
DoEvents
Loop
sResp = .responseText
End With
End Sub
Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)
Dim aTmp
Dim vItem
' Escape RegEx special characters
For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
sName = Replace(sName, vItem, "\" & vItem)
Next
' Extract the whole <select> for parameter
ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp, False
' Extract each parameter <option>
ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp(0)), aTmp, False
' Put each parameter and value into dictionary
Set oOptions = CreateObject("Scripting.Dictionary")
For Each vItem In aTmp
oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
Next
End Sub
Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)
Dim oMatch
Dim aTmp0()
Dim sSubMatch
If Not (IsArray(aData) And bAppend) Then aData = Array()
With CreateObject("VBScript.RegExp")
.Global = bGlobal
.MultiLine = bMultiLine
.IgnoreCase = bIgnoreCase
.Pattern = sPattern
For Each oMatch In .Execute(sResponse)
If oMatch.SubMatches.Count = 1 Then
PushItem aData, oMatch.SubMatches(0)
Else
If bNestSubMatches Then
aTmp0 = Array()
For Each sSubMatch In oMatch.SubMatches
PushItem aTmp0, sSubMatch
Next
PushItem aData, aTmp0
Else
For Each sSubMatch In oMatch.SubMatches
PushItem aData, sSubMatch
Next
End If
End If
Next
End With
End Sub
Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)
If Not (IsArray(aData) And bAppend) Then aData = Array()
ReDim Preserve aData(UBound(aData) + 1)
aData(UBound(aData)) = vItem
End Sub
Function GetInnerText(sText As String) As String
Static oHtmlfile As Object
Static oDiv As Object
If oHtmlfile Is Nothing Then
Set oHtmlfile = CreateObject("htmlfile")
oHtmlfile.Open
Set oDiv = oHtmlfile.createElement("div")
End If
oDiv.innerHTML = sText
GetInnerText = oDiv.innerText
End Function
我的输出如下:
通常不建议将 RegEx 用于 HTML 解析,因此 there is disclaimer。在这种情况下处理的数据非常简单,这就是使用 RegEx 对其进行解析的原因。关于正则表达式:introduction(尤其是syntax)、introduction JS、VB flavor。
顺便说一句,使用类似方法还有另一个答案:1、2、3、4、5、6。