按 ID 抓取 HTML [关闭]答案

【问题标题】：Web scraping HTML by ID [closed]按 ID 抓取 HTML [关闭]
【发布时间】：2018-03-13 03:19:35
【问题描述】：

我想从这个site 中删除信息

如何根据下面的图片检索 176 ...

这是我尝试过的代码：

Option Explicit

Sub HL_Sectors()

    Dim XMLPage As New MSXML2.XMLHTTP60
    Dim HTMLDoc As New MSHTML.HTMLDocument
    Dim HTMLSector As MSHTML.IHTMLElement
    Dim HTMLSectorID As MSHTML.IHTMLElement
    Dim RowNum As Long, ColNum As Integer
    Dim HTMLSectorIDVal As MSHTML.IHTMLAttributeCollection
    Dim HTMLSectorValue As MSHTML.IHTMLElement

    XMLPage.Open "GET", "http://www.hl.co.uk/funds", False
    XMLPage.send
    HTMLDoc.body.innerHTML = XMLPage.responseText
    Debug.Print HTMLDoc.getElementById("fundSearch-detail").innerText

End Sub

寻求有关将价值添加到每个相应部门的建议。请参阅下面的代码。我在检索值时遇到了困难

Option Explicit


Sub HL_Sectors()

Dim XMLPage As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim HTMLSector As MSHTML.IHTMLElement
Dim HTMLSectorID As MSHTML.IHTMLElement
Dim RowNum As Long, ColNum As Integer

XMLPage.Open "GET", "http://www.hl.co.uk/funds", False
XMLPage.send

HTMLDoc.body.innerHTML = XMLPage.responseText

Set HTMLSector = HTMLDoc.getElementById("search-sector")

Range("A:B").ClearContents

    RowNum = 1
    For Each HTMLSectorID In HTMLSector.getElementsByTagName("option")

        ColNum = 1
            Cells(RowNum, ColNum) = HTMLSectorID.getAttribute("value")
            ColNum = ColNum + 1
            Cells(RowNum, ColNum) = HTMLSectorID.innerText
        RowNum = RowNum + 1
    Next HTMLSectorID

结束子

【问题讨论】：

做一些研究？ 'getElementByID'
感谢@Nathan_Sav 我试过但失败了。我什么都没得到
发布您尝试过的内容
当然这是我正在逐步完成的部分 VBA ...

标签： vba excel web-scraping xmlhttprequest

【解决方案1】：

看看下面的例子：

Option Explicit

Sub Test()

    Dim sResponse As String
    Dim oOptions As Object
    Dim i As Long
    Dim vOption

    ' Retrieve search page web form HTML content
    XmlHttpRequest "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results", sResponse
    ' Extract options
    ExtractOptions sResponse, "sectorid", oOptions
    ' Prepare for output to first worksheet
    With ThisWorkbook.Sheets(1)
        .Cells.Delete
        ' Loop through each option
        i = 1
        For Each vOption In oOptions
            Do
                ' Retrieve search results for the option
                XmlHttpRequest "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results?sectorid=" & oOptions(vOption) & "&lo=2&filters=0%2C1%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C&page=1&tab=prices&dummy=" & Round(Rnd * 10000000000000#), sResponse
                DoEvents
            Loop Until InStr(sResponse, """totalResults"":""") > 0
            ' Extract total and output
            .Cells(i, 1) = oOptions(vOption)
            .Cells(i, 2) = vOption
            .Cells(i, 3) = Split(Split(sResponse, """totalResults"":""", 2)(1), """", 2)(0)
            .Columns.AutoFit
            i = i + 1
            DoEvents
        Next
    End With
    MsgBox "Completed"

End Sub

Sub XmlHttpRequest(sURL As String, sResp As String)

    With CreateObject("MSXML2.XMLHTTP")
        .Open "GET", sURL, True
        .send
        Do While .readyState <> 4
            DoEvents
        Loop
        sResp = .responseText
    End With

End Sub

Sub ExtractOptions(sContent As String, ByVal sName As String, oOptions As Object)

    Dim aTmp
    Dim vItem

    ' Escape RegEx special characters
    For Each vItem In Array("\", "*", "+", "?", "^", "$", ".", "[", "]", "{", "}", "(", ")", "|", "/")
        sName = Replace(sName, vItem, "\" & vItem)
    Next
    ' Extract the whole <select> for parameter
    ParseResponse "<select[^>]* name=""?" & sName & """?[^>]*>[^<]*((?:<option[^>]*>[^<]*</option>[^<]*)+)[^<]*</[^>]*>", sContent, aTmp, False
    ' Extract each parameter <option>
    ParseResponse "<option[^>]*value=(""[^""]*""|[^\s>]*)[^>]*>([^<]*)</option>", (aTmp(0)), aTmp, False
    ' Put each parameter and value into dictionary
    Set oOptions = CreateObject("Scripting.Dictionary")
    For Each vItem In aTmp
        oOptions(GetInnerText((vItem(1)))) = GetInnerText(Replace(vItem(0), """", ""))
    Next

End Sub

Sub ParseResponse(sPattern, sResponse, aData, Optional bAppend As Boolean = True, Optional bNestSubMatches = True, Optional bGlobal = True, Optional bMultiLine = True, Optional bIgnoreCase = True)

    Dim oMatch
    Dim aTmp0()
    Dim sSubMatch

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    With CreateObject("VBScript.RegExp")
        .Global = bGlobal
        .MultiLine = bMultiLine
        .IgnoreCase = bIgnoreCase
        .Pattern = sPattern
        For Each oMatch In .Execute(sResponse)
            If oMatch.SubMatches.Count = 1 Then
                PushItem aData, oMatch.SubMatches(0)
            Else
                If bNestSubMatches Then
                    aTmp0 = Array()
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aTmp0, sSubMatch
                    Next
                    PushItem aData, aTmp0
                Else
                    For Each sSubMatch In oMatch.SubMatches
                        PushItem aData, sSubMatch
                    Next
                End If
            End If
        Next
    End With

End Sub

Sub PushItem(aData, vItem, Optional bAppend As Boolean = True)

    If Not (IsArray(aData) And bAppend) Then aData = Array()
    ReDim Preserve aData(UBound(aData) + 1)
    aData(UBound(aData)) = vItem

End Sub

Function GetInnerText(sText As String) As String

    Static oHtmlfile As Object
    Static oDiv As Object

    If oHtmlfile Is Nothing Then
        Set oHtmlfile = CreateObject("htmlfile")
        oHtmlfile.Open
        Set oDiv = oHtmlfile.createElement("div")
    End If
    oDiv.innerHTML = sText
    GetInnerText = oDiv.innerText

End Function

我的输出如下：

通常不建议将 RegEx 用于 HTML 解析，因此 there is disclaimer。在这种情况下处理的数据非常简单，这就是使用 RegEx 对其进行解析的原因。关于正则表达式：introduction（尤其是syntax）、introduction JS、VB flavor。

顺便说一句，使用类似方法还有另一个答案：1、2、3、4、5、6。

【讨论】：

感谢@omegastripes，它检索到了正确的值，我如何将它添加到每个扇区的当前代码中？非常感谢
@ddd 请显示预期输出以进一步了解。
请看我添加的图片，显示我想要实现的输出 - 谢谢！
@ddd 检查更新代码
哇！感谢您在此@omegastripes 上的所有时间，非常感谢

【解决方案2】：

当您选择使用XMLHTTP 请求从网页中抓取任何信息时，您应该在source code 而不是inspected element 中查找所需的内容。问题是当你寻找 inspecting element 的东西时，你总会找到静态内容以及动态生成的内容。由于XMLHTTP 请求无法处理dynamic content，因此您无法获取所需的值。但是，在这种情况下，您最好选择 IE。这是一个使用 IE 的演示，它可以为您提供内容。

Sub Scrape_Item()
    URL$ = "http://www.hl.co.uk/funds/fund-discounts,-prices--and--factsheets/search-results?sectorid=121&lo=2&filters=0%2C1%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C%7C&page=1&tab=prices"
    Dim elem As Object

    With CreateObject("InternetExplorer.Application")
        .Visible = False
        .navigate URL
        While .Busy = True Or .ReadyState < 4: DoEvents: Wend

        Set elem = .Document.querySelector("#fundSearch-detail strong")
        [A1] = elem.innerText
    End With
End Sub

输出：

顺便说一句，这就是源代码中内容（空白）的样子：

【讨论】：

嗨@SIM 您的代码运行良好现在我在...处收到“运行时错误'-2147467259。在代码中使用 CreateObject("InternetExplorer.Application") 行。任何想法如何我可以解决这个问题吗？已尝试使用 Google 搜索，但没有任何乐趣。非常感谢您的帮助 - 谢谢