【问题标题】:Scraping innerHTML from sites using VBA(使用 VBA 从网站中抓取 innerHTML)
【发布时间】:2017-05-29 09:20:36
【问题描述】:

我正在尝试声明一个节点数组(这一步没有问题),然后抓取数组中每个元素的两个子节点的 innerHTML。以 SE 为例(使用 Internet Explorer 对象的方法),假设我想抓取页面上每个问题的标题和摘要:页面上有一个节点数组(类名:“question-summary”)。

每个节点下有两个子节点(标题 - 类名:“question-hyperlink”;摘要 - 类名:“excerpt”)。我使用的代码如下:

Sub Scraper()
' Question code as posted: loads a Stack Overflow tag page in Internet Explorer
' and tries to print the innerHTML of the title and excerpt child of every
' "question-summary" element. The two assignments inside the loop fail with
' "Object doesn't support this property or method".
Dim ie As Object
Dim doc As Object, oQuestionShells As Object, oQuestionTitle As Object, oQuestion As Object, oElement As Object
Dim QuestionShell As String, QuestionTitle As String, Question As String, sURL As String

' Late-bound Internet Explorer automation object
Set ie = CreateObject("internetexplorer.application")
sURL = "https://stackoverflow.com/questions/tagged/excel-formula"

' Class names: the container of each question, and the two children to read
QuestionShell = "question-summary"
QuestionTitle = "question-hyperlink"
Question = "excerpt"

With ie
    .Visible = False
    .Navigate sURL
End With

' NOTE(review): nothing here waits for the navigation to finish; the author
' relied on stepping through in the debugger so the document was ready.
Set doc = ie.Document 'Stepping through so doc is getting assigned (READY_STATE = 4)

' Collection of all elements carrying the container class
Set oQuestionShells = doc.getElementsByClassName(QuestionShell)

For Each oElement In oQuestionShells
    ' Both calls below use the singular name getElementByClassName, which is the
    ' call the reported error points at.
    Set oQuestionTitle = oElement.getElementByClassName(QuestionTitle) 'Assigning this object causes an "Object doesn't support this property or method"
    Set oQuestion = oElement.getElementByClassName(Question) 'Assigning this object causes an "Object doesn't support this property or method"
    Debug.Print oQuestionTitle.innerHTML
    Debug.Print oQuestion.innerHTML
Next

End Sub

【问题讨论】:

    标签: html vba excel web-scraping


    【解决方案1】:

    getElementByClassName 不是一种方法。

    您只能使用 getElementsByClassName(注意方法名称中的复数形式),它返回的是一个 IHTMLElementCollection。

    用 Object 代替 IHTMLElementCollection 来声明变量没有问题——但您仍然需要通过提供索引来访问集合中的特定元素。

    假设每个 oElement(即每个 question-summary 元素)内部只有一个 question-hyperlink 类的实例和一个 excerpt 类的实例,那么你可以直接使用 getElementsByClassName,并在末尾加上 (0) 来取出返回集合中的第一个元素。

    所以你的代码更正是:

    ' Plural getElementsByClassName returns a collection; (0) takes its first element
    Set oQuestionTitle = oElement.getElementsByClassName(QuestionTitle)(0)
    Set oQuestion = oElement.getElementsByClassName(Question)(0)
    

    完整的工作代码(有一些更新,即使用Option Explicit 并等待页面加载):

    Option Explicit
    
    Sub Scraper()
        ' Opens the Stack Overflow "excel-formula" tag page in Internet Explorer and
        ' prints, for every "question-summary" element, the innerHTML of its title
        ' link and of its excerpt to the Immediate window.
        '
        ' The browser is always closed before the procedure exits, including when a
        ' run-time error occurs; in that case the original error is re-raised after
        ' clean-up so the caller still sees it.
    
        ' Value of ReadyState once the document has finished loading
        Const READYSTATE_COMPLETE As Long = 4
    
        Dim ie As Object
        Dim doc As Object, oQuestionShells As Object, oQuestionTitle As Object, oQuestion As Object, oElement As Object
        Dim QuestionShell As String, QuestionTitle As String, Question As String, sURL As String
        Dim lErrNumber As Long, sErrSource As String, sErrDescription As String
    
        sURL = "https://stackoverflow.com/questions/tagged/excel-formula"
    
        ' Class names: the container of each question, and the two children to read
        QuestionShell = "question-summary"
        QuestionTitle = "question-hyperlink"
        Question = "excerpt"
    
        Set ie = CreateObject("internetexplorer.application")
    
        ' From here on a browser process exists, so every exit path must go
        ' through CleanUp to avoid leaving it running.
        On Error GoTo CleanUp
    
        With ie
            .Visible = True
            .Navigate sURL
            ' Yield to the message loop until the page has finished loading
            Do
                DoEvents
            Loop While .ReadyState < READYSTATE_COMPLETE Or .Busy
        End With
    
        Set doc = ie.Document
        Set oQuestionShells = doc.getElementsByClassName(QuestionShell)
    
        For Each oElement In oQuestionShells
            ' getElementsByClassName returns a collection; (0) takes the first match
            Set oQuestionTitle = oElement.getElementsByClassName(QuestionTitle)(0)
            Set oQuestion = oElement.getElementsByClassName(Question)(0)
    
            ' A summary without the expected child yields Nothing; skip it instead
            ' of failing on .innerHTML
            If Not oQuestionTitle Is Nothing Then Debug.Print oQuestionTitle.innerHTML
            If Not oQuestion Is Nothing Then Debug.Print oQuestion.innerHTML
        Next
    
    CleanUp:
        ' Remember any pending error before doing the clean-up work
        lErrNumber = Err.Number
        sErrSource = Err.Source
        sErrDescription = Err.Description
    
        If Not ie Is Nothing Then ie.Quit
        Set ie = Nothing
    
        ' Surface the original failure to the caller once the browser is closed
        If lErrNumber <> 0 Then Err.Raise lErrNumber, sErrSource, sErrDescription
    
    End Sub
    

    【讨论】:

      猜你喜欢
      • 2018-11-21
      • 2015-01-19
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2021-10-02
      • 1970-01-01
      相关资源
      最近更新 更多