【问题标题】:Sub and Function Work Independently but not togetherSub 和 Function 独立工作但不能一起工作
【发布时间】:2021-08-05 14:56:35
【问题描述】:

这个问题是我发布的一个小系列的一部分,尝试和网络抓取brief profiles of https://echa.europa.eu/information-on-chemicals

代码使用公共函数GetUrl() 来检索所需简要资料的网址。然后使用 SubRoutine GetContents() 来抓取所需的物理和化学属性数据。

令人费解的是,我收到了一个运行时错误 91。这很奇怪,因为 GetContents() 和 GetUrl() 在相​​互独立时都可以工作。

有人不介意看看那会很棒吗。

        Sub GetContents()

Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
            
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
                    
Url = GetUrl()
                    
xmlReq.Open "Get", Url, False
xmlReq.send
            
           
If xmlReq.Status <> 200 Then
            
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub

End If
            
HTMLDoc.body.innerHTML = xmlReq.responseText
            
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
            
Set SubSects = SubSectList.getElementsByTagName("dt")
              
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect

End Sub


Public Function GetUrl() As String
        Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
        Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
        Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
        
        Set oHtml = New HTMLDocument
        Set oHttp = CreateObject("MSXML2.XMLHTTP")
        Set MyDict = CreateObject("Scripting.Dictionary")
        Set Ws = ThisWorkbook.Worksheets("Sheet1")
    
'Keyword can Be any chemical usually set to a cell value i.e. Range("a1").Value
        searchKeyword = "Acetone" '
        
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
        MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
        MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
        MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
    
        payload = ""
        For Each DictKey In MyDict
            payload = IIf(Len(DictKey) = 0, WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)), _
            payload & "&" & WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)))
        Next DictKey
        
        With oHttp
            .Open "POST", Url, False
            .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
            .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
            .send (payload)
            oHtml.body.innerHTML = .responseText
        End With
        
        Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
        GetUrl = oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
    
    End Function

参考资料:

更新:特别奇怪的是,当在代码中给出确切的 url 时,以下代码可以工作:

Sub GetContents()

Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
            
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
                    
xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send
            
            
If xmlReq.Status <> 200 Then
            
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub

End If
            
HTMLDoc.body.innerHTML = xmlReq.responseText
            
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
            
Set SubSects = SubSectList.getElementsByTagName("dt")

            
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect

End Sub

然而替换

xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send

Url = GetUrl()
xmlReq.Open "Get", url, False
    xmlReq.send

其中 GetUrl() 指的是上面的工作公共函数

导致代码中断.. 调试时将Set SubSects = SubSectList.getElementsByTagName("dt") 作为有问题的行。

运行答案中提供的代码时更新结果的屏幕截图:

【问题讨论】:

  • “运行时错误 91” - 这到底发生在哪里?
  • 调试突出显示Set SubSects = SubSectList.getElementsByTagName("dt")作为有问题的行
  • 我怀疑这是问题所在,但payload = IIf(Len(DictKey) = 0 应该是payload = IIf(Len(payload) = 0
  • 您的试验表明您的函数 GetURL() 中存在数据类型错误。也许它会返回一个变体,它应该返回一个字符串 - 类似的东西。

标签: excel vba web-scraping xmlhttprequest


【解决方案1】:

您提取了错误的 url,并且该 URI 的 html 中没有 dt 元素。更改css选择器并简化如下:

Option Explicit

Public Sub GetContents()
    Dim XMLReq As New MSXML2.XMLHTTP60
    Dim HTMLDoc As New MSHTML.HTMLDocument
        
    XMLReq.Open "Get", GetUrl, False
    XMLReq.send
                  
    HTMLDoc.body.innerHTML = XMLReq.responseText
    
    Dim i As Long
    
    With HTMLDoc.querySelectorAll(".EndpointContent dt")
        For i = 0 To .Length - 1
            Debug.Print .Item(i).innerText & " : " & .Item(i).NextSibling.NextSibling.innerText
        Next
    End With
End Sub

Public Function GetUrl() As String
    Const url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
    Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, i&, R&
    Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
        
    Set oHtml = New HTMLDocument
    Set oHttp = CreateObject("MSXML2.XMLHTTP")
    Set MyDict = CreateObject("Scripting.Dictionary")
    Set Ws = ThisWorkbook.Worksheets("Sheet1")
    
    'Keyword can Be any chemical usually set to a cell value i.e. Range("a1").Value
    searchKeyword = "Acetone"                    '
        
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
    MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
    MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
    MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
    
    payload = ""
        
    For Each DictKey In MyDict
        payload = IIf(Len(DictKey) = 0, WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)), _
                      payload & "&" & WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)))
    Next DictKey
        
    With oHttp
        .Open "POST", url, False
        .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
        .setRequestHeader "Content-type", "application/x-www-form-urlencoded"
        .send (payload)
        oHtml.body.innerHTML = .responseText
    End With
        
    Debug.Print oHtml.querySelector(".briefProfileLink").getAttribute("href")
    GetUrl = oHtml.querySelector(".briefProfileLink").getAttribute("href")
    
End Function

【讨论】:

  • 我的代码,正如所写的那样,对我来说运行没有错误。你拉错了 url,因为你使用了错误的 css 选择器。
  • 嗯,我不明白,将您的答案复制到新模块中仍然会导致此错误。也许我的电脑?感谢您的帮助!
  • 错误信息是什么,在哪一行?
  • 我用屏幕截图替换了更新。 url 被打印,然后它开始根据需要打印属性,但在C Odour: C Substance Type 之后我得到Run-time error '91': Object Variable or with block variable not set。使用 debig 将突出显示的行显示为问题。
  • 刚刚在另一台电脑上测试了它,它工作正常。奇怪
猜你喜欢
  • 2020-01-20
  • 1970-01-01
  • 1970-01-01
  • 2011-07-08
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 1970-01-01
  • 2016-05-18
相关资源
最近更新 更多