【发布时间】:2021-08-05 14:56:35
【问题描述】:
这个问题是我发布的一个小系列的一部分,尝试和网络抓取brief profiles of https://echa.europa.eu/information-on-chemicals
代码使用公共函数GetUrl() 来检索所需简要资料的网址。然后使用 SubRoutine GetContents() 来抓取所需的物理和化学属性数据。
令人费解的是,我收到了一个运行时错误 91。这很奇怪,因为 GetContents() 和 GetUrl() 在相互独立时都可以工作。
有人不介意看看那会很棒吗。
Sub GetContents()
Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
Url = GetUrl()
xmlReq.Open "Get", Url, False
xmlReq.send
If xmlReq.Status <> 200 Then
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub
End If
HTMLDoc.body.innerHTML = xmlReq.responseText
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
Set SubSects = SubSectList.getElementsByTagName("dt")
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect
End Sub
Public Function GetUrl() As String
Const Url = "https://echa.europa.eu/search-for-chemicals?p_auth=5ayUnMyz&p_p_id=disssimplesearch_WAR_disssearchportlet&p_p_lifecycle=1&p_p_state=normal&p_p_col_id=_118_INSTANCE_UFgbrDo05Elj__column-1&p_p_col_count=1&_disssimplesearch_WAR_disssearchportlet_javax.portlet.action=doSearchAction&_disssimplesearch_WAR_disssearchportlet_backURL=https%3A%2F%2Fecha.europa.eu%2Finformation-on-chemicals%3Fp_p_id%3Ddisssimplesearchhomepage_WAR_disssearchportlet%26p_p_lifecycle%3D0%26p_p_state%3Dnormal%26p_p_mode%3Dview%26p_p_col_id%3D_118_INSTANCE_UFgbrDo05Elj__column-1%26p_p_col_count%3D1%26_disssimplesearchhomepage_WAR_disssearchportlet_sessionCriteriaId%3D"
Dim oHttp As Object, oHtml As HTMLDocument, MyDict As Object, I&, R&
Dim DictKey As Variant, payload$, searchKeyword$, Ws As Worksheet
Set oHtml = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set MyDict = CreateObject("Scripting.Dictionary")
Set Ws = ThisWorkbook.Worksheets("Sheet1")
'Keyword can Be any chemical usually set to a cell value i.e. Range("a1").Value
searchKeyword = "Acetone" '
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_formDate") = "1621017052777" 'timestamp
MyDict("_disssimplesearch_WAR_disssearchportlet_searchOccurred") = "true"
MyDict("_disssimplesearch_WAR_disssearchportlet_sskeywordKey") = searchKeyword
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
payload = ""
For Each DictKey In MyDict
payload = IIf(Len(DictKey) = 0, WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)), _
payload & "&" & WorksheetFunction.EncodeURL(DictKey) & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)))
Next DictKey
With oHttp
.Open "POST", Url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send (payload)
oHtml.body.innerHTML = .responseText
End With
Debug.Print oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
GetUrl = oHtml.querySelector("table.table > tbody > tr > td > a.substanceNameLink").getAttribute("href")
End Function
参考资料:
更新:特别奇怪的是,当在代码中给出确切的 url 时,以下代码可以工作:
Sub GetContents()
Dim xmlReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As New MSHTML.HTMLDocument
Dim SubSectList As MSHTML.IHTMLElement
Dim SubSects As MSHTML.IHTMLElementCollection
Dim SubSect As MSHTML.IHTMLElement
xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send
If xmlReq.Status <> 200 Then
MsgBox "Problem" & vbNewLine & xmlReq.Status & " - " & xmlReq.statusText
Exit Sub
End If
HTMLDoc.body.innerHTML = xmlReq.responseText
Set SubSectList = HTMLDoc.getElementsByClassName("col-xs-12 col-lg-10 MainContent")(1)
Set SubSects = SubSectList.getElementsByTagName("dt")
For Each SubSect In SubSects
Debug.Print SubSect.innerText & " : "; SubSect.NextSibling.innerText
Next SubSect
End Sub
然而替换
xmlReq.Open "Get", "https://echa.europa.eu/brief-profile/-/briefprofile/100.028.723", False
xmlReq.send
与
Url = GetUrl()
xmlReq.Open "Get", url, False
xmlReq.send
其中 GetUrl() 指的是上面的工作公共函数
导致代码中断.. 调试时将Set SubSects = SubSectList.getElementsByTagName("dt") 作为有问题的行。
【问题讨论】:
-
“运行时错误 91” - 这到底发生在哪里?
-
调试突出显示
Set SubSects = SubSectList.getElementsByTagName("dt")作为有问题的行 -
我怀疑这是问题所在,但
payload = IIf(Len(DictKey) = 0应该是payload = IIf(Len(payload) = 0 -
您的试验表明您的函数 GetURL() 中存在数据类型错误。也许它会返回一个变体,它应该返回一个字符串 - 类似的东西。
标签: excel vba web-scraping xmlhttprequest