【发布时间】:2022-01-13 10:09:15
【问题描述】:
以下代码通过抓取 ECHA 网站检索 A 列中物质的“档案网址”。我正在尝试错误处理无法找到物质 URL 的情况。
我不太明白为什么下面的代码会失败。我用评论突出显示了有问题的行。这在调试中突出显示为需要对象错误,但我看不出哪里出错了。
Sub PopulateExposures()
Dim url, rw As Range
Set rw = Sheets("data").Range("A2:E2") 'first row with inputs
Do While Application.CountA(rw) > 0
url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL
rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row
Set rw = rw.Offset(1, 0) 'next substance
Loop
End Sub
Public Function SubstanceUrl(SubstanceName, CASNumber) As String
Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _
"p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _
"p_p_state=normal&p_p_mode=view&" & _
"__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction"
Dim oHTML, oHttp, MyDict, payload, DictKey, sep
Set oHTML = New HTMLDocument
Set oHttp = CreateObject("MSXML2.XMLHTTP")
Set MyDict = CreateObject("Scripting.Dictionary")
MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName
MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true"
MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on"
payload = ""
For Each DictKey In MyDict
payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey))
sep = "&"
Next DictKey
With oHttp
.Open "POST", url, False
.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36"
.setRequestHeader "Content-type", "application/x-www-form-urlencoded"
.send payload
oHTML.body.innerHTML = .responseText
End With
'PROBLEMATIC CODE
If oHTML.querySelector(".details").getAttribute("href") Is Error Then
SubstanceUrl = "-"
Else
'Sometimes output changes despite same input
SubstanceUrl = oHTML.querySelector(".details").getAttribute("href")
End If
Debug.Print SubstanceUrl
End Function
Function ExposureData(urlToGet)
Dim XMLReq As New MSXML2.XMLHTTP60
Dim HTMLDoc As HTMLDocument, dds
Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data
Route(1) = "sGeneralPopulationHazardViaInhalationRoute"
Route(2) = "sGeneralPopulationHazardViaDermalRoute"
Route(3) = "sGeneralPopulationHazardViaOralRoute"
XMLReq.Open "Get", urlToGet & "/7/1", False
XMLReq.send
If XMLReq.Status <> 200 Then
Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText
Else
Set HTMLDoc = New HTMLDocument
HTMLDoc.body.innerHTML = XMLReq.responseText
For c = 1 To UBound(Route, 1)
Set Info = HTMLDoc.getElementById(Route(c))
If Not Info Is Nothing Then
Set Info = Info.NextSibling.NextSibling.NextSibling
Set dds = Info.getElementsByTagName("dd")
If dds.Length > 1 Then
Results(c) = dds(1).innerText
Else
Results(c) = "hazard unknown"
End If
Else
Results(c) = "no info"
End If
Next c
End If
ExposureData = Results
End Function
要运行此代码,A 列中必须存在值。丙酮和苯可分别用于测试 2 行。测试错误处理输入像 Benzenjaj 这样的东西。
我认为这是一个快速修复。就是看不出来。
更新:
前 2 个结果正常,但合成的化学物质导致以下错误:
-
代码:
Sub PopulateExposures() Dim url, rw As Range
Set rw = Sheets("data").Range("A2:E2") 'first row with inputs Do While Application.CountA(rw) > 0 url = SubstanceUrl(rw.Cells(1).Value, rw.Cells(2).Value) 'get the URL rw.Cells(5).Resize(1, 3).Value = ExposureData(url) 'get exposure data (as array) and add to row Set rw = rw.Offset(1, 0) 'next substance Loop结束子
公共函数 SubstanceUrl(SubstanceName, CASNumber) As String
Const url = "https://echa.europa.eu/information-on-chemicals/registered-substances?" & _ "p_p_id=dissregisteredsubstances_WAR_dissregsubsportlet&p_p_lifecycle=1&" & _ "p_p_state=normal&p_p_mode=view&" & _ "__dissregisteredsubstances_WAR_dissregsubsportlet_javax.portlet.action=dissRegisteredSubstancesAction" Dim oHTML, oHttp, MyDict, payload, DictKey, sep Set oHTML = New HTMLDocument Set oHttp = CreateObject("MSXML2.XMLHTTP") Set MyDict = CreateObject("Scripting.Dictionary") MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_name") = SubstanceName MyDict("_dissregisteredsubstances_WAR_dissregsubsportlet_disreg_cas-number") = CASNumber MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimer") = "true" MyDict("_disssimplesearchhomepage_WAR_disssearchportlet_disclaimerCheckbox") = "on" payload = "" For Each DictKey In MyDict payload = payload & sep & DictKey & "=" & WorksheetFunction.EncodeURL(MyDict(DictKey)) sep = "&" Next DictKey With oHttp .Open "POST", url, False .setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36" .setRequestHeader "Content-type", "application/x-www-form-urlencoded" .send payload oHTML.body.innerHTML = .responseText End With On Error Resume Next 'ignore error on following line SubstanceUrl = oHTML.querySelector(".details").getAttribute("href") On Error GoTo 0 'stop ignoring errors If Len(SubstanceUrl) = 0 Then SubstanceUrl = "<no URL>"结束函数
函数曝光数据(urlToGet)
Dim XMLReq As New MSXML2.XMLHTTP60 Dim HTMLDoc As HTMLDocument, dds Dim Route(1 To 3) As String, Results(1 To 3) As String, c, Info, Data Route(1) = "sGeneralPopulationHazardViaInhalationRoute" Route(2) = "sGeneralPopulationHazardViaDermalRoute" Route(3) = "sGeneralPopulationHazardViaOralRoute" XMLReq.Open "Get", urlToGet & "/7/1", False XMLReq.send If XMLReq.Status <> 200 Then Results(1) = "Problem" & vbNewLine & XMLReq.Status & " - " & XMLReq.statusText Else Set HTMLDoc = New HTMLDocument HTMLDoc.body.innerHTML = XMLReq.responseText For c = 1 To UBound(Route, 1) Set Info = HTMLDoc.getElementById(Route(c)) If Not Info Is Nothing Then Set Info = Info.NextSibling.NextSibling.NextSibling Set dds = Info.getElementsByTagName("dd") If dds.Length > 1 Then Results(c) = dds(1).innerText Else Results(c) = "hazard unknown" End If Else Results(c) = "no info" End If Next c End If ExposureData = Results结束函数
【问题讨论】:
-
Is运算符比较 2 个对象。 getAttribute 方法返回一个不是 Object 的 String。 -
如何将此字符串的输出设置为对象以检查 URL 是否存在?
-
@BrianMStafford 我不明白底部模块中的
Info是一个字符串,但Is运算符似乎可以工作? -
Info不是字符串。它是一个包含 Object 的 Variant,如Set Info =行所示。您只需要对象的 Set 关键字。此外,最好将 Info 定义为正确的类型。类似于Dim Info As IHTMLElement。
标签: vba web-scraping error-handling