【发布时间】:2019-08-16 15:42:07
【问题描述】:
这是一段应该连接到网页的代码,内容如下:link1, description1, otherdata1, link2, description2, otherdata2, ..., linkN, descriptionN, otherdataN 其中 N 是 30 000 +。
从这些链接中,程序使用正则表达式找到一个感兴趣的链接,转到该链接,并从那里下载一个文件。
我的问题是:在 htmlWebInterfaceXML.send 程序经常但并非总是会耗尽内存('out-of-memory' 错误)。我很难测试不同的解决方案,因为通常程序运行平稳,如果有变化也很难注意到。
附加信息:
- 它在某些 PC 上运行流畅,在其他 PC 上运行不流畅
- 它通常运行到下午然后抛出错误
其他可能有用的信息:
- 显示的代码是类的私有方法,类本身是源代码的一小部分
- 其他我不解释的调用的subs不相关并且运行顺利,problem总是出现在htmlWebInterfaceXML.send。
我的一个猜测是,我在一个函数内部声明了局部变量,该函数包含一个非常大的对象并可能导致堆栈溢出,但似乎不太可能,因为 VBA 应该自己处理这些事情。 也许您发现了一个我没有发现的问题?谢谢。
Private Sub FileUpload()
' THE FUNCTION CANNOT BE CONNECTING FOR EACH CONTRACT ID! WILL TAKE TOO MUCH TIME - NEED TO ALTER
Dim member As Variant
Dim byteCounter As Byte
Dim byteMaxID As Byte
Dim strPathToXMLFile As String
Dim strURLToXMLFile
Dim strXMLFileStorageName As String
Dim domdocXMLText As New MSXML2.DOMDocument
Dim clctStrFoundMatches As New Collection
Dim clctInternalIDs As New Collection
Dim vrntContractID As Variant
Dim htmlHTMLMainPageXMLInterface As New MSHTML.HTMLDocument
Dim htmlTagElement As Variant
Dim htmclctFoundXMLs As MSHTML.IHTMLElementCollection
Dim htmlWebInterfaceXML As MSXML2.XMLHTTP60
Dim intNumberOfTradeUnderProcessing As Integer
UpdateProgressStatus "LOADING SERVER WITH SOURCE XML..." '<----------- UPDATE PROGRESS!
'----------------------> OPEN AND LOAD THE WEB SERVER, AND STORE ITS HTML INTO AN OBJECT
Set htmlWebInterfaceXML = New MSXML2.XMLHTTP
With htmlWebInterfaceXML
.Open "GET", p_cstrWebInterfaceXMLRootDirectory, False
.setRequestHeader "Authorization", "Basic" & Base64Encode( _
p_cstrXMLWebInterfaceAuthenticationUser & ":" & p_cstrXMLWebInterfaceAuthenticationPassword)
.setRequestHeader "Pragma", "no-cache"
.setRequestHeader "Cache-Control", "no-cache"
.setRequestHeader "If-Modified-Since", "Sat, 1 Jan 2000 00:00:00 GMT"
.send
End With
htmlHTMLMainPageXMLInterface.body.innerHTML = htmlWebInterfaceXML.responseText ' how much text is the htmldocument able to store??
Set htmlWebInterfaceXML = Nothing
SetUpDirectory ' ------------> create or set directory where to store XML files
If Me.ContractiDs.Count <> Me.MailParts.Count And Me.ContractiDs.Count <> Me.MailParts.Count * 2 Then
Err.Raise 1504, "FileUpload", p_cstrError1504Message
Else
For Each vrntContractID In Me.ContractiDs
intNumberOfTradeUnderProcessing = intNumberOfTradeUnderProcessing + 1
UpdateProgressStatus "LOADING XML FOR THE TRADE NUMBER " & intNumberOfTradeUnderProcessing & "..." ' ----------------> UPDATE STATUS BAR
' ------------------------> find the tags containing the needed contract id in their names
Set htmclctFoundXMLs = htmlHTMLMainPageXMLInterface.getElementsByTagName("a")
Set clctStrFoundMatches = New Collection
For Each htmlTagElement In htmclctFoundXMLs
If htmlTagElement.getAttribute("href") Like "*" & vrntContractID & "*" Then
clctStrFoundMatches.Add htmlTagElement
End If
Next htmlTagElement
If clctStrFoundMatches.Count = 0 Then Err.Raise 1506, "FileUpload", p_cstrError1506Message
' -----------------------> exclude the archives from the collection
byteCounter = 0
For byteCounter = 1 To clctStrFoundMatches.Count
If blnContainsPattern("\.gz$", clctStrFoundMatches(byteCounter).innerText) Then
clctStrFoundMatches.Remove byteCounter
End If
Next byteCounter
' ----------------------> extract the contract ids and find the last contract id
Set clctInternalIDs = New Collection
For Each member In clctStrFoundMatches
clctInternalIDs.Add strReturnSingleMatch("\d{9}", member.innerText)
If clctInternalIDs(clctInternalIDs.Count) = "False" Then Err.Raise 1505, "FileUpload", p_cstrError1505Message
Next member
byteMaxID = FindMaximum(clctInternalIDs)
strPathToXMLFile = clctStrFoundMatches(byteMaxID).innerText
' -----------------------> check whether such file exists, and, if not, download it
If blnFileExists(strPathToXMLFile, p_cstrXMLDestination) Then
Else
strURLToXMLFile = p_cstrWebInterfaceXMLRootDirectory & strPathToXMLFile
Set htmlWebInterfaceXML = Nothing: Set htmlWebInterfaceXML = New MSXML2.XMLHTTP
htmlWebInterfaceXML.Open "GET", strURLToXMLFile, False
htmlWebInterfaceXML.setRequestHeader "Authorization", "Basic" & Base64Encode( _
p_cstrXMLWebInterfaceAuthenticationUser & ":" & p_cstrXMLWebIntervaceAuthenticationPassword)
htmlWebInterfaceXML.send
With domdocXMLText
.validateOnParse = False
.async = False
End With
domdocXMLText.LoadXML htmlWebInterfaceXML.responseText
domdocXMLText.Save p_cstrXMLDestination & "\" & strPathToXMLFile
End If
Next vrntContractID
End If
Set htmlHTMLMainPageXMLInterface = Nothing
End Sub
【问题讨论】:
-
4) 您通常可以使用更高效的 CSS 选择器(即 querySelectorAll)移除像 If htmlTagElement.getAttribute("href") Like "*" & vrntContractID & "*" 这样的循环("href*=" & vrntContractID) .... 回到第 2 点)如果您是通过 html 进行正则表达式或只是返回字符串,则实际上甚至无法从上面确定,但估计仍然有更好的方法来提取源 HTML具体链接。
-
@QHarr 嘿,感谢您的回复。我通过 html 进行正则表达式,因为它的标签很差。特别是,对我来说至关重要的链接的日期和时间根本没有包含在标签中,而是“挂在空间中”。未显示的子程序在几毫秒内表现出色,它们所做的根本不相关(实际上,显示的代码可能是整个源代码的 0.5%)。问题总是出现在 htmlWebInterfaceXML.send。感谢您提供宝贵的cmets
-
@QHarr 谢谢!实际上这个 sub 是一个类的私有方法。简而言之,这个类解析一个电子邮件,用它的内容做了一堆东西(不幸的是,也用正则表达式),还上传了必要的文件,如这个 sub 所示。这是否从表单运行,取决于用户行为:用户拖放电子邮件,在这种情况下触发事件,或者还从用户表单中的按钮单击启动对类的调用序列.此类本身是在 main() 子中实例化的。希望这是描述性的,是的,网页是一场噩梦
-
@QHarr 指的是您的最后一条评论:从 IE 加载该页面也需要很长时间,我真的不知道为什么,30 000 + 一定不是 HTML 那么多?如果我们能解决“内存不足”问题,我会很高兴
-
@QHarr 再次感谢您,我会检查所有链接并在一段时间内提供更新。
标签: vba performance web-scraping xmlhttprequest out-of-memory