【问题标题】:Wrapper value extraction using MSXML2.XMLHTTP使用 MSXML2.XMLHTTP 提取包装器值
【发布时间】:2021-05-25 01:16:49
【问题描述】:

我们目前正在使用 MSXML2.XMLHTTP 从网页中提取数据。我的代码已经能提取除 rvw-cnt-tx 类之外的所有数据。我想从以下 url 中提取评论数(即 “43 Reviews” 中的 43)。

url="https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759?boutiqueId=555784&merchantId=4171"

网页html:

<a href="/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759/yorumlar?boutiqueId=555784&merchantId=4171&v=11-12-yas" class="rvw-cnt-tx">43 Reviews </a>

我的代码

' Fragment from the question: fetch the product page and read values from the
' parsed HTML. NOTE(review): url, html, html1, brand and cat are not declared in
' this fragment - presumably html is an MSHTML.HTMLDocument created elsewhere; confirm.
Set http = CreateObject("MSXML2.XMLHTTP")                                   
' Third argument False makes the request synchronous, so Send blocks until done.
http.Open "GET", url, False                                                 
http.Send                                                                   
' Parse the raw response text by assigning it to the document body.
html.body.innerHTML = http.ResponseText                                     
html1 = html.body.innerHTML                                                 
brand = html.body.innerText                                                 
Dim reviews As String                                                       
' (0) takes the first element matching the class list; this line raises an
' error if no element matches.
cat = html.getElementsByClassName("breadcrumb full-width")(0).innerText     
' This is the line the question is about: the rvw-cnt-tx value is not extracted.
reviews = html.getElementsByClassName("rvw-cnt-tx")(0).innerText            

【问题讨论】:

    标签: javascript html vba web-scraping data-extraction


    【解决方案1】:

    该值是动态加载的。不过,您可以在当前网址的路径末尾拼接 /yorumlar 进入评论页面,该值在那里是静态存在的。我使用正则表达式从包含评论数量的文本中提取数字部分。

    html.querySelector(".title h3") 会把正则表达式的搜索范围限制在包含该值的节点文本内。

    Option Explicit
    
    ' Prints the review count shown on the product's /yorumlar (reviews) page.
    '
    ' Downloads the page synchronously, loads the response into an MSHTML
    ' document, takes the text of the first ".title h3" node and prints the
    ' first run of digits/commas found in it to the Immediate window.
    ' If the node is missing or contains no digits, a diagnostic message is
    ' printed instead of raising a run-time error.
    Public Sub GetReviewCount()
        'tools > references > Microsoft HTML Object Library
        Dim re As Object, html As MSHTML.HTMLDocument, xhr As Object
        Dim titleNode As Object, matches As Object

        Set re = CreateObject("VBScript.RegExp")
        Set xhr = CreateObject("MSXML2.XMLHTTP")
        Set html = New MSHTML.HTMLDocument
        ' One or more digits or thousands separators, e.g. "43" or "1,234".
        re.Pattern = "([0-9,]+)"

        With xhr
            ' False = synchronous request; .send blocks until the response arrives.
            .Open "GET", "https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759/yorumlar", False
            .setRequestHeader "User-Agent", "Mozilla/5.0"
            .send
            html.body.innerHTML = .responseText
        End With

        ' querySelector returns Nothing when the selector matches no element;
        ' dereferencing it directly would raise run-time error 91.
        Set titleNode = html.querySelector(".title h3")
        If titleNode Is Nothing Then
            Debug.Print "Review heading (.title h3) not found; HTTP status " & xhr.Status
            Exit Sub
        End If

        ' Execute returns an empty collection when the text has no digits;
        ' indexing item 0 of it would raise a run-time error.
        Set matches = re.Execute(titleNode.innerText)
        If matches.Count = 0 Then
            Debug.Print "No review count found in: " & titleNode.innerText
            Exit Sub
        End If

        Debug.Print matches(0).SubMatches(0)
    End Sub
    

    正确获取 cat 变量:

    Option Explicit
    
    ' Prints the product's breadcrumb trail, with the text of every
    ' ".breadcrumb .breadcrumb-item" node joined by " > ", to the Immediate window.
    Public Sub GetCat()
        'tools > references > Microsoft HTML Object Library
        Const PAGE_URL As String = "https://www.trendyol.com/lc-waikiki/erkek-cocuk-lacivert-takim-p-78215759?boutiqueId=555784&merchantId=4171"

        Dim doc As MSHTML.HTMLDocument
        Dim request As Object
        Dim crumbs As Object
        Dim trail As String
        Dim idx As Long

        Set request = CreateObject("MSXML2.XMLHTTP")
        Set doc = New MSHTML.HTMLDocument

        ' Synchronous GET (third argument False); the response text is parsed
        ' by assigning it to the document body.
        request.Open "GET", PAGE_URL, False
        request.setRequestHeader "User-Agent", "Mozilla/5.0"
        request.send
        doc.body.innerHTML = request.responseText

        Set crumbs = doc.querySelectorAll(".breadcrumb .breadcrumb-item")

        ' Put the separator before every item except the first, so the trail
        ' never ends with a dangling " > ".
        For idx = 0 To crumbs.Length - 1
            If idx > 0 Then trail = trail & " > "
            trail = trail & crumbs.Item(idx).innerText
        Next idx

        Debug.Print trail
    End Sub
    

    【讨论】:

    • 谢谢,不过页面中部带有类 pr-rnr-sm-p-s 的元素里也有这个计数
    • html 如下:
      42 条评论
      29 comments
    • 嗨,你想要评论的数量。我返回了正确数量的评论。我专门选择了我所做的源字符串,因为它是一个较短的字符串,因此搜索效率更高。
    • 感谢您的澄清。我正在用下面的代码提取评分,但结果不含小数部分,即只显示 4 而不是 4.6
    • rating = re.Execute(html.getElementsByClassName("pr-rnr-sm-p")(0).innerText)(0)
    猜你喜欢
    • 1970-01-01
    • 2019-12-10
    • 1970-01-01
    • 1970-01-01
    • 1970-01-01
    • 2017-04-22
    • 2017-12-25
    • 2021-12-20
    • 1970-01-01
    相关资源
    最近更新 更多