【问题标题】:Retrieving Dynamic Webpage Content PyQt5检索动态网页内容 PyQt5
【发布时间】:2020-09-25 09:14:17
【问题描述】:

成功登录到受保护的网站之后,我想抓取同一网页中动态加载的部分内容。下面的代码块可以正确处理身份验证,但当我尝试访问类名为 lang-py 的 <pre> 标签元素时,得到的输出却是 None。

import sys

from PyQt5.QtCore import QByteArray, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineCore import QWebEngineHttpRequest
from PyQt5.QtWebEngineWidgets import QWebEnginePage


class Render(QWebEnginePage):
    """Load a URL with HTTP Basic credentials and capture the rendered HTML.

    Constructing an instance blocks: it starts the Qt event loop, waits for
    the page to finish loading, stores the page's HTML, and then quits the
    event loop. After construction the markup is available through ``html``.

    Args:
        url: Address of the page to load (parsed with ``QUrl.fromUserInput``).
        username: User name sent in the ``Authorization`` header.
        password: Password sent in the ``Authorization`` header.
    """

    def __init__(self, url, username="username", password="password"):
        # A QApplication must exist before any QWebEnginePage is created.
        # Reuse a running one so a second Render does not try to create a
        # second application object.
        app = QApplication.instance() or QApplication(sys.argv)
        super().__init__()
        self.loadFinished.connect(self._loadFinished)

        self._html = ""

        # HTTP Basic auth: base64("user:password") preceded by the scheme
        # name and a single space (no colon after "Basic").
        credentials = ("%s:%s" % (username, password)).encode()
        token = QByteArray(credentials).toBase64()
        request = QWebEngineHttpRequest(QUrl.fromUserInput(url))
        request.setHeader(b"Authorization", b"Basic " + token.data())

        self.load(request)

        # Blocks until handle_to_html() calls QApplication.quit().
        app.exec_()

    @property
    def html(self):
        """HTML captured after the load finished ("" if nothing was captured)."""
        return self._html

    def _loadFinished(self):
        # toHtml() is asynchronous; the markup is delivered to the callback.
        self.toHtml(self.handle_to_html)

    def handle_to_html(self, html):
        """Store the delivered HTML and stop the event loop."""
        self._html = html
        QApplication.quit()


def main():
    """Render the target page and print the HTML that was captured."""
    page = Render(
        "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
    )
    rendered_html = page.html
    print(rendered_html)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

如何加载<pre> 中的内容?

【问题讨论】:

    标签: python pyqt5


    【解决方案1】:

    带有标签“pre”和类“lang-py”的元素存在于html中,因此您可以使用BeautifulSoup获取数据:

    # ...
    from bs4 import BeautifulSoup
    
    # ...
    
    def main():
        """Render the page and print the text of every <pre class="lang-py">."""
        page = Render(
            "https://stackoverflow.com/questions/64055445/scraping-websites-with-protected-content-using-pyqt5/64055601?noredirect=1#comment113272437_64055601"
        )
        document = BeautifulSoup(page.html, "html.parser")
        separator = "=" * 50
        # Each match is preceded by a separator line so snippets stay distinct.
        for code_block in document.find_all("pre", class_="lang-py"):
            print(separator)
            print(code_block.get_text())
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2013-07-01
      • 1970-01-01
      • 2019-01-13
      • 2012-10-07
      • 2019-02-20
      • 2017-09-19
      • 2020-05-24
      • 2010-11-16
      相关资源
      最近更新 更多