【Question Title】: Webscraping with BeautifulSoup and json
【Posted】: 2026-01-18 17:10:01
【Question】:

I'm trying to build a web scraper that pulls historical price data for cryptocurrencies, but when I try to print the data, the output just doesn't read. Here is the code:

    # Libraries
    import requests
    from bs4 import BeautifulSoup
    import json
    import time
    import pandas as pd

    coins = {}

    cm = requests.get('https://coinmarketcap.com/')
    soup = BeautifulSoup(cm.content, 'html.parser')

    data = soup.find('script', id="__NEXT_DATA__", type="application/json")

    coin_data = json.loads(data.contents[0])
    listings = coin_data['props']['initialState']['cryptocurrency']['listingLatest']['data']

    for i in listings:
        coins[str(i['id'])] = i['slug']

    for i in coins:
        page = requests.get(f'https://coinmarketcap.com/currencies/{coins[i]}/historical-data/?20210101&20210627')

    soup = BeautifulSoup(page.content, 'html.parser')
    data = soup.find('script', id="__NEXT_DATA__", type="application/json")
    historical_data = json.loads(data.contents[0])

    print(data.cardano)

【Discussion】:

    Tags: python json web-scraping beautifulsoup


    【Solution 1】:

    If you view that page in your browser and record the browser's network traffic while viewing the historical data, you'll see an HTTP GET request being made to a REST API that serves JSON containing all the information you could want. All you have to do is imitate that request - no BeautifulSoup or Pandas required:

    def get_historical_data(currency_id, start, end):
        import requests
    
        url = "https://api.coinmarketcap.com/data-api/v3/cryptocurrency/historical"
    
        params = {
            "id": currency_id,
            "convertId": "2781",  # seems to be USD
            "timeStart": start,
            "timeEnd": end
        }
    
        headers = {
            "accept": "application/json",
            "accept-encoding": "gzip, deflate",
            "user-agent": "Mozilla/5.0"
        }
    
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()
    
        for quote in response.json()["data"]["quotes"]:
            yield quote["timeClose"], quote["quote"]["close"]
    
    def main():
    
        from datetime import datetime
    
        start = str(round(datetime(2021, 1, 1).timestamp()))
        end = str(round(datetime.now().timestamp()))
    
        currency_ids = {
            "BTC": "1"
        }
    
        for time_close, close in get_historical_data(currency_ids["BTC"], start, end):
            print("[{}]: {}".format(time_close, close))
    
        return 0
    
    
    if __name__ == "__main__":
        import sys
        sys.exit(main())
    

    Output:

    [2021-01-02T23:59:59.999Z]: 32127.27
    [2021-01-03T23:59:59.999Z]: 32782.02
    [2021-01-04T23:59:59.999Z]: 31971.91
    [2021-01-05T23:59:59.999Z]: 33992.43
    [2021-01-06T23:59:59.999Z]: 36824.36
    [2021-01-07T23:59:59.999Z]: 39371.04
    [2021-01-08T23:59:59.999Z]: 40797.61
    [2021-01-09T23:59:59.999Z]: 40254.55
    [2021-01-10T23:59:59.999Z]: 38356.44
    [2021-01-11T23:59:59.999Z]: 35566.66
    [2021-01-12T23:59:59.999Z]: 33922.96
    ...
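
A minimal sketch of collecting the `(timeClose, close)` pairs that `get_historical_data` yields into a pandas DataFrame (since the question already imports pandas). The payload below is a hard-coded sample in the same `data -> quotes` shape as the API response; the numbers are illustrative placeholders, not real market data:

```python
import pandas as pd

# Sample payload mimicking the API's "data" -> "quotes" structure;
# the values are illustrative, not real market data.
sample = {
    "data": {
        "quotes": [
            {"timeClose": "2021-01-02T23:59:59.999Z", "quote": {"close": 32127.27}},
            {"timeClose": "2021-01-03T23:59:59.999Z", "quote": {"close": 32782.02}},
        ]
    }
}

# The same pairs that get_historical_data() yields per quote.
rows = [(q["timeClose"], q["quote"]["close"]) for q in sample["data"]["quotes"]]

df = pd.DataFrame(rows, columns=["time_close", "close"])
df["time_close"] = pd.to_datetime(df["time_close"])
print(df)
```

In a real run, you would build `rows` from `get_historical_data(currency_ids["BTC"], start, end)` instead of the sample dict.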
    

    【Discussion】:

      【Solution 2】:
      from bs4 import BeautifulSoup
      import httpx
      import trio
      import json
      
      mainurl = "https://coinmarketcap.com/"
      
      
      async def main():
          async with httpx.AsyncClient(timeout=None) as client:
              r = await client.get(mainurl)
              soup = BeautifulSoup(r.text, 'lxml')
              data = json.loads(soup.select_one('#__NEXT_DATA__').string)[
                  'props']['initialState']['cryptocurrency']['listingLatest']['data']
      
              coins = {k['id']: k['slug'] for k in data}
      
              for coin in coins.values():
                  r = await client.get(f'https://coinmarketcap.com/currencies/{coin}/historical-data/?20210101&20210627')
                  soup = BeautifulSoup(r.text, 'lxml')
                  data = json.loads(soup.select_one('#__NEXT_DATA__').string)
                  print(data)
      
      if __name__ == "__main__":
          trio.run(main)
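
The final `print(data)` above dumps the entire `__NEXT_DATA__` blob. To pull out just the quotes, you have to walk the nested dict; since the exact key path inside `props.initialState` on the historical-data page is not guaranteed, the payload below is a hypothetical stand-in (`"ohlcvHistorical"` is an assumed name, not verified) used only to demonstrate the traversal:

```python
# Hypothetical __NEXT_DATA__ fragment; the real key path on the
# historical-data page may differ ("ohlcvHistorical" is an assumed name).
next_data = {
    "props": {"initialState": {"cryptocurrency": {
        "ohlcvHistorical": {"1": {"quotes": [
            {"quote": {"USD": {"close": 32127.27}}}
        ]}}
    }}}
}

def dig(d, *keys):
    """Walk a nested dict, returning None as soon as a key is missing."""
    for k in keys:
        if not isinstance(d, dict) or k not in d:
            return None
        d = d[k]
    return d

quotes = dig(next_data, "props", "initialState", "cryptocurrency",
             "ohlcvHistorical", "1", "quotes")
print(quotes[0]["quote"]["USD"]["close"] if quotes else "key path not found")
# → 32127.27
```

The `dig` helper fails soft (returns `None`) instead of raising `KeyError`, which is useful when the page's JSON layout changes between coins.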
      

      【Discussion】:
