【问题标题】:How to get the window.initialState element using BeautifulSoup(如何使用 BeautifulSoup 获取 window.initialState 元素)
【发布时间】:2021-11-01 13:14:28
【问题描述】:

我需要一些建议:我必须编写一个 Python 程序来抓取网站,例如那些使用 JavaScript 生成内容的网站。

我尝试用 BeautifulSoup 获取 window.initialState 元素,但没有成功;如何从这个 JSON 中获取 "title" 和 "longURL"?

这个元素位于该 URL 返回的页面内容中:

('window.initialState = {"kw":null,"originalKw":"","title":"Vitamins and '
 'Supplements","numRecordsFrom":"1","numRecordsTo":"24","numRecordsTotal":"5353","adobeRecords":[{"productVendor":"Swanson '
 'Premium","brandUrl":null,"productName":"NAC N-Acetyl '
 'Cysteine","productPartNumber":"SW854","productDetails":"600 mg 100 '
 'Caps","productPillSize":"O","productPotency":"600 '
 'mg","productDiscountPrice":"10.44","offerMaxQty":"99999","rating":4.8,"shippingWeight":0.225,"statusMessage":"In '
 'stock","statusId":"I","swansonItem":true,"tooLowToShow":false,"promoDetail":null,"bogo":false,"numReviews":"206","totalQuestions":null,"totalAnswers":null,"servingSize":"1 '
 'capsule                ","servingPotency":"600 mg                        '
 '","servings":"100       '
 '","retailPrice":"10.99","everyDayLowPrice":"10.99","percentDiscount":0.05,"outletmall":false,"discountable":true,"newWebItem":false,"vegan":false,"vegetarian":false,"kosher":false,"glutenFree":false,"organicUSDA":false,"nonGMO":false,"payPalExcluded":false,"easyRefillAllowed":true,"brandRefresh":false,"longURL":"swanson-premium-nac-n-acetyl-cysteine-600-mg-100-caps","masterGroupItem":"","flagMap":{"shoprunner":"shoprunner","highdemand":"highdemand","customerfavorite":"customerfavorite","canadanodisc":"canadanodisc"},"itemCategoriesDTO":{"breadCrumbsDTOList":[{"breadCrumbsList":[{"name":"Health '
 'Concerns","seoURL":"/view/health-concerns","searchTerm":false},{"name":"Respiratory","seoURL":"/view/respiratory","searchTerm":false},{"name":"NAC","seoURL":"/view/nac","searchTerm":false}],"topLevelKeyword":"NAC","bottomLevelKeyword":"Health '
 'Concerns","singleItem":false}]},"otherSizes":null,"numExtraXlImages":null,"reviewList":null,"suggestedAdobeRecord":null,"suggestedAdobeRecordString":null,"supFactsTemplate":false,"sfHtml":null,"catalogItem":null,"numListReviewsPages":0,"productDescriptionList":null,"bulletsList":null,"discount":0.05,"srEligible":true,"statusUnavailable":false},{"productVendor":"Swanson '
 'Premium","brandUrl":null,"productName":"Vitamin D3 - Highest '
 'Potency","productPartNumber":"SW1371","productDetails":"5,000 IU (125 mcg) '
 '250 Sgels","productPillSize":"D","productPotency":"5,000 IU (125 '
 'mcg)","productDiscountPrice":"8.95","offerMaxQty":"99999","rating":4.9,"shippingWeight":0.247,"statusMessage":"In '
 'stock","statusId":"I","swansonItem":true,"tooLowToShow":false,"promoDetail":null,"bogo":false,"numReviews":"633","totalQuestions":null,"totalAnswers":null,"servingSize":"1 '
 'softgel                ","servingPotency":"5,000 IU (125 mcg)            '
 '","servings":"250       '
 '","retailPrice":"11.19","everyDayLowPrice":"11.19","percentDiscount":0.2,"outletmall":false,"discountable":true,"newWebItem":false,"vegan":false,"vegetarian":false,"kosher":false,"glutenFree":false,"organicUSDA":false,"nonGMO":false,"payPalExcluded":false,"easyRefillAllowed":true,"brandRefresh":false,"longURL":"swanson-premium-highest-potency-vitamin-d-3-5000-iu-5000-iu-250-sgels","masterGroupItem":"","flagMap":{"highdemand":"highdemand","shoprunner":"shoprunner","customerfavorite":"customerfavorite"},"itemCategoriesDTO":{"breadCrumbsDTOList":[{"breadCrumbsList":[{"name":"Health '
 'Concerns","seoURL":"/view/health-concerns","searchTerm":false},{"name":"Bone '
 'Health","seoURL":"/view/bone-health","searchTerm":false},{"name":"Vitamin '
 'D","seoURL":"/view/vitamin-d","searchTerm":false},{"name":"Vitamin '
 'D3","seoURL":"/view/vitamin-d3","searchTerm":false}],"topLevelKeyword":"Vitamin+D3","bottomLevelKeyword":"Health '
 'Concerns","singleItem":false}]},"otherSizes":null,"numExtraXlImages":null,"reviewList":null,"suggestedAdobeRecord":null,"suggestedAdobeRecordString":null,"supFactsTemplate":false,"sfHtml":null,"catalogItem":null,"numListReviewsPages":0,"productDescriptionList":null,"bulletsList":null,"discount":0.2,"srEligible":true,"statusUnavailable":false},{"productVendor":"Swanson '
 'Premium","brandUrl":null,"productName":"Quercetin - High '
 'Potency","productPartNumber":"SW1671","productDetails":"475 mg 60 Veg '
 'Caps","productPillSize":"OO","productPotency":"475 '
 'mg","productDiscountPrice":"10.63","offerMaxQty":"3","rating":4.7,"shippingWeight":0.158,"statusMessage":"In '
 'stock","statusId":"I","swansonItem":true,"tooLowToShow":false,"promoDetail":null,"bogo":false,"numReviews":"95","totalQuestions":null,"totalAnswers":null,"servingSize":"1 '
 'capsule                ","servingPotency":"475 mg                        '
 '","servings":"60        '
 '","retailPrice":"13.29","everyDayLowPrice":"13.29","percentDiscount":0.2,"outletmall":false,"discountable":true,"newWebItem":false,"vegan":false,"vegetarian":true,"kosher":false,"glutenFree":false,"organicUSDA":false,"nonGMO":false,"payPalExcluded":false,"easyRefillAllowed":true,"brandRefresh":false,"longURL":"swanson-premium-high-potency-quercetin-475-mg-60-veg-caps","masterGr
# Fetch the page. Without a timeout, requests can block indefinitely, and
# without a status check an HTTP error page would be parsed as if it were
# the expected document.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = bsoup(page.text, 'html.parser')

# Collect every <script> tag; the window.initialState assignment is expected
# to be inside one of them.
script = soup.find_all('script')
pprint(script)

所以,我尝试使用

# NOTE: re.search returns a Match object (or None), not a string, so the
# result cannot be passed to json.loads directly. The matched text also still
# starts with the "window.initialState = " prefix, which is not valid JSON,
# and the unescaped "." in the pattern matches any character.
data = re.search(r'window.initialState = {.*}', html_data)

但它无法转换为 json

【问题讨论】:

    标签: python html beautifulsoup


    【解决方案1】:

    我把您的数据放进了一段 HTML 中;可以使用 .string 从 script 标签中取出文本内容

    import re

    html = """<script>window.initialState = {"kw":null,"originalKw":"","title":"Vitamins and Supplements"}</script>"""
    soup = bs(html, "html.parser")
    # A real page contains many <script> tags, so select the one whose text
    # mentions window.initialState instead of taking the first tag found.
    data = soup.find("script", string=re.compile(r"window\.initialState")).string
    

    现在使用 re 模块,通过正则表达式提取所需的数据

    import json
    import re

    # A non-greedy "{.*?}" stops at the first "}", which truncates any JSON
    # that contains nested objects. Locate the assignment with a regex and
    # let the JSON decoder determine where the object really ends.
    match = re.search(r"window\.initialState\s*=\s*", data)
    if match is None:
        raise ValueError("window.initialState assignment not found")
    start = match.end()
    # raw_decode returns the parsed object and the index where it ended;
    # only the end index is needed to slice out the exact JSON text.
    _, end = json.JSONDecoder().raw_decode(data, start)
    data = data[start:end]
    

    它返回的是一个字符串对象;只需用 json 加载它,就会得到键值对(字典)形式的数据

    import json
    # Parse the extracted JSON text into Python objects.
    main_data=json.loads(data)
    

    输出:

    {'kw': None, 'originalKw': '', 'title': 'Vitamins and Supplements'}
    

    【讨论】:

    猜你喜欢
    • 1970-01-01
    • 2021-04-08
    • 2022-06-23
    • 2021-09-09
    • 1970-01-01
    • 2012-02-02
    • 2011-07-23
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多