【问题标题】:Extract multi-line javascript content from <script> tag using Scrapy使用 Scrapy 从 <script> 标签中提取多行 javascript 内容
【发布时间】:2015-03-01 12:41:24
【问题描述】:

我正在尝试使用 Scrapy 从此脚本标签中提取数据:

<script>
        var hardwareTemplateFunctions;
        var storefrontContextUrl = '';

        jq(function() {
            var data = new Object();
            data.hardwareProductCode = '9054832';
            data.offeringCode = 'SMART_BASIC.TLF12PLEAS';
            data.defaultTab = '';
            data.categoryId = 10001;

            data.bundles = new Object();
                            data.bundles['SMART_SUPERX.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1099'),
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Super',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_PLUSS.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1599'),
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Pluss',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_BASIC.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2199'),
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Basis',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_MINI.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2999'),
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Mini',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: 12
                };
                            data.bundles['KONTANT_KOMPLETT.REGULAR'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('0'),
                    upfrontPrice: parsePrice('3499'),
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: parsePrice('0'),
                    offeringTitle: 'SMART Kontant',
                    offeringType: 'PREPAID',
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: 0
                };

            data.reviewJson = new Object();


            hardwareTemplateFunctions = hardwareTemplateFunctions(data);
            hardwareTemplateFunctions.init();

            data.reviewSummaryBox = hardwareTemplateFunctions.reviewSummaryBox;

            accessoryFunctions(data).init();
            additionalServiceFunctions(data).init();
        });

        function parsePrice(str) {
            var price = parseFloat(str);
            return isNaN(price) ? 0 : price;
        }

        var offerings = {};
    </script>

我想从每个部分获取如下所示的数据:

 data.bundles['SMART_SUPERX.TLF12PLEAS'] = {
                signupFee: parsePrice('0'),
                newMsisdnFee: parsePrice('199'),
                upfrontPrice: parsePrice('1099'),
                monthlyPrice: parsePrice('499'),
                commitmentTime: parsePrice('12'),
                offeringTitle: 'SMART Super',
                offeringType: 'VOICE',
                monthlyPrice: parsePrice('499'),
                commitmentTime: 12
            };

然后从每个字段中取出对应的值,例如从 upfrontPrice 中得到最终数据(本例中为 1099)。

我已经尝试使用这个来获取每个对象:

items = response.xpath('//script/text()').re("data.bundles\[.*\](.*)")

但是,这只给了我第一行数据。 (= {)。那么我该怎么做呢?有没有更好的方法从脚本标签中提取这些数据?

编辑:当我使用items = response.xpath('//script/text()').re("data.bundles\[.*\] = {((?s).*) };") 时,我似乎只得到最后一个块(带有data.bundles['KONTANT_KOMPLETT.REGULAR'] 的块)

我如何获得所有这些的列表?

【问题讨论】:

  • 默认情况下,python 正则表达式 . 匹配除换行符以外的任何内容。例如,您可以使用([^}]*) - 没有} 的字符序列。
  • 我能够通过使用 selenium 执行您的 javascript 代码来获得结果(它需要一些调整)。结果是 data dict 及其所有成员(包括 categoryId 等)。它将需要在服务器端安装 Firefox 才能运行 Javascript。它对你有用吗?
  • @avenet 我不想使用硒。但是,如果你给我一个例子,我会考虑它。还有没有办法在没有硒的情况下将对象从字典中取出?

标签: javascript python regex scrapy


【解决方案1】:

如果您不想玩弄正则表达式,可以使用js2xml,它会解析 Javascript 代码并将其转换为 lxml 文档。 然后,您可以使用 XPath 从 Javascript 语句中查询内容。 (免责声明:我编写和维护 js2xml)

以下是有关如何获取 data.bundles 分配的示例代码:

import scrapy

selector = scrapy.Selector(text="""<script>
        var hardwareTemplateFunctions;
        var storefrontContextUrl = '';

        jq(function() {
            var data = new Object();
            data.hardwareProductCode = '9054832';
            data.offeringCode = 'SMART_BASIC.TLF12PLEAS';
            data.defaultTab = '';
            data.categoryId = 10001;

            data.bundles = new Object();
                            data.bundles['SMART_SUPERX.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1099'),
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Super',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('499'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_PLUSS.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('1599'),
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Pluss',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('399'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_BASIC.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2199'),
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Basis',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('299'),
                    commitmentTime: 12
                };
                            data.bundles['SMART_MINI.TLF12PLEAS'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('199'),
                    upfrontPrice: parsePrice('2999'),
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: parsePrice('12'),
                    offeringTitle: 'SMART Mini',
                    offeringType: 'VOICE',
                    monthlyPrice: parsePrice('199'),
                    commitmentTime: 12
                };
                            data.bundles['KONTANT_KOMPLETT.REGULAR'] = {
                    signupFee: parsePrice('0'),
                    newMsisdnFee: parsePrice('0'),
                    upfrontPrice: parsePrice('3499'),
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: parsePrice('0'),
                    offeringTitle: 'SMART Kontant',
                    offeringType: 'PREPAID',
                    monthlyPrice: parsePrice('0'),
                    commitmentTime: 0
                };

            data.reviewJson = new Object();


            hardwareTemplateFunctions = hardwareTemplateFunctions(data);
            hardwareTemplateFunctions.init();

            data.reviewSummaryBox = hardwareTemplateFunctions.reviewSummaryBox;

            accessoryFunctions(data).init();
            additionalServiceFunctions(data).init();
        });

        function parsePrice(str) {
            var price = parseFloat(str);
            return isNaN(price) ? 0 : price;
        }

        var offerings = {};
    </script>""")

(这第一部分是在 Scrapy Selector 中获取 HTML 输入)

import js2xml
import pprint

# Collect every `data.bundles['<key>'] = {...}` assignment found in the page's
# <script> elements into a dict of dicts: {bundle key: {property name: value}}.
# Values are kept as the strings that lxml's XPath returns.
data_bundles = {}
for script in selector.xpath('//script/text()').extract():
    # this is how you turn Javascript code into an XML document (lxml document in fact)
    jstree = js2xml.parse(script)

    # then, we're interested in assignments of data.bundles object
    for assignment in jstree.xpath('//assign[left//property/identifier/@name="bundles" and right/object]'):
        # the assigned property is given by a <string> property from a <bracketaccessor>
        bundle_keys = assignment.xpath('./left/bracketaccessor/property/string/text()')
        if not bundle_keys:
            # xpath() returns a list (possibly empty), never None: skip assignments
            # that have no string key instead of failing on [0] or reusing the
            # key from a previous iteration
            continue

        bundle = {}
        data_bundles[bundle_keys[0]] = bundle

        # the left object is assigned an object (inside a <right> element)
        # let's loop on the <property> elements
        # the values are either numbers or string arguments of a function call
        for prop in assignment.xpath('./right/object/property'):
            values = prop.xpath('.//number/@value | .//string/text()')
            if not values:
                # property whose value is neither a number nor a string: nothing to record
                continue
            bundle[prop.xpath('@name')[0]] = values[0]

pprint.pprint(data_bundles)

这就是你从中得到的:

{'KONTANT_KOMPLETT.REGULAR': {'commitmentTime': '0',
                              'monthlyPrice': '0',
                              'newMsisdnFee': '0',
                              'offeringTitle': 'SMART Kontant',
                              'offeringType': 'PREPAID',
                              'signupFee': '0',
                              'upfrontPrice': '3499'},
 'SMART_BASIC.TLF12PLEAS': {'commitmentTime': '12',
                            'monthlyPrice': '299',
                            'newMsisdnFee': '199',
                            'offeringTitle': 'SMART Basis',
                            'offeringType': 'VOICE',
                            'signupFee': '0',
                            'upfrontPrice': '2199'},
 'SMART_MINI.TLF12PLEAS': {'commitmentTime': '12',
                           'monthlyPrice': '199',
                           'newMsisdnFee': '199',
                           'offeringTitle': 'SMART Mini',
                           'offeringType': 'VOICE',
                           'signupFee': '0',
                           'upfrontPrice': '2999'},
 'SMART_PLUSS.TLF12PLEAS': {'commitmentTime': '12',
                            'monthlyPrice': '399',
                            'newMsisdnFee': '199',
                            'offeringTitle': 'SMART Pluss',
                            'offeringType': 'VOICE',
                            'signupFee': '0',
                            'upfrontPrice': '1599'},
 'SMART_SUPERX.TLF12PLEAS': {'commitmentTime': '12',
                             'monthlyPrice': '499',
                             'newMsisdnFee': '199',
                             'offeringTitle': 'SMART Super',
                             'offeringType': 'VOICE',
                             'signupFee': '0',
                             'upfrontPrice': '1099'}}

有关您通过js2xml.parse() 获得的XML 架构的更多信息,您可以查看https://github.com/redapple/js2xml/blob/master/SCHEMA.rst

【讨论】:

    【解决方案2】:

    以下正则表达式似乎是正确的:

    r"data\.bundles\[[^\]]*\] = {([^}]*)}"
    

    正则表达式中的* 是贪婪的——它总是尽可能地匹配,所以我使用[^\]] 来确保我会匹配最接近的]。我对{} 括号也这样做。此外,我不必担心. 不匹配换行符。

    【讨论】:

      【解决方案3】:

      此脚本需要安装 Mozilla Firefox 和 python-selenium。我还使用名为 script.txt 的文件进行了测试,该文件包含由 <script> 标签包围的脚本。代码如下:

      from selenium import webdriver
      
      # Run the page's inline script in a real browser and read back its `data` object.
      # Read the saved page fragment; the context manager closes the file handle.
      with open("script.txt") as script_file:
          script_content = script_file.read()
      
      #Removing script tags
      exec_script = script_content.replace("<script>", "").replace("</script>", "")
      
      #Removing jq function call
      # (this unwraps the callback body so that `data` is visible to the final return)
      exec_script = exec_script.replace("jq(function() {", "").replace("});", "")
      
      #Setting some helper functions to avoid javascript errors
      helper_functions = """function hardwareTemplateFunctions(){
                           return {init: function(){}};};  
                           accessoryFunctions = additionalServiceFunctions = 
                           hardwareTemplateFunctions;"""
      
      #Returning data variable
      return_statement = "return data;"
      
      wd = webdriver.Firefox()
      try:
          #Getting data variable in result
          result = wd.execute_script(helper_functions + exec_script +  return_statement)
      finally:
          # Always shut the browser down, even if the script raised an error
          wd.quit()
      

      结果变量如下所示:

      {u'bundles': {u'KONTANT_KOMPLETT.REGULAR': {u'commitmentTime': 0,
         u'monthlyPrice': 0,
         u'newMsisdnFee': 0,
         u'offeringTitle': u'SMART Kontant',
         u'offeringType': u'PREPAID',
         u'signupFee': 0,
         u'upfrontPrice': 3499},
        u'SMART_BASIC.TLF12PLEAS': {u'commitmentTime': 12,
         u'monthlyPrice': 299,
         u'newMsisdnFee': 199,
         u'offeringTitle': u'SMART Basis',
         u'offeringType': u'VOICE',
         u'signupFee': 0,
         u'upfrontPrice': 2199},
        u'SMART_MINI.TLF12PLEAS': {u'commitmentTime': 12,
         u'monthlyPrice': 199,
         u'newMsisdnFee': 199,
         u'offeringTitle': u'SMART Mini',
         u'offeringType': u'VOICE',
         u'signupFee': 0,
         u'upfrontPrice': 2999},
        u'SMART_PLUSS.TLF12PLEAS': {u'commitmentTime': 12,
         u'monthlyPrice': 399,
         u'newMsisdnFee': 199,
         u'offeringTitle': u'SMART Pluss',
         u'offeringType': u'VOICE',
         u'signupFee': 0,
         u'upfrontPrice': 1599},
        u'SMART_SUPERX.TLF12PLEAS': {u'commitmentTime': 12,
         u'monthlyPrice': 499,
         u'newMsisdnFee': 199,
         u'offeringTitle': u'SMART Super',
         u'offeringType': u'VOICE',
         u'signupFee': 0,
         u'upfrontPrice': 1099}},
       u'categoryId': 10001,
       u'defaultTab': u'',
       u'hardwareProductCode': u'9054832',
       u'offeringCode': u'SMART_BASIC.TLF12PLEAS',
       u'reviewJson': {},
       u'reviewSummaryBox': None}
      

      【讨论】:

      • 也感谢您的回答。我最终没有使用它,但它是未来项目的一个很好的解决方案。 :)
      猜你喜欢
      • 2016-05-08
      • 1970-01-01
      • 2019-05-11
      • 2016-02-03
      • 2021-04-21
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多