【问题标题】:Python XML Sax to dictionary(Python XML Sax 到字典)
【发布时间】:2011-10-21 23:08:45
【问题描述】:

我正在尝试使用 Python 中的标准 sax 解析器解析一些相对较大的 xml 文件,我希望避免手动将每个元素保存/检查到字典中,因为我正在使用多个 xml 模式,其中一些非常大。

显然下面的代码示例不起作用,但这是我目前得到的。也欢迎其他低内存解决方案。

(注意:完整的xml文件包含不止两层嵌套结构)

from xml import sax
from cStringIO import StringIO

# Sample input: a namespaced <n1:products> document holding a single
# <n1:product> with nested (status, producttext), repeated (term) and
# plain-text (productid) child elements.
xml_string = """<?xml version="1.0" encoding="iso-8859-1"?>
<n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
  <n1:product>
    <n1:status>
      <n7:created>2005-02-08T18:18:30.53</n7:created>
      <n7:updated>2008-09-18T10:29:58.26</n7:updated>
    </n1:status>
    <n1:productid>28321503</n1:productid>
    <n1:producttext>
      <n7:text>Some product info</n7:text>
      <n7:type>Info</n7:type>
    </n1:producttext>
    <n1:terms>
      <n7:term>
        <n7:number>1</n7:number>
        <n7:name>Term1</n7:name>
      </n7:term>
      <n7:term>
        <n7:number>2</n7:number>
        <n7:name>Term2</n7:name>
      </n7:term>
    </n1:terms>   
  </n1:product>
</n1:products>
"""

class XML_Handler(sax.ContentHandler):
    """Collect the text of elements inside each ``product`` into a flat dict.

    The handler is namespace-aware (it implements the ``*NS`` callbacks) and
    keys ``self.data`` by the local element name only, so nesting is NOT
    preserved: a later element with the same local name overwrites an
    earlier one.  When a ``product`` element closes, ``self.data`` is printed.
    """

    def __init__(self):
        sax.ContentHandler.__init__(self)
        self.data = {}
        self.vbuffer = ''
        # Must exist before the first endElementNS call: an element closed
        # before any <product> opens would otherwise raise AttributeError.
        self.fetch = False

    def startElementNS(self, name, qname, attrs):
        """Begin a new record when a ``product`` element opens.

        ``name`` is the ``(namespace_uri, localname)`` pair supplied by the
        parser; ``qname`` and ``attrs`` are ignored.
        """
        (ns, localname) = name
        if localname == 'product':
            self.data = {}
            self.fetch = True

    def endElementNS(self, name, qname):
        """Store the buffered text of a closed element, or emit the record.

        Closing ``product`` prints the collected dict and stops collecting
        until the next ``product`` opens; closing any other element while
        collecting stores its non-empty text under its local name.
        """
        (ns, localname) = name
        if localname == 'product':
            # Got my data, call some process function..
            # The parenthesised form prints the same under Python 2 and 3.
            print(self.data)
            # Elements outside <product> must not leak into the record.
            self.fetch = False
        elif self.fetch:
            if self.vbuffer != '':
                self.data[localname] = self.vbuffer
        # Text never carries over from one closed element to the next.
        self.vbuffer = ''

    def characters(self, ch):
        """Append character data (right-stripped) to the text buffer."""
        self.vbuffer += ch.rstrip()

if __name__ == '__main__':
    # Expose the in-memory sample document to the parser as a byte stream.
    source = sax.xmlreader.InputSource()
    source.setByteStream(StringIO(xml_string))

    # Namespace processing is switched on so the reader invokes the
    # startElementNS/endElementNS callbacks of the handler.
    reader = sax.make_parser()
    reader.setFeature(sax.handler.feature_namespaces, 1)
    reader.setContentHandler(XML_Handler())
    reader.parse(source)

我想要达到的目标:

# Desired output for the sample <n1:product>: nested elements become nested
# dicts, repeated <n7:term> elements become a list, and every leaf value is
# the element's text exactly as it appears in the XML.
result = {
    'status' : {
        'created' : '2005-02-08T18:18:30.53',
        'updated' : '2008-09-18T10:29:58.26',
    },
    'productid' : '28321503',
    'producttext' : {
        'text' : 'Some product info',
        'type' : 'Info',
    },
    'terms' : [{'number': '1', 'name': 'Term1'}, {'number': '2', 'name': 'Term2'}]
}

【问题讨论】:

    标签: python xml sax


    【解决方案1】:

    https://www.assembla.com/code/pysnipps/subversion/nodes/python/mXMLDao.py 很久以前,我做了一个糟糕的库,用于将 xml 映射到类中,也许它可以帮助你。

    【讨论】:

      【解决方案2】:

      终于搞定了。它可能不是最健壮的解决方案,但对于我的用例来说已经足够了。

      #!/usr/bin/env python
      # -*- coding: utf-8 -*-
      
      import simplejson as json
      from xml import sax
      try:
          from cStringIO import StringIO
      except ImportError:
          from StringIO import StringIO
      
      # Sample input: a namespaced <n1:products> document holding a single
      # <n1:product> with nested (status, producttext), repeated (term) and
      # plain-text (productid) child elements.
      xml_string = '''<?xml version="1.0" encoding="iso-8859-1"?>
      <n1:products xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:n7="http://foo.bar.tld" xmlns:n1="http://foo.bar.tld">
        <n1:product>
          <n1:status>
            <n7:created>2005-02-08T18:18:30.53</n7:created>
            <n7:updated>2008-09-18T10:29:58.26</n7:updated>
          </n1:status>
          <n1:productid>28321503</n1:productid>
          <n1:producttext>
            <n7:text>Some product info</n7:text>
            <n7:type>Info</n7:type>
          </n1:producttext>
          <n1:terms>
            <n7:term>
              <n7:number>1</n7:number>
              <n7:name>Term1</n7:name>
            </n7:term>
            <n7:term>
              <n7:number>2</n7:number>
              <n7:name>Term2</n7:name>
            </n7:term>
          </n1:terms>   
        </n1:product>
      </n1:products>
      '''
      
      def display(data):
          """Pretty-print ``data`` to standard output, at most 10 levels deep."""
          from pprint import pprint
          pprint(data, depth=10)
      
      class Element:
          """Attribute bag for one XML element.

          Children and text values are stored directly in the instance
          ``__dict__`` under the child's name.  A name that occurs more than
          once is promoted to a list holding every occurrence in order.
          """

          def setData(self, key, value):
              """Store ``value`` as the content of the most recent ``key`` child.

              When ``key`` already holds a non-empty list (a repeated element),
              only the last list entry is replaced so that earlier siblings are
              kept; otherwise the entry for ``key`` is set or overwritten.
              """
              current = self.__dict__.get(key)
              if isinstance(current, list) and current:
                  # Repeated element: replacing the whole list here would throw
                  # away the values of all earlier siblings.
                  current[-1] = value
              else:
                  self.__dict__[key] = value

          def setObject(self, key, object):
              """Attach the child ``object`` under ``key``.

              The first child is stored as-is; a second child with the same key
              turns the entry into a list, and further children are appended.
              """
              if key not in self.__dict__:
                  self.__dict__[key] = object
                  return
              current = self.__dict__[key]
              if isinstance(current, list):
                  current.append(object)
              else:
                  self.__dict__[key] = [current, object]

          def jsonable(self):
              """Return this element as plain nested dicts, lists and values.

              The result is a new structure; the element itself is not modified.
              """
              return self._traverse(self.__dict__)

          # http://stackoverflow.com/questions/1036409/recursively-convert-python-object-graph-to-dictionary/1118038#1118038
          def _traverse(self, obj):
              """Recursively convert ``obj`` into dicts, lists and leaf values.

              Objects exposing ``__dict__`` are converted from their attributes,
              skipping callables and names starting with an underscore.
              """
              # Strings are leaves.  They must be tested before the __iter__
              # check: Python 3 strings are iterable and iterating a
              # one-character string yields itself, which would recurse forever.
              if isinstance(obj, (str, bytes)):
                  return obj
              if isinstance(obj, dict):
                  return {k: self._traverse(v) for k, v in obj.items()}
              if hasattr(obj, "__iter__"):
                  return [self._traverse(v) for v in obj]
              if hasattr(obj, "__dict__"):
                  return {
                      key: self._traverse(value)
                      for key, value in obj.__dict__.items()
                      if not callable(value) and not key.startswith('_')
                  }
              return obj
      
      class ObjBuilder(sax.ContentHandler):
          """Namespace-aware SAX handler that builds one Element tree per record.

          ``node`` is the local name of the record element (for example
          ``'product'``).  Every time such an element closes, the tree built for
          it is converted with ``jsonable()`` and passed to ``display``; the
          per-record state is then dropped so memory use does not grow with the
          number of records in the document.
          """

          def __init__(self, node):
              sax.ContentHandler.__init__(self)
              self.obj = []           # stack of open Elements, innermost last
              self.node = node
              self.fetch = False      # True while inside a record element
              self.rootobject = None  # Element of the record being built
              self.__chunks = []      # raw text pieces of the current element

          def startElementNS(self, name, qname, attrs):
              """Open a new Element for the record root or for one of its children."""
              (ns, localname) = name
              if self.node == localname:
                  self.fetch = True
                  o = Element()
                  self.rootobject = o
                  self.obj.append(o)
                  self.__chunks = []
              elif self.fetch:
                  # Text seen before a child element opens is discarded.
                  self.__chunks = []
                  o = Element()
                  self.obj[-1].setObject(localname, o)
                  self.obj.append(o)

          def characters(self, contents):
              """Buffer character data while inside a record.

              The parser may deliver the text of one element in several chunks,
              so chunks are kept verbatim here and only the joined text is
              stripped; stripping each chunk would eat whitespace inside the text.
              """
              if self.fetch:
                  self.__chunks.append(contents)

          def endElementNS(self, name, qname):
              """Close the current element; emit the record when its root closes."""
              (ns, localname) = name
              if self.node == localname:
                  self.fetch = False
                  display(self.rootobject.jsonable())
                  # Release the finished record; keeping its root on the stack
                  # would retain every record for the lifetime of the parse.
                  del self.obj[:]
                  self.__chunks = []
              elif self.fetch:
                  text = ''.join(self.__chunks).strip()
                  if text:
                      # A leaf with text is stored on its parent as a plain value.
                      self.obj[-2].setData(localname, text)
                  del self.obj[-1]
                  self.__chunks = []
      
      if __name__ == '__main__':
          # Expose the in-memory sample document to the parser as a byte stream.
          source = sax.xmlreader.InputSource()
          source.setByteStream(StringIO(xml_string))

          # Namespace processing is switched on so the reader invokes the
          # startElementNS/endElementNS callbacks of the handler.
          reader = sax.make_parser()
          reader.setFeature(sax.handler.feature_namespaces, 1)
          reader.setContentHandler(ObjBuilder('product'))
          reader.parse(source)
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 2015-04-25
        • 1970-01-01
        • 2012-04-06
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2014-10-04
        相关资源
        最近更新 更多