【发布时间】:2019-09-30 07:50:32
【问题描述】:
我在 Python 2.7.13 中使用 ruamel.yaml 0.15.74 版本。由于外部限制,我必须使用这个版本。
我的终极目标是读取 yaml 文件并选择其中的某些部分,将其保存在 pandas 数据框中,最后将其写入 csv 文件。为此,我有以下自定义的“DoubleMergeKeyEnabler(object)”。
import pandas as pd
import ruamel.yaml
import json
import os
# Shared ruamel.yaml handler used for loading the document.
yaml = ruamel.yaml.YAML()
# Very large width; presumably so long lines are not wrapped when dumping.
yaml.width = 100000
# Keep the original quoting style of scalars; the loop further down relies on
# double-quoted values arriving as DoubleQuotedScalarString instances.
yaml.preserve_quotes = True
# Indentation settings for block mappings and sequences.
yaml.indent(mapping=2, sequence=4, offset=2)
class DoubleMergeKeyEnabler(object):
    """Rewrite ``<<`` merge keys in YAML text to numbered placeholders and back.

    ``convert`` replaces every ``'<<: '`` in the text with a numbered
    placeholder key (``'[<<, 0]: '``, ``'[<<, 1]: '``, ...); ``revert`` turns
    those placeholders back into ``'<<: '``.  The running number is kept on
    the instance in ``pat_nr``, so ``revert`` only undoes what earlier
    ``convert`` calls on the same instance produced.
    """

    def __init__(self):
        self.pat = '<<: '  # could be at the root level mapping, so no leading space
        self.r_pat = '[<<, {}]: '  # probably not using sequences as keys
        self.pat_nr = -1  # number of the last placeholder handed out; -1 = none

    def convert(self, doc):
        """Return *doc* with each ``'<<: '`` replaced by a numbered placeholder."""
        while self.pat in doc:
            self.pat_nr += 1
            # count=1: number the occurrences one at a time, in document order.
            doc = doc.replace(self.pat, self.r_pat.format(self.pat_nr), 1)
        return doc

    def revert(self, doc):
        """Return *doc* with the numbered placeholders restored to ``'<<: '``.

        Walks the counter back down to -1, so the instance can be reused.
        """
        while self.pat_nr >= 0:
            doc = doc.replace(self.r_pat.format(self.pat_nr), self.pat, 1)
            self.pat_nr -= 1
        return doc


dmke = DoubleMergeKeyEnabler()
我使用以下方式加载 yaml 文件:
# Empty frame that will collect one row per selected YAML entry.
df = pd.DataFrame(columns=['text1', 'text2'])
# Parse the YAML document with the handler configured above.
with open('test.yaml') as f:
    data = yaml.load(f)
然后我选择 yaml 文件中的特定部分,并为其定义一个 id 用于跟踪(该 id 将作为 pandas 数据框中对应条目的名称),再把它存入 pandas 数据框。
# Walk the entries under the top-level 'items' key.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' value was double-quoted in the YAML are kept.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # NOTE(review): on Python 2, str() applied to a unicode value that
        # contains non-ASCII characters (e.g. "Abschätzungen") implicitly
        # encodes with the ASCII codec and raises UnicodeEncodeError.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        # The id becomes the row label; the second column starts out empty.
        df.loc[_id] = [_txt_to_trans, '']
以下是 yaml 文件的给出方式。这个我也改不了。
groups:
  # Anchored mapping that entries under `items` pull in via the merge key.
  - &group-dp
    title: "Abschätzungen"
    reference: "group-dp"
    required: true
    description: >
    help_text: |
items:
  - type: "Group"
    <<: *group-dp
    visible: true
    multiple: false
    representation: "Abschätzungen"
我收到以下错误消息
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
<ipython-input-18-1fa5952ce8cf> in <module>()
----> 1 import codecs, os;__pyfile = codecs.open('''/tmp/py7455hqj''', encoding='''utf-8''');__code = __pyfile.read().encode('''utf-8''');__pyfile.close();os.remove('''/tmp/py7455hqj''');exec(compile(__code, '''/home/nicolas/Desktop/test.py''', 'exec'));
/home/nicolas/Desktop/test.py in <module>()
39 _item = data.get('items')
40 for i in range(0, len(_item)):
---> 41 if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
42 _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
43 _txt_to_trans = _item[i].get('representation')
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe4' in position 5: ordinal not in range(128)
In [19]:
我不知何故需要解码,但这不起作用。我该如何解决这个问题?完整的测试代码如下所示
import pandas as pd
import ruamel.yaml
import json
import os
# Shared ruamel.yaml handler used for loading the document.
yaml = ruamel.yaml.YAML()
# Very large width; presumably so long lines are not wrapped when dumping.
yaml.width = 100000
# Keep the original quoting style of scalars; the loop further down relies on
# double-quoted values arriving as DoubleQuotedScalarString instances.
yaml.preserve_quotes = True
# Indentation settings for block mappings and sequences.
yaml.indent(mapping=2, sequence=4, offset=2)
class DoubleMergeKeyEnabler(object):
    """Rewrite ``<<`` merge keys in YAML text to numbered placeholders and back.

    ``convert`` replaces every ``'<<: '`` in the text with a numbered
    placeholder key (``'[<<, 0]: '``, ``'[<<, 1]: '``, ...); ``revert`` turns
    those placeholders back into ``'<<: '``.  The running number is kept on
    the instance in ``pat_nr``, so ``revert`` only undoes what earlier
    ``convert`` calls on the same instance produced.
    """

    def __init__(self):
        self.pat = '<<: '  # could be at the root level mapping, so no leading space
        self.r_pat = '[<<, {}]: '  # probably not using sequences as keys
        self.pat_nr = -1  # number of the last placeholder handed out; -1 = none

    def convert(self, doc):
        """Return *doc* with each ``'<<: '`` replaced by a numbered placeholder."""
        while self.pat in doc:
            self.pat_nr += 1
            # count=1: number the occurrences one at a time, in document order.
            doc = doc.replace(self.pat, self.r_pat.format(self.pat_nr), 1)
        return doc

    def revert(self, doc):
        """Return *doc* with the numbered placeholders restored to ``'<<: '``.

        Walks the counter back down to -1, so the instance can be reused.
        """
        while self.pat_nr >= 0:
            doc = doc.replace(self.r_pat.format(self.pat_nr), self.pat, 1)
            self.pat_nr -= 1
        return doc


dmke = DoubleMergeKeyEnabler()
# Empty frame that will collect one row per selected YAML entry.
df = pd.DataFrame(columns=['text1', 'text2'])
# Parse the YAML document with the handler configured above.
with open('/home/nicolas/Desktop/test.yaml') as f:
    data = yaml.load(f)
# Walk the entries under the top-level 'items' key.
_item = data.get('items')
for i in range(0, len(_item)):
    # Only entries whose 'representation' value was double-quoted in the YAML are kept.
    if 'representation' in _item[i].keys() and isinstance(_item[i].get('representation'), ruamel.yaml.scalarstring.DoubleQuotedScalarString):
        # NOTE(review): on Python 2, str() applied to a unicode value that
        # contains non-ASCII characters (e.g. "Abschätzungen") implicitly
        # encodes with the ASCII codec and raises UnicodeEncodeError.
        _id = 'test' + '_' + 'items' + '_' + str(_item[i].get('representation')).replace(" ","_")
        _txt_to_trans = _item[i].get('representation')
        # The id becomes the row label; the second column starts out empty.
        df.loc[_id] = [_txt_to_trans, '']
【问题讨论】: