【发布时间】:2021-04-12 01:05:39
【问题描述】:
上下文
我有一个 json 作为条目,我想展开列表并展开嵌套在原始 json 中的字典。我在递归的explode/expand 方法中执行此操作,直到没有更多的嵌套列表/ dics。这是我的代码:
def one_step_parser(df):
# loop over all cols and explode lists
for col, _ in df.items():
# fill nan values with empty dics
df[col].fillna({i: {} for i in df.index}, inplace=True)
try:
df = df.explode(col)
except Exception as e:
pass
# loop over all columns and expand dics
for col, _ in df.items():
# fill nan values with empty dics
df[col] = df[col].fillna({i: {} for i in df.index})
try:
# this expands nested dics in df[col]
series = df[col].apply(pd.Series)
# rename new cols
series_cols = list(series.columns)
names = {}
for output_col in series_cols:
names[output_col] = str(col + "." + output_col)
series = series.rename(columns=names)
# concat the expanded result to the dataframe
df = pd.concat([df, series], axis=1)
df = df.drop(columns=[col])
except Exception as e:
pass
return df
def multi_step_parser(df_bf):
# first parse
df_af = one_step_parser(df_bf)
# keep parsing until there's no change
if len(list(df_bf.columns)) != len(list(df_af.columns)):
df_af = multi_step_parser(df_af)
return df_af
def parser(json_file):
json_f = open(json_file, 'r')
my_json = json.load(json_f)
# 1st json normaliziation to data frame
df = pd.json_normalize(my_json)
return multi_step_parser(df)
输入数据示例:
{
"agents": [
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101819,
"name": "O Laptops"
}
],
"id": 2198802,
"ip": "ip1",
"name": "x1x1x1x1",
"platform": "WINDOWS",
"plugin_feed_id": "201810182051",
"status": "on",
"uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
},
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101839,
"name": "windy"
},
{
"id": 102037,
"name": "W6"
},
{
"id": 102049,
"name": "MS8"
}
],
"id": 2097601,
"ip": "ip2s",
"name": "x2xx2x2x2",
"platform": "WINDOWS",
"plugin_feed_id": "201810181351",
"status": "on",
"uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
},
{
"core_version": "7.1.0",
"distro": "win-x86-64",
"id": 2234103,
"ip": "x6x6x6x6x",
"last_connect": 1537384290,
"linked_on": 1537384247,
"name": "x7x7x7x",
"platform": "WINDOWS",
"status": "off",
"uuid": "0696ee38-402a-4866-b753-2816482dfce6"
}],
"pagination": {
"limit": 5000,
"offset": 0,
"sort": [
{
"name": "john",
"order": "asc"
},
{
"name": "mark",
"order": "dfg"
}
],
"total": 14416
}
}
请注意,这只是一个示例,我的输入可能会非常不同,因此我处理的内容与内容无关。
结果
pagination.limit pagination.offset pagination.total unique_index agents.core_build ... agents.linked_on pagination.sort.name pagination.sort.order agents.groups.id agents.groups.name
0 5000 0 14416 0 17 ... {} john asc 101819 O Laptops
0 5000 0 14416 0 17 ... {} mark dfg 101819 O Laptops
0 5000 0 14416 0 17 ... {} john asc 101839 windy
0 5000 0 14416 0 17 ... {} john asc 102037 W6
0 5000 0 14416 0 17 ... {} john asc 102049 MS8
0 5000 0 14416 0 17 ... {} mark dfg 101839 windy
0 5000 0 14416 0 17 ... {} mark dfg 102037 W6
0 5000 0 14416 0 17 ... {} mark dfg 102049 MS8
0 5000 0 14416 0 {} ... 1.5373... john asc {} {}
0 5000 0 14416 0 {} ... 1.5373... mark dfg {} {} ```
问题
如您所见,我所做的工作完成了。但是,当在大 json 上使用它时,执行时间会呈指数增长,因为每次迭代都会创建更多的行和列 + concat 在循环内。我想要的是使用apply 或numpy 矢量化使其更高效(尤其是第二个循环)。
尝试
我设法绕过fillna 函数列上的for 循环:
df = df.apply(lambda x: x.fillna({i: {} for i in df.index}))
但我找不到为此功能执行此操作的方法
series = df[col].apply(pd.Series)
【问题讨论】:
标签: python json pandas dataframe numpy