我相信这会奏效,它是用 Python3 编写的,我没有对其进行优化,但如果速度不够快,它可能是一个很好的起点。
list_of_dicts = [
{'id':'id1', 'key1':'value_x', 'key2': 'value_y', 'key3':'value_z'},
{'id':'id3', 'key2' :'value_u', 'key3': 'value_v'},
{'id':'id2', 'key1':'value_x', 'key3':'value_z', 'key4': 'value_t'},
{'id':'id4', 'key1':'value_w', 'key2':'value_s', 'key3':'value_v'}
]
# Since we can't have objects as keys, make the values we're looking for into a string, and use that as the key.
def make_value_key(d, list_of_keys):
res = ""
for k in list_of_keys:
res += str(d[k])
return res
def group_dictionary(list_of_dicts, list_of_keys):
group_vals = {}
current_max_group = 0
dicts_to_remove = []
for i,d in enumerate(list_of_dicts):
# If dict doesn't have all keys mark for removal.
if not all(k in d for k in list_of_keys):
dicts_to_remove.append(i)
else:
value_key = make_value_key(d, list_of_keys)
# If value key exists assign group otherwise make new group.
if value_key in group_vals:
d['group'] = group_vals[value_key]
else:
group_vals[value_key] = current_max_group
d['group'] = current_max_group
current_max_group += 1
list_of_dicts = [i for j, i in enumerate(list_of_dicts) if j not in dicts_to_remove]
return list_of_dicts
list_of_keys=['key1','key3']
print(group_dictionary(list_of_dicts, list_of_keys))
print()
list_of_keys=['key3']
print(group_dictionary(list_of_dicts, list_of_keys))
输出:
[{'key3': 'value_z', 'key1': 'value_x', 'group': 0, 'key2': 'value_y', 'id': 'id1'},
{'key3': 'value_z', 'key1': 'value_x', 'key4': 'value_t', 'group': 0, 'id': 'id2'},
{'key3': 'value_v', 'key1': 'value_w', 'group': 1, 'key2': 'value_s', 'id': 'id4'}]
[{'key3': 'value_z', 'key1': 'value_x', 'group': 0, 'key2': 'value_y', 'id': 'id1'},
{'group': 1, 'key3': 'value_v', 'key2': 'value_u', 'id': 'id3'},
{'key3': 'value_z', 'key1': 'value_x', 'key4': 'value_t', 'group': 0, 'id': 'id2'},
{'key3': 'value_v', 'key1': 'value_w', 'group': 1, 'key2': 'value_s', 'id': 'id4'}]
优化一:
与其迭代所有键来检查它们是否存在,相反,我们可以在创建值键时失败并返回一个空字符串,这会将字典标记为删除:
def make_value_key(d, list_of_keys):
res = ""
for k in list_of_keys:
if not k in d:
return ""
res += str(d[k])
return res
def group_dictionary(list_of_dicts, list_of_keys):
group_vals = {}
current_max_group = 0
dicts_to_remove = []
for i,d in enumerate(list_of_dicts):
value_key = make_value_key(d, list_of_keys)
if value_key == "":
dicts_to_remove.append(i)
continue
if value_key in group_vals:
d['group'] = group_vals[value_key]
else:
group_vals[value_key] = current_max_group
d['group'] = current_max_group
current_max_group += 1
list_of_dicts = [i for j, i in enumerate(list_of_dicts) if j not in dicts_to_remove]
return list_of_dicts
组必须大于 1:
这使用第二个字典来跟踪组大小,然后检查组是否小于 2 以将它们标记为删除。
def make_value_key(d, list_of_keys):
res = ""
for k in list_of_keys:
if not k in d:
return ""
res += str(d[k])
return res
def group_dictionary(list_of_dicts, list_of_keys):
group_vals = {}
group_count = {}
current_max_group = 0
indices_to_remove = []
for i,d in enumerate(list_of_dicts):
value_key = make_value_key(d, list_of_keys)
if value_key == "":
indices_to_remove.append(i)
continue
if value_key in group_vals:
d['group'] = group_vals[value_key]
# Second group member seen, remove from count dict.
group_count.pop(d['group'], None)
else:
group_vals[value_key] = current_max_group
d['group'] = current_max_group
# First time seen, add to count dict.
group_count[current_max_group] = i
current_max_group += 1
indices_to_remove.extend(group_count.values())
return [i for j, i in enumerate(list_of_dicts) if j not in indices_to_remove]
输出:
[{'key2': 'value_y', 'group': 0, 'id': 'id1', 'key1': 'value_x', 'key3': 'value_z'},
{'key4': 'value_t', 'group': 0, 'id': 'id2', 'key1': 'value_x', 'key3': 'value_z'}]
[{'key2': 'value_y', 'group': 0, 'id': 'id1', 'key1': 'value_x', 'key3': 'value_z'}, {'group': 1, 'id': 'id3', 'key2': 'value_u', 'key3': 'value_v'}, {'key4': 'value_t', 'group': 0, 'id': 'id2', 'key1': 'value_x', 'key3': 'value_z'}, {'key2': 'value_s', 'group': 1, 'id': 'id4', 'key1': 'value_w', 'key3': 'value_v'}]
优化 2:
您可以从O(n^2)(循环遍历字典列表一次计算,一次删除)到O(n*m log m)(循环遍历字典列表一次,循环遍历排序的删除索引):
def make_value_key(d, list_of_keys):
res = ""
for k in list_of_keys:
if not k in d:
return ""
res += str(d[k])
return res
def group_dictionary(list_of_dicts, list_of_keys):
group_vals = {}
group_count = {}
current_max_group = 0
indices_to_remove = []
for i,d in enumerate(list_of_dicts):
value_key = make_value_key(d, list_of_keys)
if value_key == "":
indices_to_remove.append(i)
continue
if value_key in group_vals:
d['group'] = group_vals[value_key]
# Second group member seen, remove from count dict.
group_count.pop(d['group'], None)
else:
group_vals[value_key] = current_max_group
d['group'] = current_max_group
# First time seen, add to count dict.
group_count[current_max_group] = i
current_max_group += 1
indices_to_remove.extend(group_count.values())
for index in sorted(indices_to_remove, reverse=True):
del list_of_dicts[index]
return list_of_dicts