这个思路来自 https://python.tutorialink.com/can-i-perform-a-left-join-merge-between-two-dataframes-using-regular-expressions-with-pandas/ ,我在其基础上做了一些改进:原始数据现在可以包含多个列,因此可以使用正则表达式实现真正的左连接(合并)。
import pandas as pd
# Sample left-hand table: 'field' holds the strings the regex patterns are
# matched against; the two extra columns show that other columns are carried
# along unchanged.
d = {
    'extra_colum1': ['x', 'y', 'z', 'w'],
    'field': ['ab', 'a', 'cd', 'e'],
    'extra_colum2': ['x', 'y', 'z', 'w'],
}
df = pd.DataFrame(data=d)

# Lookup table: each 'destination' value is wrapped in '.*' on both sides to
# form the regex pattern stored in the 'field' column.
df_dict = pd.DataFrame({'destination': ['a', 'b', 'c', 'd']})
df_dict['field'] = '.*' + df_dict['destination'] + '.*'
df_dict.columns = ['destination', 'field']
Above: the sample DataFrame (`df`) and the regex lookup table (`df_dict`).
def merge_regex(df, df_dict, how, field):
    """Merge ``df`` with ``df_dict`` by matching regex patterns against a column.

    Every pattern in ``df_dict[field]`` is tested with ``re.match`` semantics
    (anchored at the start of the string) against every value in
    ``df[field]``. For each match, the value from the FIRST column of
    ``df_dict`` is attached to the matching rows of ``df``.

    Args:
        df: DataFrame whose ``field`` column holds the strings to test.
        df_dict: Lookup DataFrame. Its first column holds the value to attach;
            its ``field`` column holds the regex patterns.
        how: Merge type forwarded to ``pandas.merge`` (e.g. ``'left'``).
        field: Name of the column present in both frames.

    Returns:
        The merged DataFrame with exact duplicate rows removed. A row of
        ``df`` matched by several patterns appears once per match. With
        ``how='left'``, unmatched rows are kept with a missing value in the
        attached column.

    Raises:
        re.error: If a pattern in ``df_dict[field]`` is not a valid regex.
    """
    import re

    df_dict = df_dict.drop_duplicates()

    values = list(df[field])
    pairs = []
    for dict_pos, raw_pattern in enumerate(df_dict[field]):
        # Compile once per pattern rather than once per (pattern, value) pair.
        pattern = re.compile(raw_pattern)
        pairs.extend(
            (dict_pos, df_pos)
            for df_pos, value in enumerate(values)
            # Non-string cells (NaN/None) cannot match; treat them as
            # unmatched instead of letting the regex engine raise TypeError.
            if isinstance(value, str) and pattern.match(value)
        )

    # Build the index lists explicitly: unpacking zip(*pairs) fails with a
    # ValueError when there are no matches at all.
    dict_rows = [dict_pos for dict_pos, _ in pairs]
    df_rows = [df_pos for _, df_pos in pairs]

    # Translation table: (attached value, exact string it matched). The
    # merge below is then a plain equality join on ``field``.
    attached = df_dict.iloc[:, 0].iloc[dict_rows].reset_index(drop=True)
    matched = df[field].iloc[df_rows].reset_index(drop=True)
    translation = pd.concat([attached, matched], axis=1)

    merged = pd.merge(
        df,
        translation,
        how=how,
        left_on=field,
        right_on=field,
    )
    return merged.drop_duplicates()