1 import pandas as pd
2 import numpy as np
3
4 # 加载数据——detail
5 detail_1 = pd.read_excel("./meal_order_detail.xlsx", sheetname=0)
6 detail_2 = pd.read_excel("./meal_order_detail.xlsx", sheetname=1)
7 detail_3 = pd.read_excel("./meal_order_detail.xlsx", sheetname=2)
8
9 print("detail_1 的形状: \n", detail_1.shape)
10 print("detail_1 的列索引: \n", detail_1.columns)
11 print("detail_2 的形状: \n", detail_2.shape)
12 print("detail_2 的列索引: \n", detail_2.columns)
13 print("detail_3 的形状: \n", detail_3.shape)
14 print("detail_3 的列索引: \n", detail_3.columns)
15
16 print("~"*60)
17 # 将detail_2, detail_3直接追加到detaiL_1下面
18 detail = pd.concat((detail_1, detail_2, detail_3), axis=0, join="inner")
19 print("detail的形状;\n", detail.shape)
20
21
22 # 加载info
23 info = pd.read_csv("./meal_order_info.csv", encoding="ansi")
24 print("info: \n", info.shape)
25
26 # info与detail进行主键拼接
27 res = pd.merge(left=detail, right=info, left_on="order_id", right_on="info_id", how="inner")
28 res = pd.merge(left=detail, right=info, left_on="order_id", right_on="info_id", how="left")
29 print("info与detail主键拼接的结果为: \n", res.shape)
30 print("res的列名: \n", res.columns)
31
32
33 # 加载users
34 users = pd.read_excel("./users.xlsx")
35 # info与detail进行主键拼接的结果与users进行主键拼接
36 res = pd.merge(left=res, right=users, left_on="name", right_on="ACCOUNT", how="inner")
37 print("最终进行主键拼接的结果: \n", res)
38 print("最终res的列名称: \n", res.columns)
39
40 print("name与ACCOUNT对比相同", np.all(res.loc[:, "name"] == res.loc[:, "ACCOUNT"]))
41 print("order_id与info_id对比相同", np.all(res.loc[:, "order_id"] == res.loc[:, "info_id"]))
42 print("emp_id_x与emp_id_y对比相同", np.all(res.loc[:, "emp_id_x"] == res.loc[:, "emp_id_y"]))
43
44 res.drop(labels=["ACCOUNT", "info_id", "emp_id_y"], axis=1, inplace=True)
45
46 print("删除3列之后的结果: \n", res.shape)
47 print("删除3列之后的结果: \n", res.columns)
48
49 drop_list = []
50 for column in res.columns:
51 # 统计每一列的非空数据的数量
52 res_count = res.loc[:, column].count()
53 # 如果整列非空数据的数量为0,意味着整列都是空的
54 if res_count == 0:
55 drop_list.append(column)
56
57 # 删除整列为空的列
58 res.drop(labels=drop_list, axis=1, inplace=True)
59 print("去除整列为空的数据之后的结果: \n", res.shape)
60 print("去除整列为空的数据之后的结果: \n", res.columns)
61
62 drop_dup_list = []
63 # 如果整列数据完全相同——该列, 该属性对于区分各列没有意义
64 for column in res.columns:
65 res_ = res.drop_duplicates(subset=column, inplace=False)
66 if res_.shape[0] == 1:
67 print("res_.shape[0]: \n", res_.shape[0])
68 drop_dup_list.append(column)
69
70 # 删除全部一样的列
71 res.drop(labels=drop_dup_list, axis=1, inplace=True)
72 print("最终的结果: \n", res.shape)
73 print("最终的结果: \n", res.columns)