import numpy as np
from math import log
(一)创建数据集
# Build the toy training set (the classic "is it a fish?" example).
def createDataSet():
    """Return (samples, labels, feature_names) for the demo dataset.

    Each sample is [can-survive-underwater, has-flippers] as 0/1 flags;
    the label is 1 for the positive class, 0 otherwise.
    """
    dataSet = [
        [1, 1],
        [1, 1],
        [1, 0],
        [0, 1],
        [0, 1],
    ]
    labels = [1, 1, 0, 0, 0]
    features_names = ['水下', '脚蹼']  # feature names ("underwater", "flippers")
    return dataSet, labels, features_names
(二)计算信息熵
# Shannon entropy of a label vector.
def calcEnt(data_Y):
    """Return the base-2 Shannon entropy of a numpy array of labels."""
    total = len(data_Y)
    ent = 0.0
    for val in np.unique(data_Y):
        # Proportion of samples carrying this label.
        p = np.count_nonzero(data_Y == val) / total
        ent -= p * log(p, 2)
    return ent
(三)根据信息增益获取特征
# Pick the feature column with the largest information gain.
def chooseBestFeature(data_X, data_Y):
    """Return the index of the feature whose split maximizes information gain.

    Returns -1 when no feature produces a strictly positive gain.
    """
    samp_num, fea_num = data_X.shape  # sample count, feature count
    base_ent = calcEnt(data_Y)
    best_feature = -1
    best_gain = 0.0
    for fea in range(fea_num):
        # Weighted entropy of the label subsets after splitting on this feature.
        cond_ent = 0.0
        for val in np.unique(data_X[:, fea]):
            sub_Y = data_Y[np.where(data_X[:, fea] == val)]
            cond_ent += sub_Y.size / samp_num * calcEnt(sub_Y)
        gain = base_ent - cond_ent
        if gain > best_gain:
            best_gain = gain
            best_feature = fea
    return best_feature
(四)按特征和特征的值进行子集划分
# Split off the rows where feature `fea_axis` equals `fea_val`; the returned
# subset no longer contains that feature column.
def splitDataByFeature(data_X, data_Y, fea_idx, fea_axis, fea_val):
    """Return (sub_X, sub_Y, sub_fea_idx) for the samples matching fea_val.

    fea_idx carries the ORIGINAL feature indices so deeper recursion levels
    can still name features by their position in the full dataset.
    Note: sub_Y is taken before the column is removed, so sub_X can end up
    with zero columns while sub_Y is non-empty.
    """
    mask = data_X[:, fea_axis] == fea_val
    new_dataY = data_Y[np.where(mask)]
    new_dataX = np.delete(data_X[np.where(mask)], fea_axis, axis=1)  # drop the used column
    new_feaIdx = np.delete(fea_idx, fea_axis, axis=0)                # drop its original index too
    return new_dataX, new_dataY, new_feaIdx
(五)创建决策树
# Recursively build the ID3 decision tree.
def createTree(data_X, data_Y, fea_idx):
    """Build an ID3 tree over (data_X, data_Y).

    Returns either a class label (leaf) or a nested dict of the form
    {original_feature_index: {feature_value: subtree_or_leaf}}.
    """
    y_valNums = np.unique(data_Y)
    if y_valNums.size == 1:
        # Pure node: every sample shares one label.
        # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int does the same conversion.
        return int(data_Y[0])
    if data_X.shape[1] == 0:
        # All features were consumed on this path: fall back to majority vote.
        bestCls, bestCnt = 0, 0
        for cls in y_valNums:
            cnt = data_Y[np.where(data_Y == cls)].size
            if cnt > bestCnt:
                # BUGFIX: the original never updated bestCnt, so the "vote"
                # degenerated to "last class with any samples wins".
                bestCls, bestCnt = cls, cnt
        return bestCls
    # Recurse on the feature with the largest information gain.
    BestFeature = chooseBestFeature(data_X, data_Y)
    my_tree = {fea_idx[BestFeature]: {}}
    for val in np.unique(data_X[:, BestFeature]):
        # Y is extracted before the column is deleted, so new_dataX may have
        # zero columns while new_dataY is non-empty (handled by the vote above).
        new_dataX, new_dataY, new_feaIdx = splitDataByFeature(
            data_X, data_Y, fea_idx, BestFeature, val)
        my_tree[fea_idx[BestFeature]][val] = createTree(new_dataX, new_dataY, new_feaIdx)
    return my_tree
(六)实现预测
def classify(inputTree, testVec):
    """Walk the tree until a leaf is reached; return its class label.

    inputTree is {feature_index: {feature_value: subtree_or_leaf}};
    testVec is indexable by the original feature index.
    Implicitly returns None when testVec's value has no matching branch.
    """
    rootTag = list(inputTree.keys())[0]  # feature index tested at this node
    branches = inputTree[rootTag]        # value -> subtree/leaf mapping
    for val in branches:
        if val == testVec[rootTag]:
            subtree = branches[val]
            if isinstance(subtree, dict):
                return classify(subtree, testVec)  # internal node: keep descending
            return subtree                         # leaf: the class label
(七)进行测试
# --- Demo: train on the toy set, then classify one user-entered sample. ---
data_x, data_y, fea_names = createDataSet()
data_X = np.array(data_x)
data_Y = np.array([data_y]).T               # labels as a column vector
fea_Idx = np.arange(len(fea_names))         # original feature indices
myTree = createTree(data_X, data_Y, fea_Idx)
print(myTree)
testData = np.zeros(len(fea_names))
for pos, name in enumerate(fea_names):
    testData[pos] = input("{}(0/1)>:".format(name))
print(classify(myTree, testData))


(八)全部代码

import numpy as np
from math import log
# Build the toy training set (the classic "is it a fish?" example).
def createDataSet():
    """Return (samples, labels, feature_names) for the demo dataset.

    Each sample is [can-survive-underwater, has-flippers] as 0/1 flags;
    the label is 1 for the positive class, 0 otherwise.
    """
    dataSet = [
        [1, 1],
        [1, 1],
        [1, 0],
        [0, 1],
        [0, 1],
    ]
    labels = [1, 1, 0, 0, 0]
    features_names = ['水下', '脚蹼']  # feature names ("underwater", "flippers")
    return dataSet, labels, features_names
# Shannon entropy of a label vector.
def calcEnt(data_Y):
    """Return the base-2 Shannon entropy of a numpy array of labels."""
    total = len(data_Y)
    ent = 0.0
    for val in np.unique(data_Y):
        # Proportion of samples carrying this label.
        p = np.count_nonzero(data_Y == val) / total
        ent -= p * log(p, 2)
    return ent
# Pick the feature column with the largest information gain.
def chooseBestFeature(data_X, data_Y):
    """Return the index of the feature whose split maximizes information gain.

    Returns -1 when no feature produces a strictly positive gain.
    """
    samp_num, fea_num = data_X.shape  # sample count, feature count
    base_ent = calcEnt(data_Y)
    best_feature = -1
    best_gain = 0.0
    for fea in range(fea_num):
        # Weighted entropy of the label subsets after splitting on this feature.
        cond_ent = 0.0
        for val in np.unique(data_X[:, fea]):
            sub_Y = data_Y[np.where(data_X[:, fea] == val)]
            cond_ent += sub_Y.size / samp_num * calcEnt(sub_Y)
        gain = base_ent - cond_ent
        if gain > best_gain:
            best_gain = gain
            best_feature = fea
    return best_feature
# Split off the rows where feature `fea_axis` equals `fea_val`; the returned
# subset no longer contains that feature column.
def splitDataByFeature(data_X, data_Y, fea_idx, fea_axis, fea_val):
    """Return (sub_X, sub_Y, sub_fea_idx) for the samples matching fea_val.

    fea_idx carries the ORIGINAL feature indices so deeper recursion levels
    can still name features by their position in the full dataset.
    Note: sub_Y is taken before the column is removed, so sub_X can end up
    with zero columns while sub_Y is non-empty.
    """
    mask = data_X[:, fea_axis] == fea_val
    new_dataY = data_Y[np.where(mask)]
    new_dataX = np.delete(data_X[np.where(mask)], fea_axis, axis=1)  # drop the used column
    new_feaIdx = np.delete(fea_idx, fea_axis, axis=0)                # drop its original index too
    return new_dataX, new_dataY, new_feaIdx
# Recursively build the ID3 decision tree.
def createTree(data_X, data_Y, fea_idx):
    """Build an ID3 tree over (data_X, data_Y).

    Returns either a class label (leaf) or a nested dict of the form
    {original_feature_index: {feature_value: subtree_or_leaf}}.
    """
    y_valNums = np.unique(data_Y)
    if y_valNums.size == 1:
        # Pure node: every sample shares one label.
        # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
        # the builtin int does the same conversion.
        return int(data_Y[0])
    if data_X.shape[1] == 0:
        # All features were consumed on this path: fall back to majority vote.
        bestCls, bestCnt = 0, 0
        for cls in y_valNums:
            cnt = data_Y[np.where(data_Y == cls)].size
            if cnt > bestCnt:
                # BUGFIX: the original never updated bestCnt, so the "vote"
                # degenerated to "last class with any samples wins".
                bestCls, bestCnt = cls, cnt
        return bestCls
    # Recurse on the feature with the largest information gain.
    BestFeature = chooseBestFeature(data_X, data_Y)
    my_tree = {fea_idx[BestFeature]: {}}
    for val in np.unique(data_X[:, BestFeature]):
        # Y is extracted before the column is deleted, so new_dataX may have
        # zero columns while new_dataY is non-empty (handled by the vote above).
        new_dataX, new_dataY, new_feaIdx = splitDataByFeature(
            data_X, data_Y, fea_idx, BestFeature, val)
        my_tree[fea_idx[BestFeature]][val] = createTree(new_dataX, new_dataY, new_feaIdx)
    return my_tree
def classify(inputTree, testVec):
    """Walk the tree until a leaf is reached; return its class label.

    inputTree is {feature_index: {feature_value: subtree_or_leaf}};
    testVec is indexable by the original feature index.
    Implicitly returns None when testVec's value has no matching branch.
    """
    rootTag = list(inputTree.keys())[0]  # feature index tested at this node
    branches = inputTree[rootTag]        # value -> subtree/leaf mapping
    for val in branches:
        if val == testVec[rootTag]:
            subtree = branches[val]
            if isinstance(subtree, dict):
                return classify(subtree, testVec)  # internal node: keep descending
            return subtree                         # leaf: the class label
# --- Demo: train on the toy set, then classify one user-entered sample. ---
data_x, data_y, fea_names = createDataSet()
data_X = np.array(data_x)
data_Y = np.array([data_y]).T               # labels as a column vector
fea_Idx = np.arange(len(fea_names))         # original feature indices
myTree = createTree(data_X, data_Y, fea_Idx)
print(myTree)
testData = np.zeros(len(fea_names))
for pos, name in enumerate(fea_names):
    testData[pos] = input("{}(0/1)>:".format(name))
print(classify(myTree, testData))
View Code