from math import log2
from collections import Counter
import numpy as np
# 1.1 Building the decision tree
# 1.1.1 Information gain
def ent(dataset):
    """Return the Shannon entropy (base 2) of the class labels in *dataset*.

    The last column of each row is taken as the class label.
    """
    data = np.array(dataset)
    total = data.shape[0]
    # Frequency of each distinct class label in the final column.
    label_counts = Counter(data[:, -1])
    # H = -sum(p * log2(p)) over the label probabilities p = count / total.
    return -sum((count / total) * log2(count / total)
                for count in label_counts.values())
def split_dataset(dataset, axis, value):
    """Select the rows whose feature at column *axis* equals *value*,
    then drop that feature column from the result.

    Note: np.array() coerces every entry to a common (string) dtype.
    """
    data = np.array(dataset)
    # Boolean mask picks the matching rows.
    matching_rows = data[data[:, axis] == value]
    # Remove the column we just split on.
    return np.delete(matching_rows, axis, axis=1)
# NOTE(review): create_dataset() is defined elsewhere (not in this chunk);
# presumably it returns the toy dataset plus its feature labels — confirm.
my_data,labels=create_dataset()
# Smoke checks of split_dataset on the first feature; results are discarded.
split_dataset(my_data,0,'1')
split_dataset(my_data,0,'0')
def choose_best_feature2split(dataset):
    """Return the index of the feature with the highest information gain.

    Each row of *dataset* is a feature vector followed by the class label
    in the last column.  Returns 0 when no split yields a positive gain.
    """
    dataset = np.array(dataset)
    m, n = dataset.shape
    base_ent = ent(dataset)
    best_info_gain = 0.0
    # Bug fix: best_feature was never initialized, so a dataset where no
    # split improved on the base entropy raised NameError at the return.
    best_feature = 0
    for i in range(n - 1):  # last column holds the class labels
        feature_counts = Counter(dataset[:, i])
        # P(feature i == k) for every distinct value k.
        feature_prob = {k: v / m for (k, v) in feature_counts.items()}
        # Entropy of each subset obtained by splitting on value k.
        feature_ent = {k: ent(split_dataset(dataset, i, k))
                       for k in feature_counts}
        # Conditional entropy H(label | feature i).
        feature_cond_ent = sum(feature_prob[k] * feature_ent[k]
                               for k in feature_counts)
        info_gain = base_ent - feature_cond_ent
        # Keep the feature with the largest gain over the base entropy.
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature
choose_best_feature2split(my_data)
# 1.1.3 Recursively building the decision tree
def majority_cnt(class_list):
    """Return the most frequent class label in *class_list*."""
    # most_common(1) yields [(label, count)] for the top label.
    return Counter(class_list).most_common(1)[0][0]
def create_tree(dataset, labels):
    """Recursively build an ID3 decision tree.

    dataset: rows of feature values with the class label in the last column.
    labels:  feature names aligned with the feature columns.
    Returns a nested dict {feature_label: {feature_value: subtree_or_label}},
    or a bare class label at a leaf.

    Bug fix: the original did `del labels[best_feature]`, silently mutating
    the caller's list (the script then had to rebuild `labels` afterwards).
    This version never modifies the caller's argument.
    """
    dataset = np.array(dataset)
    class_list = dataset[:, -1]
    # Leaf: every remaining sample has the same class.
    if len(set(class_list)) == 1:
        return class_list[0]
    # Leaf: no features left to split on — fall back to a majority vote.
    if len(dataset[0]) == 1:
        return majority_cnt(class_list)
    best_feature = choose_best_feature2split(dataset)
    best_feature_label = labels[best_feature]
    my_tree = {best_feature_label: {}}
    # Remaining labels for the recursion, built as a copy instead of
    # deleting from the caller's list in place.
    sub_labels = labels[:best_feature] + labels[best_feature + 1:]
    for val in set(dataset[:, best_feature]):
        my_tree[best_feature_label][val] = create_tree(
            split_dataset(dataset, best_feature, val), sub_labels)
    return my_tree
my_tree=create_tree(my_data,labels)
# 1.2 Testing and storing the classifier
# 1.2.1 Testing
def predict(input_tree, feature_labels, test_vec):
    """Classify *test_vec* by walking *input_tree*.

    Returns the class label stored at the matching leaf, or None when the
    feature value in test_vec never appears among the tree's branches.
    """
    # The tree's single top-level key names the feature tested at this node.
    root_label = next(iter(input_tree))
    branches = input_tree[root_label]
    # Position of that feature inside test_vec.
    idx = feature_labels.index(root_label)
    value = test_vec[idx]
    for branch_value, subtree in branches.items():
        if value != branch_value:
            continue
        # Internal node -> recurse; leaf -> return the stored class label.
        if isinstance(subtree, dict):
            return predict(subtree, feature_labels, test_vec)
        return subtree
# Rebuild the toy dataset and labels, then spot-check the classifier.
my_data, labels = create_dataset()
predict(my_tree, labels, ['1', '0'])
predict(my_tree, labels, ['1', '1'])

# Bug fix: use a context manager so the file handle is closed
# deterministically (the original left `fr` open).
with open('lenses.txt') as fr:
    # One row per line; tab-separated feature values with the label last.
    lenses = [inst.strip().split('\t') for inst in fr]
lenses_labels = ['age', 'prescript', 'astigmatic', 'tear_rate']
# Build a decision tree for the contact-lenses dataset (result discarded).
create_tree(lenses, lenses_labels)