ID3决策树以及python实现
ID3决策树是以信息增益作为标准来划分子节点的决策树,下面用python实现决策树
实现代码
- getEntD计算数据集的信息熵Ent(D)(信息增益在getBestFeature中由熵之差得到)
#ID3 Decision Tree
from collections import Counter
from math import log
def getEntD(dataset):
    """Return the information entropy Ent(D) of the dataset.

    The class label is the last element of every example. An empty
    dataset yields entropy 0, matching the original behaviour.
    """
    totalNum = len(dataset)
    ent = 0.0
    # Counter replaces the hand-rolled frequency dict of the original.
    for count in Counter(data[-1] for data in dataset).values():
        p = float(count) / totalNum
        ent -= p * log(p, 2)
    return ent
- 根据特征的值划分数据集
def splitDataset(dataset, i, value):
    """Return the rows whose i-th feature equals value, with column i removed.

    The original copied every row (including non-matching ones) before
    testing; this version copies only the rows that are kept.
    """
    return [data[:i] + data[i + 1:] for data in dataset if data[i] == value]
- 选择最佳的分划特征
def getBestFeature(dataset):
    """Return the index of the feature with the largest information gain.

    dataset: examples whose last element is the class label.
    Returns -1 when no feature has a strictly positive gain (same as
    the original's initial value).
    """
    featureNum = len(dataset[0]) - 1
    totalNum = len(dataset)
    baseEnt = getEntD(dataset)  # Ent(D) before any split
    bestGain = 0
    bestFeature = -1
    for i in range(featureNum):
        # BUG FIX: build the candidate value set fresh for every feature.
        # The original accumulated values across features in one list, so
        # later features were also "split" on stale values of earlier
        # features (producing useless empty subsets).
        values = {data[i] for data in dataset}
        condEnt = 0.0
        for v in values:
            subset = splitDataset(dataset, i, v)
            p = len(subset) / float(totalNum)
            condEnt += p * getEntD(subset)
        gain = baseEnt - condEnt
        if gain > bestGain:
            bestGain = gain
            bestFeature = i
    return bestFeature
- 返回数据集里面数量最多的类
def getMajorityClass(dataset):
    """Return (label, count) pairs sorted by count, descending.

    Counter.most_common() orders equal counts by first encounter, the
    same ordering the original stable sort produced.
    """
    return Counter(data[-1] for data in dataset).most_common()
- 树的结构以字典的方式表示,以递归的方式生成决策树
#create the tree using the dictionary
def getTree(dataset, label):
    """Recursively build an ID3 decision tree as nested dictionaries.

    dataset: examples, last column is the class label.
    label: feature names aligned with the feature columns of dataset.
    A leaf is a class label; an internal node is
    {feature_name: {feature_value: subtree}}.
    """
    labelTmp = label.copy()
    classes = [data[-1] for data in dataset]
    # All examples share one class: return it as a leaf.
    if classes.count(classes[0]) == len(classes):
        return classes[0]
    # No feature left to split on: return the majority class.
    # (Reuses getMajorityClass instead of duplicating the counting code.)
    if len(label) == 0 or len(dataset[0]) == 1:
        return getMajorityClass(dataset)[0][0]
    bestFeature = getBestFeature(dataset)
    bestLabel = labelTmp[bestFeature]
    del labelTmp[bestFeature]
    tree = {bestLabel: {}}
    for value in {data[bestFeature] for data in dataset}:
        subset = splitDataset(dataset, bestFeature, value)
        tree[bestLabel][value] = getTree(subset, labelTmp)
    return tree
预剪枝
- 预剪枝降低了过拟合的风险和训练以及测试时间的开销,但产生了欠拟合的风险。下面给出预剪枝的实现,在原来函数的基础上需要增加几个函数。
- 测试按该特征划分后验证集精度是否不低于划分前:若是,返回True(保留划分,不剪枝);否则返回False(剪枝,用多数类代替)。
def testPruning(dataset, bestFeature):
    """Decide on the validation set whether splitting on bestFeature pays off.

    Returns True when splitting keeps or improves accuracy (keep the
    split), False when it lowers accuracy (prune to a leaf). An empty
    validation set defaults to True.
    """
    totalNum = len(dataset)
    if totalNum == 0:
        return True
    # Accuracy of predicting the overall majority class (no split).
    majority = getMajorityClass(dataset)[0][0]
    featureValues = []
    beforeCorrect = 0
    for row in dataset:
        featureValues.append(row[bestFeature])
        if row[-1] == majority:
            beforeCorrect += 1
    beforeAcc = beforeCorrect / float(totalNum)
    # Accuracy when each branch predicts its own majority class.
    afterCorrect = 0
    for v in set(featureValues):
        branch = splitDataset(dataset, bestFeature, v)
        branchMajority = getMajorityClass(branch)[0][0]
        afterCorrect += sum(1 for row in branch if row[-1] == branchMajority)
    return afterCorrect / float(totalNum) >= beforeAcc
- 生成决策树
#prune(pre) the Decision Tree
def getTreeAfterPruning(dataset, testDataset, label):
    """Build an ID3 decision tree with pre-pruning against a validation set.

    dataset: training examples (last column is the class label).
    testDataset: validation examples, split in parallel with dataset so
    feature indices stay aligned.
    label: feature names aligned with the feature columns.
    At each node testPruning decides whether the split improves
    validation accuracy; if not, the node becomes a majority-class leaf.
    """
    labelTmp = label.copy()
    classes = [data[-1] for data in dataset]
    # All examples share one class: return it as a leaf.
    if classes.count(classes[0]) == len(classes):
        return classes[0]
    # No feature left: majority class (reuses getMajorityClass instead of
    # duplicating the counting code, consistent with the rest of the file).
    if len(label) == 0 or len(dataset[0]) == 1:
        return getMajorityClass(dataset)[0][0]
    bestFeature = getBestFeature(dataset)
    bestLabel = labelTmp[bestFeature]
    # testPruning returns True when splitting helps on the validation data;
    # otherwise prune this node to a majority-class leaf.
    if not testPruning(testDataset, bestFeature):
        return getMajorityClass(dataset)[0][0]
    del labelTmp[bestFeature]
    tree = {bestLabel: {}}
    for value in {data[bestFeature] for data in dataset}:
        subset = splitDataset(dataset, bestFeature, value)
        testSubset = splitDataset(testDataset, bestFeature, value)
        tree[bestLabel][value] = getTreeAfterPruning(subset, testSubset, labelTmp)
    return tree
进行分类
- 用的是西瓜数据集,输出决策树、准确率
- 计算准确率
def getAccuracy(predictRel, dataset):
    """Return the fraction of predictions matching the last-column labels.

    predictRel: predicted class labels, aligned index-by-index with dataset.
    BUG FIX: an empty dataset now returns 0.0 instead of raising
    ZeroDivisionError.
    """
    totalNum = len(dataset)
    if totalNum == 0:
        return 0.0
    correct = sum(1 for pred, data in zip(predictRel, dataset) if pred == data[-1])
    return correct / float(totalNum)
- 用生成的决策树进行分类
def getClass(dataset, label, dictRel):
    """Classify each example in dataset using the dict-based decision tree.

    label: feature names, used to map tree keys back to column indices.
    dictRel: tree as produced by getTree/getTreeAfterPruning.
    Returns the list of predicted class labels.
    GENERALIZED: descends until any non-dict leaf is reached, instead of
    hard-coding the class labels as loop sentinels like the original
    (which only worked for the labels of this specific dataset).
    """
    predictRel = []
    for data in dataset:
        node = dictRel
        while isinstance(node, dict):
            # Each internal node has exactly one key: the feature name.
            featureName = next(iter(node))
            fNum = label.index(featureName)
            node = node[featureName][data[fNum]]
        predictRel.append(node)
    return predictRel
- 主函数
if __name__ == "__main__":
    # Watermelon dataset: 6 categorical features + class label ('是'/'否').
    x = [['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'], ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],
         ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'], ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],
         ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'], ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '是'],
         ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是'], ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '是'],
         ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '否'], ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'],
         ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '否'], ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '否'],
         ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '否'], ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'],
         ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'], ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否'],
         ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否']]
    y = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
    label = y
    # First 20% of the rows form the validation set, the rest train.
    testSize = int(0.2 * len(x))
    testDataset = x[:testSize]
    dataset = x[testSize:]
    # Unpruned ID3 tree and its validation accuracy.
    dictRel = getTree(dataset, label.copy())
    print(dictRel)
    predictRel = getClass(testDataset, label.copy(), dictRel)
    print(getAccuracy(predictRel, testDataset))
    # Pre-pruned tree and its validation accuracy.
    dictRel_1 = getTreeAfterPruning(dataset, testDataset, label.copy())
    print(dictRel_1)
    predictRel = getClass(testDataset, label.copy(), dictRel_1)
    print(getAccuracy(predictRel, testDataset))
结果
- 每组结果中,前两行(决策树与准确率)是未进行预剪枝的结果,后两行是进行预剪枝的结果
- 训练集是数据集总数的80%
{'纹理': {'稍糊': {'触感': {'软粘': '是', '硬滑': '否'}}, '清晰': {'根蒂': {'蜷缩': '是', '硬挺': '否', '稍蜷': {'色泽': {'青绿': '是', '乌黑': {'触感': {'硬滑': '是', '软粘': '否'}}}}}}, '模糊': '否'}}
1.0
{'纹理': {'稍糊': {'触感': {'软粘': '是', '硬滑': '否'}}, '清晰': {'根蒂': {'蜷缩': '是', '硬挺': '否', '稍蜷': {'色泽': {'青绿': '是', '乌黑': {'触感': {'硬滑': '是', '软粘': '否'}}}}}}, '模糊': '否'}}
1.0
- 训练集是数据集总数的75%
{'敲声': {'浊响': {'纹理': {'清晰': {'色泽': {'乌黑': {'触感': {'硬滑': '是', '软粘': '否'}}, '浅白': '是', '青绿': '是'}}, '稍糊': {'色泽': {'乌黑': '是', '青绿': '否'}}, '模糊': '否'}}, '清脆': '否', '沉闷': '否'}}
0.5
{'敲声': {'浊响': {'纹理': {'清晰': {'色泽': {'乌黑': {'触感': {'硬滑': '是', '软粘': '否'}}, '浅白': '是', '青绿': '是'}}, '稍糊': {'色泽': {'乌黑': '是', '青绿': '否'}}, '模糊': '否'}}, '清脆': '否', '沉闷': '否'}}
0.5