# Original code adapted from "Machine Learning in Action"; comments and modifications are original.
from math import log
import operator
import KNN
def createDataSet():
    """Build the toy fish-classification dataset from the book example.

    Returns:
        tuple: (dataSet, labels) where dataSet is a list of
        [feature0, feature1, classLabel] rows and labels gives the
        human-readable meaning of feature 0 and feature 1.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']  # label for 0 or 1
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in *dataSet*.

    The last element of each row is treated as the class label
    (stringified before counting, as in the original).
    """
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        label = str(row[-1])  # last column holds the class
        counts[label] = counts.get(label, 0) + 1
    entropy = 0.0
    for count in counts.values():
        prob = count / float(total)
        entropy -= prob * log(prob, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at *axis* equals *value*.

    The matched feature column is removed from each returned row, so the
    result has one fewer column than the input.
    """
    subset = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # Concatenate everything before and after the split column.
        subset.append(row[:axis] + row[axis + 1:])
    return subset
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index with the highest information gain (ID3).

    The last column of each row is the class label and is never considered
    as a split candidate. Returns -1 when no feature yields positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestGain = 0.0
    bestFeature = -1
    for axis in range(numFeatures):
        values = {row[axis] for row in dataSet}  # distinct values of this feature
        # Weighted entropy of the partition induced by this feature.
        splitEntropy = 0.0
        for value in values:
            subset = splitDataSet(dataSet, axis, value)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain = gain
            bestFeature = axis
    return bestFeature
def majorityCnt(classList):
    """Return the most frequent class label in *classList*.

    Used when features are exhausted but the rows still disagree on the
    class: majority vote decides the leaf. Ties go to the first label
    seen, matching the stable reverse sort in the original (dicts keep
    insertion order and max() returns the first maximal key).
    """
    classCount = {}
    for label in classList:
        classCount[label] = classCount.get(label, 0) + 1
    return max(classCount, key=classCount.get)
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataSet: list of rows, each [feature0, ..., featureN, classLabel].
        labels: human-readable names for the feature columns. Bug fix:
            the original del'd entries from the caller's list, destroying
            it as a side effect; we now work on a copy so the caller's
            list is left intact.

    Returns:
        A class label (leaf), or a dict of the form
        {featureLabel: {featureValue: subtree, ...}}.
    """
    classList = [row[-1] for row in dataSet]  # last column of every row
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all rows share one class: pure leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # features exhausted: majority-vote leaf
    labels = labels[:]  # copy so the caller's label list is never mutated
    bestFeature = chooseBestFeatureToSplit(dataSet)
    bestFeatureLabel = labels[bestFeature]  # human-readable name of the split
    myTree = {bestFeatureLabel: {}}
    del labels[bestFeature]  # remaining labels now align with the reduced rows
    uniqueVals = set(row[bestFeature] for row in dataSet)
    for value in uniqueVals:
        # splitDataSet drops the chosen column, matching the reduced labels.
        subtree = createTree(splitDataSet(dataSet, bestFeature, value), labels[:])
        myTree[bestFeatureLabel][value] = subtree
    return myTree
if __name__ == '__main__':
    # Demo: build the decision tree for the toy dataset and print it.
    dataSet, featureLabels = createDataSet()
    print(createTree(dataSet, featureLabels))