# Original code adapted from "Machine Learning in Action"; comments and modifications are original.
from math import log
import operator
import KNN
def createDataSet():
    """Build the toy fish-classification dataset from the book example.

    Returns:
        tuple: (dataSet, labels) where dataSet is a list of
        [feature0, feature1, classLabel] rows and labels gives the
        human-readable meaning of feature 0 and feature 1.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']  # label for 0 or 1
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in *dataSet*.

    The last element of each row is treated as the class label
    (stringified before counting, as in the original).
    """
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        label = str(row[-1])  # last column holds the class
        counts[label] = counts.get(label, 0) + 1
    entropy = 0.0
    for count in counts.values():
        prob = count / float(total)
        entropy -= prob * log(prob, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Return the rows whose feature at *axis* equals *value*.

    The matched feature column is removed from each returned row, so the
    result has one fewer column than the input.
    """
    subset = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # Concatenate everything before and after the split column.
        subset.append(row[:axis] + row[axis + 1:])
    return subset
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index with the highest information gain (ID3).

    The last column of each row is the class label and is never considered
    as a split candidate. Returns -1 when no feature yields positive gain.
    """
    numFeatures = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split
    bestGain = 0.0
    bestFeature = -1
    for axis in range(numFeatures):
        values = {row[axis] for row in dataSet}  # distinct values of this feature
        # Weighted entropy of the partition induced by this feature.
        splitEntropy = 0.0
        for value in values:
            subset = splitDataSet(dataSet, axis, value)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain = gain
            bestFeature = axis
    return bestFeature
def majorityCnt(classList):
    """Return the most frequent class label in *classList*.

    Used when features are exhausted but the rows still disagree on the
    class: majority vote decides the leaf. Ties go to the first label
    seen, matching the stable reverse sort in the original (dicts keep
    insertion order and max() returns the first maximal key).
    """
    classCount = {}
    for label in classList:
        classCount[label] = classCount.get(label, 0) + 1
    return max(classCount, key=classCount.get)
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataSet: list of rows, each [feature0, ..., featureN, classLabel].
        labels: human-readable names for the feature columns. Bug fix:
            the original del'd entries from the caller's list, destroying
            it as a side effect; we now work on a copy so the caller's
            list is left intact.

    Returns:
        A class label (leaf), or a dict of the form
        {featureLabel: {featureValue: subtree, ...}}.
    """
    classList = [row[-1] for row in dataSet]  # last column of every row
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # all rows share one class: pure leaf
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # features exhausted: majority-vote leaf
    labels = labels[:]  # copy so the caller's label list is never mutated
    bestFeature = chooseBestFeatureToSplit(dataSet)
    bestFeatureLabel = labels[bestFeature]  # human-readable name of the split
    myTree = {bestFeatureLabel: {}}
    del labels[bestFeature]  # remaining labels now align with the reduced rows
    uniqueVals = set(row[bestFeature] for row in dataSet)
    for value in uniqueVals:
        # splitDataSet drops the chosen column, matching the reduced labels.
        subtree = createTree(splitDataSet(dataSet, bestFeature, value), labels[:])
        myTree[bestFeatureLabel][value] = subtree
    return myTree
if __name__ == '__main__':
    # Demo: build the decision tree for the toy dataset and print it.
    dataSet, featureLabels = createDataSet()
    print(createTree(dataSet, featureLabels))