经典案例
决策树入门:计算示例数据集的香农熵(经验熵)
'''
Created on Oct 12, 2010
Decision Tree Source Code for Machine Learning in Action Ch. 3
@author: Peter Harrington
'''
from math import log
import operator
def createDataSet():
    """Build the small toy data set used by the entropy example.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of five rows;
        each row holds two 0/1 values followed by a ``'yes'``/``'no'``
        class label in the last position. ``labels`` is a list of two
        descriptive strings ('no surfacing' and 'flippers').
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    # Labels: "no surfacing" (cannot surface) and "flippers" (has flippers or not).
    labels = ['no surfacing', 'flippers']
    # change to discrete values
    # A comma-separated return packs both values into one tuple, which the
    # caller can unpack as ``dataSet, labels = createDataSet()``.
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in *dataSet*.

    Args:
        dataSet: A sequence of rows; the last element of each row is used
            as that row's class label.

    Returns:
        The entropy as a float: the sum of ``-p * log(p, 2)`` over the
        distinct labels, where ``p`` is the fraction of rows carrying that
        label. An empty data set yields ``0.0``.
    """
    numEntries = len(dataSet)  # number of rows in the data set
    labelCounts = {}  # label -> number of rows with that label
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the label is the last item of the row
        # dict.get supplies 0 for a label not seen yet, so no separate
        # membership test (and no per-row list of keys) is needed.
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0  # accumulator for the entropy
    for count in labelCounts.values():
        prob = float(count) / numEntries  # share of rows with this label
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
if __name__ == "__main__":
    # Demo: build the toy data set, print it, then print its Shannon entropy.
    myDat, labels = createDataSet()
    print(myDat)
    print(calcShannonEnt(myDat))
结果: