from numpy import *  # NOTE(review): star import is used to supply `log` below; prefer an explicit import
import operator

# Toy training set: each row is [hair length, voice quality, gender label].
dataSet = [
    ['长', '粗', '男'],
    ['短', '粗', '男'],
    ['短', '粗', '男'],
    ['长', '细', '女'],
    ['短', '细', '女'],
    ['短', '粗', '女'],
    ['长', '粗', '女'],
    ['长', '粗', '女'],
]
# The two feature names, aligned with the first two columns above.
data_features = ['头发', '声音']
# Compute the information entropy of a data set.
def calEntropy(dataSet):
    """Return the Shannon entropy of the label column (last element of
    each row), using the natural logarithm (i.e. measured in nats).

    dataSet: list of rows whose final element is the class label.
    """
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    entropy = 0.0
    for cnt in counts.values():
        prob = float(cnt) / total
        entropy -= prob * log(prob)  # numpy natural log (base does not change gain ranking)
    return entropy
# Split the data set on one column's value.
def split_data(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that
    column removed from each returned row (the feature is consumed)."""
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
# Compute the information gain of each feature and pick the best split.
def chooseFeature(dataSet, features):
    """Return (column_index, feature_name) of the feature with the
    largest information gain, or (-1, -1) when no feature gives a
    strictly positive gain."""
    baseEntropy = calEntropy(dataSet)
    best_index, best_name, best_gain = -1, -1, 0.0
    for col, name in enumerate(features):
        # Weighted entropy of the partition induced by this column.
        cond_entropy = 0.0
        for value in set(row[col] for row in dataSet):
            subset = split_data(dataSet, col, value)
            weight = float(len(subset)) / len(dataSet)
            cond_entropy += weight * calEntropy(subset)
        gain = baseEntropy - cond_entropy
        if gain > best_gain:
            best_index, best_name, best_gain = col, name, gain
    return best_index, best_name
# Majority vote: pick the most frequent class label.
def majorityCnt(classList):
    """Return the label occurring most often in classList (ties resolve
    to the label first inserted, since Python's sort is stable)."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
# Build the decision tree.
def creatTree(dataSet, features):
    """Recursively build an ID3 decision tree.

    dataSet:  rows of feature values with the class label last.
    features: feature names aligned with the columns of dataSet.
    Returns a class label (leaf) or a nested dict of the form
    {feature_name: {feature_value: subtree}}.

    NOTE: keeps the original (misspelled) public name `creatTree` so
    existing callers continue to work.
    """
    # FIX: operate on a copy. The original `del features[...]` mutated the
    # caller's list, silently stripping entries from the module-level
    # data_features after the top-level call.
    features = features[:]
    labels = [row[-1] for row in dataSet]
    if labels.count(labels[0]) == len(labels):
        # Every row has the same class: this branch is a pure leaf.
        return labels[0]
    if len(dataSet[0]) == 1:
        # Only the label column remains: fall back to majority vote.
        return majorityCnt(labels)
    bestFeature_index, bestFeature = chooseFeature(dataSet, features)
    values = set(row[bestFeature_index] for row in dataSet)
    new_label = features[bestFeature_index]
    del features[bestFeature_index]  # consumed by this split (local copy only)
    tree = {new_label: {}}
    for value in values:
        sublabel = features[:]  # each child gets its own copy of the remaining names
        tree[new_label][value] = creatTree(
            split_data(dataSet, bestFeature_index, value), sublabel)
    return tree


Tree = creatTree(dataSet, data_features)
print(Tree)  # {'声音': {'粗': {'头发': {'长': '女', '短': '男'}}, '细': '女'}}
# Author's note (translated): this classifier is imperfect — when recursing,
# if a second-level dict has multiple matching cases only the first applies.
# Classify a sample with the decision tree.
def classify(tree, label, testVec):
    """Walk the decision tree and return the predicted class label.

    tree:    nested dict returned by creatTree()
    label:   list of feature names (training-column order)
    testVec: feature values of the sample to classify
    Returns the predicted label, or None when no branch key matches.
    """
    firstFeat = list(tree.keys())[0]      # root feature of this subtree
    secondDict = tree[firstFeat]          # branches keyed by feature value
    labelIndex = label.index(firstFeat)   # which slot of testVec to compare
    classlabel = None
    for key in secondDict:
        if testVec[labelIndex] != key:
            continue
        branch = secondDict[key]
        if isinstance(branch, dict):
            # Internal node: descend into the matching subtree.
            classlabel = classify(branch, label, testVec)
        else:
            # Leaf node: the branch value is the class label itself.
            classlabel = branch
    return classlabel


class_ = classify(Tree, ['头发', '声音'], ['短', '粗'])
print(class_)
def classify1(tree, labels, testdata):
    """Alternative classifier: iterates over every root key of `tree`
    (equivalent to classify() for the single-root trees creatTree builds).
    Returns the predicted label, or None when no branch matches."""
    classlabel = None
    for feat in tree:
        branches = tree[feat]
        idx = labels.index(feat)
        for key, branch in branches.items():
            if testdata[idx] != key:
                continue
            if isinstance(branch, dict):
                classlabel = classify1(branch, labels, testdata)  # descend
            else:
                classlabel = branch  # leaf
    return classlabel


class_1 = classify1(Tree, ['头发', '声音'], ['短', '细'])
print(class_1)
print('===========================')
# Sanity check: re-classify every training row with both classifiers and
# print the two predictions side by side (they should agree).
# NOTE(review): relies on creatTree's mutation bug not clobbering this data —
# fresh feature-name lists are passed here, so it works either way.
for data in dataSet:
    test_data = data[:-1]  # drop the label column; keep only the features
    class_ = classify(Tree, ['头发', '声音'], test_data)
    print(class_)
    class_1 = classify1(Tree, ['头发', '声音'], test_data)
    print(class_1)
    print('------------------------------')