优秀相关博客参考链接:http://www.cnblogs.com/pinard/p/6053344.html
一、基础知识——信息熵与条件信息熵
二、决策树的定义与直观理解
三、决策树类库介绍——DecisionTreeClassifier 和 DecisionTreeRegressor
-
#!/usr/bin/env python
-
# -*- coding:utf-8 -*-
-
# Author:ZhengzhengLiu
-
-
#鸢尾花数据分类——决策树
-
-
from sklearn
import tree
#决策树
-
from sklearn.tree
import DecisionTreeClassifier
#决策分类树
-
from sklearn.model_selection
import train_test_split
-
from sklearn.model_selection
import GridSearchCV
#网格搜索交叉验证
-
from sklearn.pipeline
import Pipeline
#管道
-
from sklearn.preprocessing
import MinMaxScaler
#数据归一化
-
from sklearn.feature_selection
import SelectKBest
#特征选择
-
from sklearn.feature_selection
import chi2
#卡方统计量
-
from sklearn.decomposition
import PCA
#主成分分析
-
import numpy
as np
-
import pandas
as pd
-
import matplotlib
as mpl
-
import matplotlib.pyplot
as plt
-
-
# Work around matplotlib's Chinese-text rendering problem: use the SimHei
# font for sans-serif text and disable the Unicode minus sign for negative
# tick labels.
mpl.rcParams.update({
    "font.sans-serif": [u"simHei"],
    "axes.unicode_minus": False,
})
-
-
# Load the iris data set. header=None tells pandas the file has no header
# row, so the columns get integer labels.
path = "./datas/iris.data"
data = pd.read_csv(path, header=None)

# Feature names in English and Chinese, and the class labels.
iris_feature_E = (
    "sepal length",
    "sepal width",
    "petal length",
    "petal width",
)
iris_feature_C = (
    u"花萼长度",
    u"花萼宽度",
    u"花瓣长度",
    u"花瓣宽度",
)
iris_class = (
    "Iris-setosa",
    "Iris-versicolor",
    "Iris-virginica",
)
-
-
# Separate the features from the target.
# Columns 0-3 hold the feature values.
x = data[list(range(4))]

# Column 4 holds the class name as repeated text; pd.Categorical encodes it
# and .codes gives the integer code of each row's class (0, 1, 2).
y = pd.Categorical(data[4]).codes

# x.shape is (rows, columns), matching the two %d placeholders.
print("样本总数:%d;特征属性数目:%d" % x.shape)
print(y)
-
-
# Split into training and test sets: 20% of the samples are held out, and
# the fixed random_state makes the split reproducible.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x, y, test_size=0.2, random_state=14
)

# Working names for the first experiment. They are rebound by the
# preprocessing steps below, while the *1 names keep the raw split for reuse.
x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1

print("训练数据集样本总数:%d;测试数据集样本总数:%d" % (len(x_train), len(x_test)))
-
-
# Rescale the features with MinMaxScaler (min-max normalisation). The scaler
# is fitted on the training split only and then applied to both splits.
ss = MinMaxScaler()
ss.fit(x_train, y_train)
x_train = ss.transform(x_train)
x_test = ss.transform(x_test)

# Per-feature offset and scale factor learned from the training data.
print("原始数据各个特征的调整最小值:", ss.min_)
print("原始数据各个特征的缩放数据值:", ss.scale_)
-
-
# Feature selection: keep the existing features that influence the target
# the most.
# Common scoring methods:
#   classification: F statistic, chi-squared, mutual information
#                   (mutual_info_classif)
#   continuous:     Pearson correlation, F statistic, mutual information
#
# SelectKBest with the chi-squared score keeps the k best features. k
# defaults to 10; here 3 of the 4 original features are kept.
ch2 = SelectKBest(chi2, k=3)
ch2.fit(x_train, y_train)           # score the features on the training data
x_train = ch2.transform(x_train)    # keep the selected columns
x_test = ch2.transform(x_test)

# Integer positions of the selected features.
select_name_index = ch2.get_support(indices=True)

# With the default indices=False, get_support returns a boolean mask.
print("对类别判别影响最大的三个特征属性分别是:", ch2.get_support())
print(select_name_index)
-
-
# Dimensionality reduction: with many features, model building gets more
# complex, so the data is projected from a high-dimensional space to a
# lower-dimensional one.
# Common methods:
#   PCA - principal component analysis (unsupervised); face recognition
#         usually runs PCA first
#   LDA - linear discriminant analysis (supervised); minimises the
#         within-class variance
#
# Two components are requested only to make the 2-D plot below convenient;
# normally the default can be left as is.
pca = PCA(n_components=2)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
-
-
# Build and train the decision tree. fit() returns the estimator itself, so
# construction and training are chained.
model = DecisionTreeClassifier(criterion="entropy", random_state=0).fit(
    x_train, y_train
)

# Predict the classes of the held-out samples.
y_test_hat = model.predict(x_test)

# The tree can be exported for rendering with Graphviz, e.g.:
#     with open("iris.dot", "w") as f:
#         tree.export_graphviz(model, out_file=f)

print("Score:", model.score(x_test, y_test))
print("Classes:", model.classes_)
-
-
# Build an N x N grid spanning both components of the train and test points,
# then classify every grid point to get the background of the plot.
N = 100

# Axis limits: the extremes over the training and test points together.
x1_min = min(x_train[:, 0].min(), x_test[:, 0].min())
x1_max = max(x_train[:, 0].max(), x_test[:, 0].max())
x2_min = min(x_train[:, 1].min(), x_test[:, 1].min())
x2_max = max(x_train[:, 1].max(), x_test[:, 1].max())

t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, N)
x1, x2 = np.meshgrid(t1, t2)  # grid sampling points

# One row per grid point: (component 1, component 2).
x_show = np.column_stack((x1.ravel(), x2.ravel()))

# Predict each grid point and fold the result back into the grid shape.
y_show_hat = model.predict(x_show).reshape(x1.shape)

print(y_show_hat.shape)
print(y_show_hat[0])
-
-
# Plot the decision regions with the test and training samples on top.
plt_light = mpl.colors.ListedColormap(["#A0FFA0", "#FFA0A0", "#A0A0FF"])
plt_dark = mpl.colors.ListedColormap(["g", "r", "b"])

plt.figure(facecolor="w")

# Background: predicted class of every grid point.
plt.pcolormesh(x1, x2, y_show_hat, cmap=plt_light)

# Test samples: large stars, drawn on top (zorder=10).
plt.scatter(
    x_test[:, 0],
    x_test[:, 1],
    c=y_test.ravel(),
    edgecolors="k",
    s=150,
    zorder=10,
    cmap=plt_dark,
    marker="*",
)

# Training samples: small default markers.
plt.scatter(
    x_train[:, 0],
    x_train[:, 1],
    c=y_train.ravel(),
    edgecolors="k",
    s=40,
    cmap=plt_dark,
)

plt.xlabel(u"特征属性1", fontsize=15)
plt.ylabel(u"特征属性2", fontsize=15)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
plt.grid(True)
plt.title(u"鸢尾花数据的决策树分类", fontsize=18)

# Save the figure before showing it.
plt.savefig("鸢尾花数据的决策树分类.png")
plt.show()
-
-
# Parameter tuning: chain the same four stages used above (scaling, feature
# selection, PCA, decision tree) into one pipeline for the grid search. The
# step names are the prefixes used by the parameter-grid keys.
pipe = Pipeline(steps=[
    ("mms", MinMaxScaler()),
    ("skb", SelectKBest(chi2)),
    ("pca", PCA()),
    ("decision", DecisionTreeClassifier()),
])
-
-
# Parameter grid for the pipeline search. Each key is
# "<pipeline step name>__<parameter name>"; each value lists the candidates.
parameters = {
    # Number of features kept by SelectKBest (the data has 4 features).
    "skb__k": [1, 2, 3, 4],
    # PCA: a float in (0, 1) is the fraction of variance to retain, and None
    # keeps every component. The previous candidate 1.0 is rejected by
    # current scikit-learn, which requires an int for values >= 1, so every
    # fit using it would fail.
    # NOTE(review): None assumes 1.0 meant "retain all variance"; use the
    # integer 1 instead if a single component was intended.
    "pca__n_components": [0.5, None],
    # Split-quality criterion of the decision tree.
    "decision__criterion": ["gini", "entropy"],
    # Maximum tree depth: 1 through 15.
    "decision__max_depth": list(range(1, 16)),
}
-
-
# Run the grid search on the raw split; the pipeline does its own
# preprocessing.
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1

# Cross-validated search over every combination in `parameters`.
gscv = GridSearchCV(pipe, param_grid=parameters)
gscv.fit(x_train2, y_train2)

print("最优参数列表:", gscv.best_params_)
print("score值:", gscv.best_score_)

# Predict the held-out samples with the searched model.
y_test_hat2 = gscv.predict(x_test2)
-
-
# Rebuild the stages by hand with fixed settings: k=2 selected features,
# PCA retaining 50% of the variance, and a gini tree of depth 2.
mms_best = MinMaxScaler()
skb_best = SelectKBest(chi2, k=2)
pca_best = PCA(n_components=0.5)
decision3 = DecisionTreeClassifier(criterion="gini", max_depth=2)

x_train3, x_test3, y_train3, y_test3 = x_train1, x_test1, y_train1, y_test1

# Fit every stage on the training data, feeding each output to the next.
x_train3 = mms_best.fit_transform(x_train3, y_train3)
x_train3 = skb_best.fit_transform(x_train3, y_train3)
x_train3 = pca_best.fit_transform(x_train3)

# Apply the already-fitted stages to the test data, in the same order.
x_test3 = mms_best.transform(x_test3)
x_test3 = skb_best.transform(x_test3)
x_test3 = pca_best.transform(x_test3)

decision3.fit(x_train3, y_train3)
print("正确率:", decision3.score(x_test3, y_test3))
-
-
# Depth sweep: use only the first two feature columns, train on 70% of the
# samples, and record the test error for each tree depth.
x_train4, x_test4, y_train4, y_test4 = train_test_split(
    x.iloc[:, :2], y, train_size=0.7, random_state=14
)

depths = np.arange(1, 15)  # depths 1 through 14
err_list = []
for d in depths:
    # Train a gini tree limited to depth d.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=d).fit(
        x_train4, y_train4
    )

    score = clf.score(x_test4, y_test4)
    err = 1 - score  # error rate = 1 - accuracy
    err_list.append(err)
    print("%d深度,正确率%.5f" % (d, score))
-
-
-
# Plot the test error rate against the tree depth.
plt.figure(facecolor="w")
plt.plot(depths, err_list, "ro-", linewidth=3)
plt.xlabel(u"决策树深度", fontsize=16)
plt.ylabel(u"错误率", fontsize=16)
plt.grid(True)
plt.title(u"决策树层次太多导致的拟合问题(欠拟合和过拟合)", fontsize=18)

# Save the figure before showing it.
plt.savefig("决策树层次太多导致的拟合问题(欠拟合和过拟合).png")
plt.show()
-
-
#运行结果:
-
样本总数:150;特征属性数目:4
-
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
-
训练数据集样本总数:120;测试数据集样本总数:30
原始数据各个特征的调整最小值: [-1.19444444 -0.83333333 -0.18965517 -0.04166667]
原始数据各个特征的缩放数据值: [0.27777778 0.41666667 0.17241379 0.41666667]
对类别判别影响最大的三个特征属性分别是: [ True False  True  True]
[0 2 3]
Score: 0.966666666667
Classes: [0 1 2]
(100, 100)
-
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
-
最优参数列表: {'skb__k': 2, 'decision__max_depth': 2, 'pca__n_components': 0.5, 'decision__criterion': 'gini'}
score值: 0.933333333333
正确率: 1.0
-
1深度,正确率0.55556
2深度,正确率0.73333
3深度,正确率0.77778
4深度,正确率0.73333
5深度,正确率0.68889
6深度,正确率0.68889
7深度,正确率0.68889
8深度,正确率0.66667
9深度,正确率0.66667
10深度,正确率0.66667
11深度,正确率0.66667
12深度,正确率0.66667
13深度,正确率0.66667
14深度,正确率0.66667