第三章——第一个机器学习项目
一个机器学习项目的步骤:
1)导入数据;
2)概述数据;
3)数据可视化;
4)评估算法;
5)实施预测。
导入类库
# Data loading.
from pandas import read_csv
# scatter_matrix is provided by pandas.plotting; the old top-level
# `pandas.scatter_matrix` alias was deprecated in pandas 0.20 and later
# removed, so importing it from `pandas` fails on current releases.
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
# Dataset splitting and cross-validation helpers.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Metrics used to report on the final model.
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# The six candidate classifiers compared in this chapter.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
1)导入数据
# Load the iris CSV. Column labels are supplied explicitly, so read_csv
# treats every line of the file as data rather than as a header.
filename = 'iris.data.csv'
names = [
    'separ-length',
    'separ-width',
    'petal-length',
    'petal-width',
    'class',
]
dataset = read_csv(filename, names=names)
2)概述数据
数据的维度、查看数据自身、统计描述所有的数据特征、数据分类的分布情况。
# Dimensions of the data: (rows, columns).
row_count, column_count = dataset.shape
print('数据维度:行%s,列 %s' % (row_count, column_count))

# Peek at the first 10 rows.
first_rows = dataset.head(10)
print(first_rows)

# Summary statistics of the numeric columns (count, mean, std, min,
# quartiles, max).
summary = dataset.describe()
print(summary)

# Number of rows in each class.
class_sizes = dataset.groupby('class').size()
print(class_sizes)
3)数据可视化
# Box-and-whisker plots: one subplot per column on a 2x2 grid, with
# independent axes so each feature keeps its own scale.
dataset.plot(
    kind='box',
    subplots=True,
    layout=(2, 2),
    sharex=False,
    sharey=False,
)
pyplot.show()

# Histograms of each numeric column.
dataset.hist()
pyplot.show()

# Scatter-plot matrix of every pair of numeric columns.
scatter_matrix(dataset)
pyplot.show()
4)评估算法
1)分离出评估数据集
2)采用10折交叉验证来评估算法模型
3)生成6个不同的模型来预测新数据
4)选择最优模型。
分离出评估数据
# Split the data set: 80% of the rows for training, 20% held out for
# validation.
array = dataset.values
X = array[:, :4]  # first four columns: the measurements
Y = array[:, 4]   # fifth column: the class label
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X,
    Y,
    test_size=validation_size,
    random_state=seed,
)
创建模型、10折交叉验证
# Candidate algorithms to spot-check, keyed by a short display name.
# Insertion order is kept so the results are reported in this order.
models = {
    'LR': LogisticRegression(),
    'LDA': LinearDiscriminantAnalysis(),
    'KNN': KNeighborsClassifier(),
    'CART': DecisionTreeClassifier(),
    'NB': GaussianNB(),
    'SVM': SVC(),
}
# Evaluate every candidate model with 10-fold cross-validation on the
# training split.
#
# shuffle=True is required for random_state to have any effect;
# scikit-learn >= 0.24 raises ValueError when random_state is set while
# shuffle is left at its default of False. With a fixed integer seed the
# splitter yields the same folds on every call, so it is built once and
# shared by all models.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = []
for key, model in models.items():
    cv_results = cross_val_score(
        model, X_train, Y_train, cv=kfold, scoring='accuracy'
    )
    results.append(cv_results)
    # Output format: name: mean accuracy (standard deviation over the folds)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))
选择最优模型
比较上面各模型10折交叉验证输出的平均准确度和标准差,选取平均准确度最高的模型;下文选用支持向量机(SVM)进行预测。
5)实施预测
选取最优的模型进行预测
# Fit an SVM on the training split, then score it on the held-out
# validation split.
svm = SVC()
svm.fit(X=X_train, y=Y_train)
predictions = svm.predict(X_validation)

# Overall accuracy.
accuracy = accuracy_score(Y_validation, predictions)
print(accuracy)

# Confusion matrix of true versus predicted classes.
matrix = confusion_matrix(Y_validation, predictions)
print(matrix)

# Per-class text report.
report = classification_report(Y_validation, predictions)
print(report)
完整的python代码:
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 4 09:42:41 2018
@author: np

Spot-check six classifiers on the iris data set with 10-fold
cross-validation, compare them in a box plot, then evaluate an SVM on a
held-out validation split.
"""
from pandas import read_csv
# scatter_matrix is provided by pandas.plotting; the old top-level
# `pandas.scatter_matrix` alias was deprecated in pandas 0.20 and later
# removed, so importing it from `pandas` fails on current releases.
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Load the data; column labels are supplied explicitly, so every line of
# the file is read as data.
filename = 'iris.data.csv'
names = ['separ-length', 'separ-width', 'petal-length', 'petal-width', 'class']
dataset = read_csv(filename, names=names)

# Split the data set: 80% for training, 20% held out for validation.
array = dataset.values
X = array[:, 0:4]  # first four columns: the measurements
Y = array[:, 4]    # fifth column: the class label
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)

# Candidate algorithms to spot-check, keyed by a short display name.
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['NB'] = GaussianNB()
models['SVM'] = SVC()

# Evaluate every candidate with 10-fold cross-validation on the training
# split.
#
# shuffle=True is required for random_state to have any effect;
# scikit-learn >= 0.24 raises ValueError when random_state is set while
# shuffle is left at its default of False. With a fixed integer seed the
# splitter yields the same folds on every call, so it is built once and
# shared by all models.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = []
for key in models:
    cv_results = cross_val_score(
        models[key], X_train, Y_train, cv=kfold, scoring='accuracy'
    )
    results.append(cv_results)
    # Output format: name: mean accuracy (standard deviation over the folds)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))

# Box plot comparing the per-fold accuracies of the algorithms.
fig = pyplot.figure()
fig.suptitle("Algorithm Comparision")
ax = fig.add_subplot(111)
pyplot.boxplot(results)
# One box per model, in the insertion order of `models`.
ax.set_xticklabels(models.keys())
pyplot.show()

# Fit an SVM on the training split and score it on the held-out
# validation split.
svm = SVC()
svm.fit(X=X_train, y=Y_train)
predictions = svm.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))