机器学习(周志华) 西瓜书 第三章课后习题3.4—— Python实现
个人原创,禁止转载——Zetrue_Li
数据获取
UCI数据官网:http://archive.ics.uci.edu/ml/index.php
选取最受欢迎的 Iris 数据集,下载链接:http://archive.ics.uci.edu/ml/machine-learning-databases/iris/
Python代码
本题调用了3.3题的程序,传送门:https://blog.csdn.net/weixin_37922777/article/details/88625728
# -*- coding: utf-8 -*-
# 调用3.3题的python实现程序
import Chap3_3
import numpy as np
import pandas as pd
def loadData(filename):
    """Read the Iris CSV data and add a constant column ``b``.

    Args:
        filename: Path (or file-like object) handed to ``pandas.read_csv``.
            The data is read without a header row; column names are
            supplied here.

    Returns:
        A DataFrame with the four measurement columns, the ``class`` label
        column and a trailing column ``b`` that is 1 in every row
        (presumably the intercept term for the linear model).
    """
    columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
    frame = pd.read_csv(filename, names=columns)
    # assign() appends the new column after the existing ones.
    return frame.assign(b=1)
def processData(dataSet, n=10):
    """Split the data set into ``n`` folds, stratified by Iris class.

    Each fold receives a contiguous slice of every class, so the class
    proportions of the folds follow those of ``dataSet``. When a class size
    is not a multiple of ``n``, the leftover rows go one each to the first
    folds instead of being dropped; for evenly divisible sizes every fold
    has the same number of rows per class.

    Args:
        dataSet: DataFrame with a ``class`` column holding the Iris labels.
        n: Number of folds.

    Returns:
        A list of ``n`` DataFrames. Rows keep their original index labels
        and appear class by class within each fold.

    Raises:
        ZeroDivisionError: If ``n`` is 0.
    """
    values = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
    per_class = [dataSet.loc[dataSet['class'] == value] for value in values]
    # (base, extra): every fold gets `base` rows of the class and the first
    # `extra` folds get one more, so no row is left out.
    splits = [divmod(rows.shape[0], n) for rows in per_class]

    folds = []
    for a in range(n):
        parts = []
        for rows, (base, extra) in zip(per_class, splits):
            begin = a * base + min(a, extra)
            size = base + (1 if a < extra else 0)
            parts.append(rows.iloc[begin:begin + size])
        # DataFrame.append was removed in pandas 2.0; concat is the
        # supported way to stack the per-class slices.
        folds.append(pd.concat(parts))
    return folds
def judge_function(D_train, D_test):
    """Fit the Chap3_3 model on ``D_train`` and count correct predictions on ``D_test``.

    The three-class problem is reduced to a binary one: ``Iris-setosa`` is
    labelled 1 and every other class 0.

    Args:
        D_train: DataFrame with the four measurement columns, ``b`` and
            ``class``; used for fitting.
        D_test: DataFrame with the same columns; used for scoring.

    Returns:
        The number (not the fraction) of rows in ``D_test`` whose predicted
        label equals the true label.
    """
    features = ['sepal length', 'sepal width', 'petal length', 'petal width', 'b']
    positive = 'Iris-setosa'

    def to_labels(frame):
        # Build the 0/1 labels by comparison instead of
        # replace(strings -> ints), which relies on pandas silently
        # downcasting an object column (deprecated in pandas 2.2).
        # The (m, 1) shape of the original label array is kept.
        return np.array((frame[['class']] == positive).astype('int64'))

    x_train = np.array(D_train[features])
    y_train = to_labels(D_train)
    # Chap3_3.run is defined in the exercise 3.3 module (not visible here);
    # presumably it returns the fitted parameter vector.
    beta = Chap3_3.run(x_train, y_train)

    x_test = np.array(D_test[features])
    y_test = to_labels(D_test)

    correct = 0
    for xi, yi in zip(x_test, y_test):
        p1 = Chap3_3.p1_function(xi, beta)
        # Predict class 1 when p1 reaches the 0.5 threshold.
        judge = 0 if p1 < 0.5 else 1
        correct += int(judge == yi[0])
    return correct
def cross_validation(dataSet):
    """Estimate the error rate with 10-fold cross-validation.

    The data is split into 10 folds by ``processData``. Each fold is used
    once as the test set while the other nine are concatenated into the
    training set.

    Args:
        dataSet: DataFrame as produced by ``loadData``.

    Returns:
        The error rate as a float in [0, 1]: one minus the share of
        correctly classified rows over all rows that were tested.
    """
    n = 10
    D = processData(dataSet, n)

    correct = 0
    tested = 0
    for a in range(n):
        D_test = D[a]
        # DataFrame.append was removed in pandas 2.0; build the training
        # set from all other folds with a single concat.
        D_train = pd.concat([D[b] for b in range(n) if b != a])
        correct += judge_function(D_train, D_test)
        tested += D_test.shape[0]

    # Divide by the rows actually tested rather than dataSet.shape[0], so
    # rows that did not land in any fold are not counted as errors.
    return 1 - correct / tested
def leave_one_out(dataSet):
    """Estimate the error rate with leave-one-out validation.

    Every row is used once as a single-row test set while all remaining
    rows form the training set.

    Args:
        dataSet: DataFrame as produced by ``loadData``.

    Returns:
        The error rate as a float in [0, 1]: one minus the share of rows
        that were classified correctly.
    """
    k = dataSet.shape[0]
    correct = 0
    for a in range(k):
        # DataFrame.append was removed in pandas 2.0; concat the rows
        # before and after position `a` instead. iloc keeps the slicing
        # positional whatever the index labels are.
        D_train = pd.concat([dataSet.iloc[:a], dataSet.iloc[a + 1:]])
        D_test = dataSet.iloc[a:a + 1]
        correct += judge_function(D_train, D_test)
    return 1 - correct / k
if __name__ == "__main__":
    # Load the Iris data from its location relative to the working directory.
    iris = loadData('UCI/iris/iris.data')

    # Run 10-fold cross-validation first, then leave-one-out, and print
    # each error rate as a percentage.
    for label, validate in (('Cross validation:', cross_validation),
                            ('Leave one out:', leave_one_out)):
        print(label, validate(iris) * 100, '%')