读取数据集
我们需要的是 iris 中的 data 和 target 两个数组(类型为 numpy.ndarray,而不是 Python 的 list)
import operator
import random
import numpy as np
import sklearn.datasets as sd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
# Load the Iris dataset that ships with scikit-learn.
iris = sd.load_iris()
# iris['target_names'] holds the class names:
#   array(['setosa', 'versicolor', 'virginica'])
# iris['feature_names'] holds the column names:
#   ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

# Samples and their integer class labels.
x_data, y_data = iris['data'], iris['target']
# Human-readable names for the feature columns and for the classes.
features, labels = iris['feature_names'], iris['target_names']
切分、打乱数据集
自动切分
# Automatic split: let scikit-learn's train_test_split partition the samples
# and labels into training and test subsets. No random_state is passed here,
# so the split is presumably different on every run -- confirm against the
# train_test_split documentation if reproducibility matters.
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data)
手动切分打乱
def split_dataset(x_data, y_data, test_size=40):
    """Shuffle a dataset and split it into training and test parts.

    Samples and labels are permuted with the same random index order, so
    every row of ``x_data`` stays paired with its label. The first
    ``test_size`` shuffled samples form the test set and the remaining
    ones form the training set. The inputs are not modified.

    Args:
        x_data: Array-like of samples, one sample per row.
        y_data: Array-like of labels, same length as ``x_data``.
        test_size: Number of samples to place in the test set.

    Returns:
        A tuple ``(x_train, y_train, x_test, y_test)`` of NumPy arrays.
        Note that this order differs from scikit-learn's
        ``train_test_split``, which returns ``x_train, x_test, y_train,
        y_test``.

    Raises:
        ValueError: If ``x_data`` and ``y_data`` differ in length, or if
            ``test_size`` is negative or larger than the number of samples.
    """
    # Accept plain lists as well as arrays; fancy indexing below needs arrays.
    x_data = np.asarray(x_data)
    y_data = np.asarray(y_data)

    data_size = len(x_data)
    if len(y_data) != data_size:
        raise ValueError(
            "x_data and y_data must have the same length, got {} and {}".format(
                data_size, len(y_data)
            )
        )
    if not 0 <= test_size <= data_size:
        raise ValueError(
            "test_size must be between 0 and {}, got {}".format(data_size, test_size)
        )

    # Shuffle an index list with the stdlib generator (so random.seed() controls
    # the split) and apply the same permutation to samples and labels.
    indices = list(range(data_size))
    random.shuffle(indices)
    order = np.asarray(indices, dtype=np.intp)
    x_data = x_data[order]
    y_data = y_data[order]

    # The first `test_size` shuffled rows are the test set, the rest is training.
    x_test, x_train = x_data[:test_size], x_data[test_size:]
    y_test, y_train = y_data[:test_size], y_data[test_size:]
    return x_train, y_train, x_test, y_test
定义KNN函数
def KNN(x_test_, x_train, y_train, K):
    """Classify one sample by majority vote among its K nearest neighbours.

    Distances are Euclidean. When several labels receive the same number of
    votes, the label whose first vote came from the closest neighbour wins.
    Equidistant training samples are ranked in their training-set order.

    Args:
        x_test_: The single sample to classify (array-like of features).
        x_train: Array-like of training samples, one per row.
        y_train: Labels for ``x_train``, indexable by row position.
        K: Number of nearest neighbours that take part in the vote.

    Returns:
        The winning label, taken unchanged from ``y_train``.

    Raises:
        ValueError: If ``K`` is smaller than 1 or larger than the number of
            training samples.
    """
    x_train = np.asarray(x_train, dtype=float)
    n_train = len(x_train)
    if not 1 <= K <= n_train:
        raise ValueError(
            "K must be between 1 and {}, got {}".format(n_train, K)
        )

    # Treat every training sample as one row of features and the query as a
    # flat feature vector, so 1-D (single-feature) data is handled too.
    x_train = x_train.reshape(n_train, -1)
    query = np.asarray(x_test_, dtype=float).reshape(-1)

    # Broadcasting subtracts the query from every training row at once.
    distance = np.sqrt(((x_train - query) ** 2).sum(axis=1))

    # A stable sort keeps equidistant samples in training order, which makes
    # the selection of the K neighbours deterministic.
    nearest = np.argsort(distance, kind="stable")[:K]

    # Count votes; dict insertion order records which label was seen first
    # (i.e. from the closest neighbour), and max() returns the first maximum.
    votes = {}
    for idx in nearest:
        label_ = y_train[idx]
        votes[label_] = votes.get(label_, 0) + 1
    return max(votes, key=votes.get)
测试
if __name__ == '__main__':
    # To evaluate on the hand-written split instead of the automatic one,
    # rebind the four arrays first:
    # x_train, y_train, x_test, y_test = split_dataset(x_data, y_data, test_size=40)

    # Predict a class for every test sample with a 5-nearest-neighbour vote.
    prediction = [KNN(sample, x_train, y_train, K=5) for sample in x_test]

    # Show predictions next to the true labels, then the summary metrics.
    print(prediction)
    print(y_test)
    print(classification_report(y_test, prediction))  # per-class precision / recall / F1
    print(confusion_matrix(y_test, prediction))
结果:输出精度报告(classification_report),混淆矩阵(confusion_matrix)
precision recall f1-score support
0 1.00 1.00 1.00 12
1 0.82 0.90 0.86 10
2 0.93 0.88 0.90 16
accuracy 0.92 38
macro avg 0.92 0.92 0.92 38
weighted avg 0.92 0.92 0.92 38
[[12 0 0]
[ 0 9 1]
[ 0 2 14]]
可能遇到的警告 UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
出现这个警告的原因通常是:真实标签中的某个类别在预测结果中一次也没有出现(如下例,第一行预测结果中缺少类别 2,而第二行真实标签中包含 2),该类别 precision 的分母为 0,因此被直接置为 0.0。
# [0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1]
# [2 1 1 0 0 0 0 0 2 0 2 2 2 0 1 2 2 1 2 1 1 1 0 1 1 1 1 1 1 1 2 2 0 0 1 0 1 2 2 0]