# KNN算法主要有两个步骤：计算样本之间的距离，以及找邻居来判定自己的类别。计算距离之前先将特征进行标准化（min-max 归一化）处理。
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn
import numpy as np
import pandas as pd
# ---------------------------------------------------------------------------
# K-nearest-neighbours classification of the iris data set.
#
# Pipeline: min-max scale the features, split into train/test sets, compute
# the Euclidean distance between every (train, test) pair, pick the k closest
# training samples for each test sample and take a majority vote.
# ---------------------------------------------------------------------------


def min_max_scale(features):
    """Rescale every column of *features* to the range [0, 1].

    Args:
        features: 2-D array-like or DataFrame, one row per sample.

    Returns:
        A DataFrame of the same shape holding ``(x - min) / (max - min)``
        computed column by column. A constant column (max == min) is mapped
        to 0 instead of producing NaN from a 0/0 division.
    """
    frame = pd.DataFrame(features)
    low = frame.min()
    span = frame.max() - low
    # Avoid 0/0 for constant columns: dividing (x - min) == 0 by 1 yields 0.
    span = span.where(span != 0, 1.0)
    return (frame - low) / span


def pairwise_euclidean(train, test):
    """Return the matrix of Euclidean distances between two sample sets.

    Args:
        train: 2-D array-like of shape (n_train, n_features).
        test: 2-D array-like of shape (n_test, n_features).

    Returns:
        ndarray of shape (n_train, n_test) where entry ``[i, j]`` is the
        distance between training sample ``i`` and test sample ``j``.
    """
    train_arr = np.atleast_2d(np.asarray(train, dtype=float))
    test_arr = np.atleast_2d(np.asarray(test, dtype=float))
    # Broadcasting builds every (train row, test row) difference at once.
    diff = train_arr[:, np.newaxis, :] - test_arr[np.newaxis, :, :]
    return np.sqrt((diff ** 2).sum(axis=2))


def nearest_neighbors(dist, k):
    """Return, for every test sample, the indices of its k closest train rows.

    Args:
        dist: 2-D array-like of shape (n_train, n_test), as produced by
            ``pairwise_euclidean``.
        k: number of neighbours to keep; must be at least 1. If k exceeds
            the number of training rows, all rows are returned.

    Returns:
        A list with one entry per test sample; each entry is a list of at
        most k training-row indices ordered from nearest to farthest.

    Raises:
        ValueError: if k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    dist_arr = np.asarray(dist)
    # Selecting by position (not by distance value) guarantees exactly k
    # neighbours even when several training samples are equally far away;
    # the stable sort makes ties resolve to the lowest training index.
    order = np.argsort(dist_arr, axis=0, kind="stable")
    return order[:k].T.tolist()


def majority_vote(labels):
    """Return the most frequent label; ties go to the smallest label."""
    values, counts = np.unique(np.asarray(labels), return_counts=True)
    return values[np.argmax(counts)]


if __name__ == "__main__":
    # Load the iris data set bundled with scikit-learn.
    dataset = load_iris()
    data = dataset['data']
    target = dataset['target']

    # Convert the features to a DataFrame and min-max scale each column.
    # NOTE(review): scaling happens before the split, so the test rows
    # contribute to the min/max used for the training rows.
    data = pd.DataFrame(data)
    data = min_max_scale(data)

    # Train/test split; test_size is the fraction of samples held out.
    traindata, testdata, traintarget, testtarget = train_test_split(
        data, target, test_size=0.2
    )

    # --- KNN step 1: Euclidean distance, dist[i, j] = train i vs. test j ---
    dist = pairwise_euclidean(traindata, testdata)

    # --- KNN step 2: find the neighbours and decide each sample's class ---
    k = 6  # number of neighbours
    # Despite its name, max_index holds the indices of the k *nearest*
    # training samples for each test sample.
    max_index = nearest_neighbors(dist, k)

    # Classify by majority vote among the neighbours' labels.
    train_labels = np.asarray(traintarget)
    pre = [majority_vote(train_labels[idx]) for idx in max_index]

    # Show, per test sample, whether the prediction matches the true label.
    print(pre == testtarget)