KNN 分类红酒数据集
文章目录
概述
使用 KNN(K 近邻)算法,对红酒(wine)数据集进行分类
数据集划分、准确度计算等辅助步骤借助 Scikit-Learn 库完成,核心算法使用 NumPy 实现,可供参考学习。
代码
import
导入必要的组件,包括 numpy、pandas、sklearn 等库
from collections import Counter
from typing import Optional
import numpy as np
from pandas import DataFrame
from sklearn.datasets import load_wine # 加载红酒数据集
from sklearn.metrics import accuracy_score # 计算准确度
from sklearn.metrics.pairwise import euclidean_distances # 计算欧氏距离
from sklearn.model_selection import train_test_split # 划分数据集
from sklearn.preprocessing import MinMaxScaler
数据预处理
- 数据集划分
- 归一化(极大提升准确率)
# Load the wine dataset bundled with scikit-learn as a (features, labels) pair.
wine = load_wine(return_X_y=True, as_frame=True)
data: DataFrame = wine[0]
# With as_frame=True the labels are a pandas Series (not a DataFrame),
# so no DataFrame annotation is applied here.
target = wine[1]
# Split into training and test sets (roughly 2:1).
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.33)
# Min-max normalise every feature.
# The scaler is fitted on the training set ONLY and then re-used for the test
# set: fitting it a second time on the test data would scale the two sets with
# different min/max values and leak test-set statistics into the evaluation.
# Test values may therefore fall slightly outside [0, 1]; that is expected.
scalar = MinMaxScaler()
X_train = scalar.fit_transform(X_train)
X_test = scalar.transform(X_test)
# Convert the label Series to plain NumPy arrays for positional indexing.
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()
定义模型
class KNN:
    """Brute-force k-nearest-neighbours classifier implemented with NumPy.

    ``fit`` only memorises the training set; ``predict`` labels one sample by
    majority vote among its ``k`` closest training samples, using Euclidean
    distance.
    """

    def __init__(self, k: int):
        """Create a classifier that votes among the ``k`` nearest neighbours.

        Args:
            k: number of neighbours taking part in each vote.

        Raises:
            ValueError: if ``k`` is smaller than 1 (an empty neighbourhood
                cannot produce a vote).
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.k: int = k
        # Training samples and labels; both stay None until fit() is called.
        self.data: Optional[np.ndarray] = None
        self.target: Optional[np.ndarray] = None

    def fit(self, data_: np.ndarray, target_: np.ndarray) -> None:
        """Store the training samples and their labels.

        Args:
            data_: training samples, one per entry along the first axis.
            target_: labels, one per training sample.

        Raises:
            ValueError: if there are no samples, or if the number of labels
                differs from the number of samples.
        """
        samples = np.asarray(data_, dtype=float)
        labels = np.asarray(target_)
        if samples.ndim == 0 or samples.shape[0] == 0:
            raise ValueError("training data must contain at least one sample")
        if labels.shape[:1] != samples.shape[:1]:
            raise ValueError(
                "data_ and target_ must contain the same number of samples"
            )
        # Keep one flat feature row per sample so predict() can broadcast.
        self.data = samples.reshape(samples.shape[0], -1)
        self.target = labels

    def predict(self, data_: np.ndarray) -> int:
        """Return the majority label among the ``k`` nearest training samples.

        Args:
            data_: a single sample; it is flattened before comparison.

        Returns:
            The most frequent label among the nearest neighbours. On a tie the
            label met first when walking the neighbours from nearest to
            farthest wins.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if the sample's feature count differs from the
                training data's.
        """
        if self.data is None or self.target is None:
            raise RuntimeError("KNN.predict() called before KNN.fit()")
        sample = np.asarray(data_, dtype=float).ravel()
        if sample.size != self.data.shape[1]:
            raise ValueError(
                f"expected {self.data.shape[1]} features, got {sample.size}"
            )
        # Distance from the sample to every training row in one vectorised
        # pass instead of one library call per row.
        distances = np.linalg.norm(self.data - sample, axis=1)
        # Indices of the k smallest distances; a stable sort keeps equally
        # distant neighbours in training-set order, so results are repeatable.
        nearest = np.argsort(distances, kind="stable")[: self.k]
        votes = Counter(self.target[index] for index in nearest)
        return votes.most_common(1)[0][0]

    @staticmethod
    def euclidean(x1: np.ndarray, x2: np.ndarray) -> float:
        """Return the Euclidean distance between two samples.

        Both inputs are flattened first. ``predict`` uses the vectorised
        equivalent of this computation; this helper remains available for
        single-pair use.

        Raises:
            ValueError: if the two samples do not have the same number of
                elements.
        """
        first = np.asarray(x1, dtype=float).ravel()
        second = np.asarray(x2, dtype=float).ravel()
        if first.size != second.size:
            raise ValueError(
                f"incompatible sizes: {first.size} and {second.size}"
            )
        return np.linalg.norm(first - second)
创建 KNN 模型实例
# Number of neighbours that take part in each vote.
K: int = 4
# Build the classifier; it holds no training data until fit() is called.
knn: KNN = KNN(K)
训练
# fit() stores the training samples and their labels on the model so that
# predict() can compare new samples against them.
knn.fit(data_=X_train, target_=y_train)
测试
使用 Accuracy 作为精度评判标准,已达到的最高正确率:0.9830508474576272。
# Classify the test samples one at a time: predict() takes a single sample.
predicts: list = [knn.predict(sample) for sample in X_test]
# Ground-truth labels as a plain list, aligned index-for-index with `predicts`.
targets: list = list(y_test)
# accuracy_score expects (y_true, y_pred), so the true labels go first.
# Left as a bare expression: a notebook shows it as the cell output, while a
# plain script discards the value.
accuracy_score(targets, predicts)