最基础的分类算法-KNN

1.K近邻算法基础

import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter

"""输入原始数据"""
# Ten hand-written 2-D training samples, one [feature_0, feature_1] pair per row.
raw_data_X = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.36],
    [3.52, 4.67],
    [2.28, 2.86],
    [7.42, 4.68],
    [5.74, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
# Class label (0 or 1) of each row of raw_data_X, in the same order.
raw_data_y = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]

# Convert the plain lists to NumPy arrays so they support mask indexing
# and element-wise arithmetic.
X_train, y_train = np.array(raw_data_X), np.array(raw_data_y)
# The single query point whose class is to be predicted.
x = np.array([8.09, 3.36])

"""数据可视化"""
# Transposing the masked rows yields (feature_0 values, feature_1 values),
# which unpack directly into scatter's x and y arguments.
# Class 0 samples are drawn in red, class 1 samples in green.
plt.scatter(*X_train[y_train == 0].T, color='r')
plt.scatter(*X_train[y_train == 1].T, color='g')
# The query point is drawn in blue.
plt.scatter(*x, color='b')
plt.show()

"""计算距离，并对生成的距离数组索引排序保存在nearest中"""
# Euclidean distance from the query point x to every training sample.
distances = [sqrt(((sample - x) ** 2).sum()) for sample in X_train]
print(distances)
# Indices of the training samples ordered from nearest to farthest.
nearest = np.argsort(distances)
print(nearest)
k = 6
# Labels of the k nearest training samples.
topk_y = list(y_train[nearest[:k]])
print(topk_y)
# Counter behaves like a dict mapping each label to its number of
# occurrences in topk_y.
votes = Counter(topk_y)
print(votes)
# The prediction is the label that received the most votes.
predict_y = votes.most_common(1)[0][0]
print(predict_y)

返回结果：

E:\pythonspace\numpyjichu\venv\Scripts\python.exe E:/pythonspace/numpyjichu/KNN.py
[4.811538215581374, 5.224633958470201, 6.75, 4.75405090422894, 5.831474942070831, 1.480304022827743, 2.356140912594151, 1.3743725841270265, 0.30594117081556693, 2.574975728040946]
[8 7 5 6 9 3 0 1 4 2]
[1, 0, 1, 1, 1, 0]
Counter({1: 4, 0: 2})
1

Process finished with exit code 0

使用scikit-learn中的knn

"""使用sklearn中的knn"""
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter

"""输入原始数据"""
# Ten hand-written 2-D training samples, one [feature_0, feature_1] pair per row.
raw_data_X = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.36],
    [3.52, 4.67],
    [2.28, 2.86],
    [7.42, 4.68],
    [5.74, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
# Class label (0 or 1) of each row of raw_data_X, in the same order.
raw_data_y = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]

# Convert the plain lists to NumPy arrays.
X_train = np.array(raw_data_X)
y_train = np.array(raw_data_y)
x = np.array([8.09, 3.36])
# Reshape the single query point into a 1-row matrix before predicting.
X_predict = x.reshape(1, -1)

# Workflow: instantiate -> fit -> predict.
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
kNN_classifier.fit(X_train, y_train)
# Predict once and keep the result; the original called predict() twice
# and discarded the first result.
y_predict = kNN_classifier.predict(X_predict)
print(y_predict)

导入>>>实例化>>>fit()拟合>>>预测

自己的kNN代码

import numpy as np
from math import sqrt
from collections import Counter
from metrics import accuracy_score

class KNNClassifier:
    """Brute-force k-nearest-neighbours classifier using Euclidean distance."""

    def __init__(self, k):
        """Create an unfitted classifier that votes among the k nearest samples."""
        assert k >= 1, "k must be valid"
        self.k = k
        # Training data is only stored by fit(); None marks "not fitted yet".
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training samples X_train and labels y_train; return self."""
        n_samples = X_train.shape[0]
        assert n_samples == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= n_samples, \
            "the size of X_train must be at least k."

        self._X_train, self._y_train = X_train, y_train
        return self

    def predict(self, X_predict):
        """Return an array holding the predicted label of each row of X_predict."""
        is_fitted = self._X_train is not None and self._y_train is not None
        assert is_fitted, \
                "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
                "the feature number of X_predict must be equal to X_train"

        return np.array(list(map(self._predict, X_predict)))

    def _predict(self, x):
        """Return the predicted label for the single sample x."""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"

        # Euclidean distance from x to every stored training sample.
        distances = []
        for sample in self._X_train:
            squared_gap = (sample - x) ** 2
            distances.append(sqrt(np.sum(squared_gap)))

        # Training-sample indices ordered from nearest to farthest.
        order = np.argsort(distances)
        neighbour_labels = [self._y_train[idx] for idx in order[:self.k]]

        # Majority vote among the k nearest labels.
        tally = Counter(neighbour_labels)
        top_label, _ = tally.most_common(1)[0]
        return top_label

    def score(self, X_test, y_test):
        """Return accuracy_score of the predictions for X_test against y_test."""
        return accuracy_score(y_test, self.predict(X_test))

    def __repr__(self):
        return "KNN(k=%d)" % self.k

model_selection：（将数据集拆分为训练集和测试集，自己的代码）

import numpy as np


def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split X and y into training and test subsets.

    Args:
        X: array of samples, indexed along its first axis.
        y: array of labels with the same first-axis length as X.
        test_ratio: fraction of samples, in [0.0, 1.0], assigned to the test
            subset; the test size is int(len(X) * test_ratio), i.e. truncated.
        seed: if not None, passed to np.random.seed() before shuffling so the
            split is reproducible. Any integer, including 0, is honoured.

    Returns:
        The tuple (X_train, X_test, y_train, y_test).

    Raises:
        AssertionError: if X and y differ in length or test_ratio is outside
            [0.0, 1.0].
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None explicitly: a plain truthiness test would treat
    # seed=0 as "no seed" and silently produce a non-reproducible split.
    if seed is not None:
        np.random.seed(seed)

    shuffled_indexes = np.random.permutation(len(X))

    # The first test_size shuffled indexes form the test subset, the rest
    # form the training subset.
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test

分别调用自己的方法和sklearn中的train_test_split

扫描二维码关注公众号，回复： 2628776 查看本文章

"""使用自己的算法"""
from sklearn import datasets
from model_selection import train_test_split
import numpy as np
from scipy import sparse


# Load the iris dataset: X is its feature data, y its target labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split with the hand-written helper, leaving its optional arguments at
# their defaults.
x_train, x_test, y_train, y_test = train_test_split(X, y)

# Show the shapes of the training features and the training labels.
print(x_train.shape, y_train.shape, sep="\n")

"""使用sklearn中的train_test_split"""
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Load the iris dataset: X is its feature data, y its target labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split with scikit-learn's train_test_split using its default settings.
x_train, x_test, y_train, y_test = train_test_split(X, y)
# Show the shapes of the training features, training labels and test labels.
print(x_train.shape, y_train.shape, y_test.shape, sep="\n")

准确率：

"""使用sklearn中的accuracy_score算法"""
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the iris dataset and split it with a fixed random_state.
iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666)


# Fit a 3-nearest-neighbours classifier on the training part, then predict
# the labels of the test part.
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train, y_train)
y_predict = knn_clf.predict(x_test)

# Compare the predictions with the true test labels and print the score.
acc = accuracy_score(y_test, y_predict)
print(acc)


"""使用自己封装的算法计算准确率"""
from model_selection import train_test_split
from metrics import accuracy_score
from knn import KNNClassifier
from sklearn import datasets
# Load the iris dataset and split it with the hand-written helper, using a
# fixed seed.
iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_ratio=0.2, seed=666)

# Fit the hand-written kNN classifier (k=3) on the training part, then
# predict the labels of the test part.
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(x_train, y_train)
y_predict = my_knn_clf.predict(x_test)

# Compare the predictions with the true test labels and print the score.
acc = accuracy_score(y_test, y_predict)
print(acc)

超参数:

寻找好的参数：领域知识，经验，实验探索

超参数:

寻找最好的k:

考虑距离？？:

距离：

网格搜索中的Grid Search:

# Build a grid search over param_grid for the classifier knn_clf.
# n_jobs=-1 uses all available CPU cores; verbose=2 sets the logging level.
# The original line did not parse: it had a full-width comma after
# param_grid and a '.' instead of ',' between n_jobs=-1 and verbose=2.
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)

第一个参数表示对哪个分类器进行网格搜索，第二个参数表示要搜索的参数网格；n_jobs 表示并行计算使用的 CPU 核数，-1 表示使用全部核；verbose 控制搜索过程中输出信息的详细程度，值越大，输出的信息越详细。

数据归一化

解决方案：将所有数据映射到同一尺度

最值归一化：把所有数据映射到0-1之间。

公式：$x_{scale} = \dfrac{x - x_{min}}{x_{max} - x_{min}}$

适用于分布有明显边界的情况；缺点是容易受边界值（极端值）的影响。

均值方差归一化

把所有数据归一到均值为0方差为1的分布中

公式：$x_{scale} = \dfrac{x - x_{mean}}{S}$，其中 $x_{mean}$ 为均值，$S$ 为标准差。

适用于数据分布没有明显边界、可能存在极端数据值的情况。

对测试数据集归一化

最基础的分类算法-KNN

猜你喜欢