1.K近邻算法基础
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter
"""输入原始数据"""
# Raw training samples: each row holds the two feature values of one sample.
raw_data_X = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.36],
    [3.52, 4.67],
    [2.28, 2.86],
    [7.42, 4.68],
    [5.74, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
# Class label (0 or 1) of the sample stored at the same index in raw_data_X.
raw_data_y = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]

# Convert the plain Python lists into NumPy arrays.
X_train, y_train = np.array(raw_data_X), np.array(raw_data_y)

# The single query point whose class is to be predicted.
x = np.array([8.09, 3.36])
"""数据可视化"""
# Scatter plot of the training samples: class 0 in red, class 1 in green.
class0_points = X_train[y_train == 0]
class1_points = X_train[y_train == 1]
plt.scatter(class0_points[:, 0], class0_points[:, 1], color='r')
plt.scatter(class1_points[:, 0], class1_points[:, 1], color='g')
# The query point is drawn in blue.
plt.scatter(x[0], x[1], color='b')
plt.show()
"""计算距离,并对生成的距离数组索引排序保存在nearest中"""
# Euclidean distance from the query point x to every training sample.
distances = [sqrt(((sample - x) ** 2).sum()) for sample in X_train]
print(distances)

# Indices of the training samples, ordered from nearest to farthest.
nearest = np.argsort(distances)
print(nearest)

# Labels of the k nearest training samples.
k = 6
topk_y = list(y_train[nearest[:k]])
print(topk_y)

# Counter behaves like a dict mapping each label to its number of
# occurrences in topk_y.
votes = Counter(topk_y)
print(votes)

# The prediction is the label with the most votes.
predict_y = votes.most_common(1)[0][0]
print(predict_y)
返回结果:
E:\pythonspace\numpyjichu\venv\Scripts\python.exe E:/pythonspace/numpyjichu/KNN.py
[4.811538215581374, 5.224633958470201, 6.75, 4.75405090422894, 5.831474942070831, 1.480304022827743, 2.356140912594151, 1.3743725841270265, 0.30594117081556693, 2.574975728040946]
[8 7 5 6 9 3 0 1 4 2]
[1, 0, 1, 1, 1, 0]
Counter({1: 4, 0: 2})
1
Process finished with exit code 0
使用scikit-learn中的knn
"""使用sklearn中的knn"""
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter
"""输入原始数据"""
# Raw training samples: each row holds the two feature values of one sample.
raw_data_X = [
    [3.39, 2.33],
    [3.11, 1.78],
    [1.34, 3.36],
    [3.52, 4.67],
    [2.28, 2.86],
    [7.42, 4.68],
    [5.74, 3.53],
    [9.17, 2.51],
    [7.79, 3.42],
    [7.93, 0.79],
]
# Class label (0 or 1) of the sample stored at the same index in raw_data_X.
raw_data_y = [0, 0, 1, 0, 0, 1, 1, 0, 1, 1]

# Convert the plain Python lists into NumPy arrays.
X_train, y_train = np.array(raw_data_X), np.array(raw_data_y)

# The single query point, then the same point reshaped into a one-row
# 2-D array (1 sample x 2 features).
x = np.array([8.09, 3.36])
X_predict = x.reshape(1, -1)
# Workflow: instantiate the classifier, fit it on the training data, predict.
kNN_classifier = KNeighborsClassifier(n_neighbors=6)
kNN_classifier.fit(X_train, y_train)

# predict() is called once and its result kept; the original called it a
# second time and discarded the first result, which was wasted work.
y_predict = kNN_classifier.predict(X_predict)
print(y_predict)
导入>>>实例化>>>fit()拟合>>>预测
自己的kNN代码
import numpy as np
from math import sqrt
from collections import Counter
from metrics import accuracy_score
class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k):
        """Initialise the classifier.

        Args:
            k: number of nearest neighbours that take part in the vote.

        Raises:
            AssertionError: if ``k`` is smaller than 1.
        """
        assert k >= 1, "k must be valid"
        self.k = k
        # Both stay None until fit() has been called.
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Store the training set used by later predictions.

        Args:
            X_train: array whose first axis indexes samples and whose
                second axis indexes features.
            y_train: array of labels, one per row of ``X_train``.

        Returns:
            The classifier itself, so calls can be chained.

        Raises:
            AssertionError: if the two arrays differ in their first
                dimension, or if there are fewer than ``k`` samples.
        """
        assert X_train.shape[0] == y_train.shape[0], \
            "the size of X_train must be equal to the size of y_train"
        assert self.k <= X_train.shape[0], \
            "the size of X_train must be at least k."

        # The arrays are kept by reference, not copied.
        self._X_train = X_train
        self._y_train = y_train
        return self

    def predict(self, X_predict):
        """Predict a label for every row of ``X_predict``.

        Args:
            X_predict: 2-D array with the same number of columns as the
                training data.

        Returns:
            A NumPy array with one predicted label per row.

        Raises:
            AssertionError: if fit() has not been called, or if the
                feature count does not match the training data.
        """
        assert self._X_train is not None and self._y_train is not None, \
            "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], \
            "the feature number of X_predict must be equal to X_train"

        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)

    def _predict(self, x):
        """Predict the label of a single sample ``x`` (1-D feature vector)."""
        assert x.shape[0] == self._X_train.shape[1], \
            "the feature number of x must be equal to X_train"

        # Euclidean distance to every training row in one vectorised
        # expression instead of a Python-level loop.
        distances = np.sqrt(np.sum((self._X_train - x) ** 2, axis=1))
        nearest = np.argsort(distances)

        # Majority vote among the labels of the k nearest samples. On a tie
        # Counter.most_common returns the label encountered first, i.e. the
        # one whose first occurrence is closest to x.
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

    def score(self, X_test, y_test):
        """Return the accuracy of the model on the given test set.

        Predicts labels for ``X_test`` and passes them with ``y_test`` to
        ``accuracy_score`` (imported from the project's ``metrics`` module).
        """
        y_predict = self.predict(X_test)
        return accuracy_score(y_test, y_predict)

    def __repr__(self):
        return "KNN(k=%d)" % self.k
model_selection:(训练集进行分类,自己的代码)
import numpy as np
def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Randomly split ``X`` and ``y`` into training and test subsets.

    Args:
        X: array of samples, indexed along its first axis.
        y: array of labels with the same first dimension as ``X``.
        test_ratio: fraction of the samples placed in the test subset;
            the test size is ``int(len(X) * test_ratio)``.
        seed: if not ``None``, passed to ``np.random.seed`` before
            shuffling so the split is reproducible.

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        AssertionError: if ``X`` and ``y`` differ in their first
            dimension, or if ``test_ratio`` is outside ``[0, 1]``.
    """
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"

    # Compare against None rather than testing truthiness, so that seed=0
    # is honoured instead of being silently ignored.
    if seed is not None:
        np.random.seed(seed)

    # Shuffle the row indices, then cut them into a test part and a
    # training part.
    shuffled_indexes = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    test_indexes = shuffled_indexes[:test_size]
    train_indexes = shuffled_indexes[test_size:]

    X_train = X[train_indexes]
    y_train = y[train_indexes]

    X_test = X[test_indexes]
    y_test = y[test_indexes]

    return X_train, X_test, y_train, y_test
分别调用自己的方法和sklearn中的train_test_split
扫描二维码关注公众号,回复:
2628776 查看本文章
"""使用自己的算法"""
from sklearn import datasets
from model_selection import train_test_split
import numpy as np
from scipy import sparse
# Load the iris dataset and take its feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split using the default arguments of train_test_split.
x_train, x_test, y_train, y_test = train_test_split(X, y)

# Show the shapes of the training features and the training labels.
print(x_train.shape, y_train.shape, sep="\n")
"""使用sklearn中的train_test_split"""
from sklearn.model_selection import train_test_split
from sklearn import datasets
# Load the iris dataset and take its feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Split using the default arguments of train_test_split.
x_train, x_test, y_train, y_test = train_test_split(X, y)

# Show the shapes of the training features, training labels and test labels.
print(x_train.shape, y_train.shape, y_test.shape, sep="\n")
准确率:
"""使用sklearn中的accuracy_score算法"""
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# Load the iris dataset and take its feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666
)

# Fit a 3-nearest-neighbours classifier on the training part.
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(x_train, y_train)

# Accuracy: compare the predicted test labels with the true ones.
y_predict = knn_clf.predict(x_test)
acc = accuracy_score(y_test, y_predict)
print(acc)
"""使用自己封装的算法计算准确率"""
from model_selection import train_test_split
from metrics import accuracy_score
from knn import KNNClassifier
from sklearn import datasets
# Load the iris dataset and take its feature matrix and label vector.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for testing, seeding the shuffle with 666.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_ratio=0.2, seed=666
)

# Fit the hand-written 3-nearest-neighbours classifier on the training part.
my_knn_clf = KNNClassifier(k=3)
my_knn_clf.fit(x_train, y_train)

# Accuracy: compare the predicted test labels with the true ones.
y_predict = my_knn_clf.predict(x_test)
acc = accuracy_score(y_test, y_predict)
print(acc)
超参数:
寻找好的参数:领域知识,经验,实验探索
超参数:
寻找最好的k:
是否考虑距离(weights 参数:uniform 表示不考虑距离,distance 表示按距离的倒数加权):
距离:
网格搜索中的Grid Search:
# Grid search over param_grid for the estimator knn_clf. n_jobs=-1 uses all
# CPU cores; verbose=2 prints progress messages during the search.
# (The original had "n_jobs=-1.verbose=2": a "." typed instead of ",".)
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)
第一个参数表示对哪个分类器进行网格搜索,第二个参数表示要搜索的参数网格(param_grid);n_jobs 表示并行计算时使用的 CPU 核心数,-1 表示使用全部核心;verbose 控制搜索过程中输出信息的详细程度,值越大,输出的信息越详细。
数据归一化
解决方案:将所有数据映射到同一尺度
最值归一化:把所有数据映射到0-1之间。
公式:$x_{scale} = \dfrac{x - x_{min}}{x_{max} - x_{min}}$
适用于分布有明显边界的情况;缺点是容易受极端边界值(outlier)的影响。
均值方差归一化
把所有数据归一到均值为0方差为1的分布中
公式:$x_{scale} = \dfrac{x - x_{mean}}{S}$,其中 $x_{mean}$ 为均值,$S$ 为标准差
适用于数据分布没有明显边界、并且可能存在极端数据值的情况。
对测试数据集归一化