import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# `data` is assumed to be a pandas DataFrame loaded earlier:
# every column except the last is a feature, the last column is the label.
x = data.iloc[:, :-1]
y = data.iloc[:, -1]
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=420)
clf = KNeighborsClassifier(n_neighbors=3)
clf = clf.fit(Xtrain, Ytrain)
score = clf.score(Xtest, Ytest)    # accuracy on the held-out test set
result = clf.predict(Xtest)        # predict() needs the samples to classify
import matplotlib.pyplot as plt
score = []
krange = range(1, 20)
for i in krange:
    clf = KNeighborsClassifier(n_neighbors=i)
    clf = clf.fit(Xtrain, Ytrain)
    score.append(clf.score(Xtest, Ytest))
plt.plot(krange, score)
plt.show()
print(score.index(max(score)) + 1)    # k with the highest test accuracy
Now a new problem appears: since train_test_split splits the data randomly, the k with the highest score will differ from split to split. This is where we bring in cross-validation (k-fold validation).
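To see the instability concretely, here is a quick sketch; the seed values are arbitrary, chosen only for illustration:

for seed in (0, 42, 420):
    Xtr, Xte, Ytr, Yte = train_test_split(x, y, test_size=0.2, random_state=seed)
    scores = [KNeighborsClassifier(n_neighbors=k).fit(Xtr, Ytr).score(Xte, Yte)
              for k in range(1, 20)]
    print(seed, scores.index(max(scores)) + 1)   # best k changes with the split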
from sklearn.model_selection import cross_val_score as CVS
Xtrain,Xtest,Ytrain,Ytest = train_test_split(x,y,test_size=0.2,random_state=420)
clf = KNeighborsClassifier(n_neighbors=3)
cvs = CVS(clf, Xtrain, Ytrain, cv=5)   # 5-fold cross-validation scores
print(cvs.mean())                      # average accuracy across the folds
print(cvs.var())                       # variance: how stable the score is
Good. At this point we can put the whole thing together:
score = []
var = []
krange = range(1, 20)
for i in krange:
    clf = KNeighborsClassifier(n_neighbors=i)
    cvs = CVS(clf, Xtrain, Ytrain, cv=5)
    score.append(cvs.mean())
    var.append(cvs.var())
plt.plot(krange, score, color='k')
# dashed bands at mean ± 2*variance show how stable each k is
plt.plot(krange, np.array(score) + np.array(var) * 2, c='red', linestyle='--')
plt.plot(krange, np.array(score) - np.array(var) * 2, c='red', linestyle='--')
plt.show()
The next problem: KNN's computation is based on distances, so if the features differ wildly in scale, the large-scale features will dominate the distance, which sounds like no good at all. So we have to normalize the data
(Normalization, also known as Min-Max Scaling), which maps each feature into [0, 1] via x* = (x - min) / (max - min).
One point worth noting here: split into training and test sets first, then normalize! Fitting the scaler on the full data would leak test-set information into training.
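As a quick sanity check of the formula (the numbers below are made up, not from the dataset), scaling by hand matches what MinMaxScaler computes:

from sklearn.preprocessing import MinMaxScaler
arr = np.array([[1.0], [5.0], [9.0]])
print(((arr - arr.min()) / (arr.max() - arr.min())).ravel())   # [0.  0.5 1. ]
print(MinMaxScaler().fit_transform(arr).ravel())               # same result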
from sklearn.preprocessing import MinMaxScaler as mms
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x, y, test_size=0.2, random_state=420)
scaler = mms().fit(Xtrain)          # learn min/max from the training set only
X_train = scaler.transform(Xtrain)
X_test = scaler.transform(Xtest)    # reuse the training-set scaler, so both sets are on the same scale
score = []
var = []
krange = range(1, 20)
for i in krange:
    clf = KNeighborsClassifier(n_neighbors=i)
    cvs = CVS(clf, X_train, Ytrain, cv=5)   # labels are Ytrain from the split above
    score.append(cvs.mean())
    var.append(cvs.var())
plt.plot(krange, score, color='k')
plt.plot(krange, np.array(score) + np.array(var) * 2, c='red', linestyle='--')
plt.plot(krange, np.array(score) - np.array(var) * 2, c='red', linestyle='--')
plt.show()
print(score.index(max(score)) + 1)
Now there is one more small issue.
Plain KNN uses a one-point-one-vote scheme: it simply counts which class has more members within the neighborhood.
But common sense says an unknown point should resemble the neighbors that are closer to it.
Suppose k is 8, and 5 of a point's neighbors are far from it while 3 are very close:
KNN will assign the point to the group of 5, whereas in real life we would clearly put it with the group of 3.
This is where we add a "distance penalty" (distance weighting). How much it actually helps has to be weighed case by case in practice;
it suits datasets with many outliers.
Usage: pass weights='distance', which weights each neighbor's vote by the inverse of its distance.
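A minimal sketch of what inverse-distance voting does in the 5-far/3-near scenario above (all distances are made up):

dists = np.array([0.1, 0.15, 0.2, 2.0, 2.1, 2.2, 2.3, 2.4])
labels = np.array(['B', 'B', 'B', 'A', 'A', 'A', 'A', 'A'])
w = 1 / dists                          # closer neighbors get larger votes
for c in ('A', 'B'):
    print(c, w[labels == c].sum())     # A ≈ 2.28, B ≈ 21.67 -> B wins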
score = []
var = []
krange = range(1, 20)
for i in krange:
    clf = KNeighborsClassifier(n_neighbors=i, weights='distance')
    cvs = CVS(clf, X_train, Ytrain, cv=5)
    score.append(cvs.mean())
    var.append(cvs.var())
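Presumably this block ends the same way as the earlier ones, plotting the curve and printing the best k; for completeness:

plt.plot(krange, score, color='k')
plt.plot(krange, np.array(score) + np.array(var) * 2, c='red', linestyle='--')
plt.plot(krange, np.array(score) - np.array(var) * 2, c='red', linestyle='--')
plt.show()
print(score.index(max(score)) + 1)    # best k under distance weighting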