knn_1
start
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
载入文件,查看一下
# Load the breast-cancer dataset (tab-separated file) and take a first look.
cancer = pd.read_csv('./cancer.csv',sep='\t')
# display() is a Jupyter/IPython builtin: shows shape (569, 32) and column names.
display(cancer.shape,cancer.columns)
cancer.head()
- (569, 32)
Index(['ID', 'Diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave_mean', 'symmetry_mean', 'fractal_mean', 'radius_sd', 'texture_sd', 'perimeter_sd', 'area_sd', 'smoothness_sd', 'compactness_sd', 'concavity_sd', 'concave_sd', 'symmetry_sd', 'fractal_sd', 'radius_max', 'texture_max', 'perimeter_max', 'area_max', 'smoothness_max', 'compactness_max', 'concavity_max', 'concave_max',
'symmetry_max', 'fractal_max'],
dtype='object')
ID Diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave_mean … radius_max texture_max perimeter_max area_max smoothness_max compactness_max concavity_max concave_max symmetry_max fractal_max
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 … 25.38 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 … 24.99 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 … 23.57 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 … 14.91 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 … 22.54 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678
5 rows × 32 columns
取出分类特征和分类结果,分别用X,y接收
# Features: every column from the 3rd onward (skips the ID and Diagnosis columns).
X = cancer.iloc[:,2:]
# Target: the Diagnosis label ('M' visible in the sample above; presumably 'B'
# for the benign class — TODO confirm against the data file).
y = cancer['Diagnosis']
划分数据
from sklearn.model_selection import train_test_split
# test_size=0.2 yields an 80/20 (i.e. 4:1) train/test split.
# (The original comment claimed 3:1, which does not match test_size=0.2.)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
- train_test_split 可以自动(X与y同步)打乱数据并进行切分;X–>X_train,X_test…
- test_size 可以是比例值,即划分的测试数据占总样本的比例;也可以是具体的数量
训练并查看预测准确率
# Train a KNN classifier and measure accuracy on the held-out test set.
knn = KNeighborsClassifier() # n_neighbors defaults to 5
knn.fit(X_train,y_train) # fit on the 80% training split
knn.score(X_test,y_test) # mean accuracy; 0.9385964912280702 on one run
多次训练,取平均值
def knn_score(X, labels=None, n_rounds=100):
    """Average KNN (k=5) test accuracy over repeated random 80/20 splits.

    Parameters
    ----------
    X : feature matrix passed to ``train_test_split``.
    labels : target vector. Defaults to the module-level ``y`` so existing
        ``knn_score(X)`` call sites keep working unchanged.
    n_rounds : number of random splits to average over (originally a
        hard-coded 100).

    Returns
    -------
    float : mean accuracy across all rounds.
    """
    if labels is None:
        labels = y  # backward-compatible fallback to the global target
    knn = KNeighborsClassifier(n_neighbors=5)
    total = 0.0
    for _ in range(n_rounds):
        X_train, X_test, y_train, y_test = train_test_split(
            X, labels, test_size=0.2)
        knn.fit(X_train, y_train)
        total += knn.score(X_test, y_test)
    # Divide once at the end instead of dividing every partial score by 100.
    return total / n_rounds
s_ = knn_score(X) # 0.927280701754386
将特征归一化,再次进行训练测试
# Alternative: z-score standardization
# X_r1 = (X - X.mean())/X.std()
# Min-max normalization: rescale every feature into [0, 1].
X_r1 = (X-X.min(axis = 0))/(X.max(axis = 0) - X.min(axis = 0))
knn_score(X_r1) # 0.9686842105263161 — clearly better than the unscaled 0.9273
-
可见,特征的归一化有时可以提高准确率
-
在sklearn的预处理包中也提供了归一化方法
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# NOTE(review): in the original dump the fit/transform statements had been
# swallowed into trailing comments, so only the constructors actually ran;
# restored the intended fit-then-transform sequence below.

# Min-max scaling to [0, 1]. (Variable name keeps the original's
# "scaller" spelling for backward compatibility with later cells.)
min_max_scaller = MinMaxScaler()
min_max_scaller.fit(X)
X_min_max = min_max_scaller.transform(X)  # result is an ndarray, not a DataFrame

# z-score standardization (zero mean, unit variance per feature).
standard_scaler = StandardScaler()
standard_scaler.fit(X)
X_standard = standard_scaler.transform(X)
交叉验证
from sklearn.model_selection import cross_val_score,KFold
knn = KNeighborsClassifier()
# 10-fold cross validation: each fold serves once as the test set.
score = cross_val_score(knn,X,y,cv=10)
# Equivalent form with an explicit splitter object:
# score = cross_val_score(knn,X,y,cv=KFold(10))
score,score.mean()
- (array([0.9137931, 0.87931034, 0.89473684, 0.96491228, 0.94736842, 0.92982456, 0.96491228, 0.92857143, 0.91071429, 0.96428571]),
0.9298429262812202)
cv:
- None, to use the default 3-fold cross validation,
- integer, to specify the number of folds in a(Stratified)KFold
,
- :term:CV splitter
,
- An iterable yielding (train, test) splits as arrays of indices.
-
交叉验证,即将数据均匀分成若干折(段),每一折(段)轮流作为测试数据;
-
cv = 10 即分成10折(段)
-
KFold 可以自行使用,类似train_test_split
A = np.random.randint(0,100,size=8) y = np.array([0,1,1,0,0,0,1,0]) kf = KFold(n_splits=4) for X_train,X_test in kf.split(A,y): print(X_train,X_test)
- [2 3 4 5 6 7] [0 1]
[0 1 4 5 6 7] [2 3]
[0 1 2 3 6 7] [4 5]
[0 1 2 3 4 5] [6 7]
- [2 3 4 5 6 7] [0 1]