在本文中,将选取正确率作为衡量标准,自行实现搜索最优超参数,而非直接调用sklearn中的GridSearchCV。
先简单介绍KNN中的三个超参数:
# n_neighbors:取邻近点的个数k。k取1-10测试 #weights:距离的权重;uniform:一致的权重;distance:距离的倒数作为权重 #p:闵可夫斯基距离的p值;闵可夫斯基距离公式: (∑|xi-yi|^p)^(1/p) (i=1,2,...n)。p=1:即曼哈顿距离;p=2:即欧式距离;p取1-6测试 #注意:p决定的是距离度量本身,与weights并无依赖关系;本文仅在weights=distance的搜索分支中对p进行遍历
# Script entry point: load the iris dataset, make a train/test split,
# then run the hyper-parameter search over the resulting globals.
# NOTE(review): in this listing the guard appears before searchBestPar()
# is defined — confirm the definition precedes this block in the actual
# script file, otherwise the call raises NameError at import time.
if __name__ == '__main__':
    iris = datasets.load_iris()
    trainX, testX, trainY, testY = train_test_split(iris.data, iris.target)
    searchBestPar()
# Search for the best hyper-parameters: n_neighbors / weights / p.
def searchBestPar():
    """Grid-search KNN hyper-parameters by test-set accuracy.

    Reads the module-level globals trainX/trainY/testX/testY (set up in
    the __main__ block), tries weights="uniform" (k = 1..10) and
    weights="distance" (k = 1..10, p = 1..6), and prints the best
    combination found.

    Returns:
        (bestK, bestWeight, bestP): the winning hyper-parameters.
        bestP is None when the "uniform" weighting won, since p was not
        part of that search branch.
    """
    bestScore = 0
    bestK = -1
    bestWeight = ""
    bestP = None  # only meaningful for the "distance" branch below

    # weights == "uniform": p is not searched in this branch.
    for k in range(1, 11):
        clf = KNeighborsClassifier(n_neighbors=k, weights="uniform")
        clf.fit(trainX, trainY)
        score = clf.score(testX, testY)
        if score > bestScore:
            bestScore = score
            bestK = k
            bestWeight = "uniform"
            bestP = None

    # weights == "distance": also search the Minkowski exponent p.
    # BUGFIX: the original used range(1, 10) here, silently skipping
    # k == 10; both branches now cover the same k range 1..10.
    for k in range(1, 11):
        for p in range(1, 7):
            clf = KNeighborsClassifier(n_neighbors=k, weights="distance", p=p)
            clf.fit(trainX, trainY)
            score = clf.score(testX, testY)
            if score > bestScore:
                bestScore = score
                bestK = k
                bestWeight = "distance"
                # BUGFIX: remember the winning p; the original printed the
                # loop variable after the loops, which is always 6.
                bestP = p

    print("the best n_neighbors", bestK)
    print("the best weights", bestWeight)
    print("the best p", bestP)
    return bestK, bestWeight, bestP