import numpy as np
import pandas as pd
import operator
import csv
def classify(inX, trainSet, labels, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``trainSet``.

    Args:
        inX: Feature vector of the single sample to classify.
        trainSet: 2-D array of training samples, one sample per row.
        labels: Labels of the training samples; ``labels[i]`` belongs to
            ``trainSet[i]``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k nearest neighbours.
        When several labels share the highest count, the one met first
        while walking the neighbours from nearest to farthest wins.

    Raises:
        ValueError: If ``k`` is smaller than 1 or larger than the number
            of training samples.
    """
    # Work in float64: with unsigned integer data (e.g. uint8 pixels) the
    # subtraction and squaring below would silently wrap around.
    trainSet = np.asarray(trainSet, dtype=np.float64)
    inX = np.asarray(inX, dtype=np.float64)

    trainSetSize = trainSet.shape[0]
    if not 1 <= k <= trainSetSize:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (trainSetSize, k)
        )

    # Broadcasting subtracts inX from every row; no tiled copy is needed.
    diffMatrix = trainSet - inX
    distances = np.sqrt((diffMatrix ** 2).sum(axis=1))

    # argsort() returns the indices that order the distances ascending.
    sortedDistanceIndices = distances.argsort()

    classCount = {}
    for neighbourIndex in sortedDistanceIndices[:k]:
        voteLabel = labels[neighbourIndex]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first key holding the highest count, and dicts keep
    # insertion order, so a tie goes to the label that was seen first.
    return max(classCount, key=classCount.get)
def DigitRecognition(trainPath="train.csv", testPath="test.csv",
                     outputPath="submission.csv", k=3):
    """Label every row of the test CSV with kNN and write the results to CSV.

    The first column of the training CSV is used as the label and the
    remaining columns as features. Every column of the test CSV is used as
    features. One progress line is printed per test row.

    Args:
        trainPath: Path of the training CSV (label in the first column).
        testPath: Path of the test CSV (features only).
        outputPath: Path of the CSV to write, with the columns ``ImageId``
            (1-based row number) and ``Label`` (predicted label).
        k: Number of neighbours passed to ``classify``.
    """
    # Load the training data and split it into labels and features so the
    # feature matrix lines up column-for-column with the test rows.
    trainData = pd.read_csv(trainPath).values
    trainSetLabels = trainData[:, 0]
    trainSet = trainData[:, 1:]

    testSet = pd.read_csv(testPath).values

    # Predict a label for every test row, in file order.
    result = []
    for i, sample in enumerate(testSet):
        print("开始判断第%d条数据"%i)
        result.append(classify(sample, trainSet, trainSetLabels, k))

    # ImageId is the 1-based position of the row in the test file.
    index = list(range(1, len(result) + 1))
    predictions = pd.DataFrame({"ImageId": index, "Label": result})
    predictions.to_csv(outputPath, index=False)
# Run the whole pipeline: read the CSVs, classify every test row and write the
# result file. This is a module-level call, so it also executes on import.
DigitRecognition()
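# Recognition accuracy is around 97%.
# Kaggle notes: DigitRecognition (digit recognition), kNN solution
# You may also like
# Reposted from blog.csdn.net/u012706792/article/details/77979935
# Today's recommendations
# Weekly ranking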