# Machine Learning in Action (机器学习实战), Peter Harrington -- chapter 2: k-nearest neighbours.
#
# NOTE: the star import is kept for backward compatibility with code that
# relies on the names it exposes.  It shadows builtins such as sum/min/max,
# so the functions below deliberately avoid calling those builtins.
from numpy import *
import operator
import os
from os import listdir
import time

# matplotlib is only needed by the optional plotDatingData() helper, so a
# missing plotting stack must not prevent the classifiers from being used.
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    matplotlib = None
    plt = None


def creatDataSet():
    """Return a tiny hand-made data set: four 2-D points and their labels."""
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify *inX* by majority vote among its k nearest neighbours.

    Parameters:
        inX: feature vector to classify, shape (d,) or (1, d).
        dataSet: training samples, shape (n, d).
        labels: sequence of n class labels, aligned with the rows of dataSet.
        k: number of neighbours that vote.  Values larger than n are
            treated as n (every sample votes).

    Returns:
        The label with the most votes.  On a tie the label met first when
        walking the neighbours from nearest to farthest wins.

    Raises:
        ValueError: if dataSet has no rows or k is smaller than 1.
    """
    dataSet = asarray(dataSet)
    if dataSet.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Euclidean distance from inX to every row; broadcasting replaces the
    # explicit tile() copy of inX.
    diffMat = asarray(inX) - dataSet
    distances = (diffMat ** 2).sum(axis=1) ** 0.5

    # Row indices ordered from nearest to farthest; slicing clamps k to n.
    nearest = distances.argsort()[:k]

    classCount = {}
    for rowIndex in nearest:
        voteIlabel = labels[rowIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Stable sort, highest vote count first.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename, numFeatures=3):
    """Parse a tab-separated text file into a feature matrix and a label list.

    Each non-blank line holds the feature values followed by an integer
    class label in the last column.

    Parameters:
        filename: path of the text file.
        numFeatures: number of leading columns stored as features
            (default 3, the layout of datingTestSet2.txt).

    Returns:
        (returnMat, classLabelVector): a float array with one row per
        non-blank line and numFeatures columns, and the list of int labels.
    """
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        rows = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(rows), numFeatures))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value)
                               for value in listFromLine[0:numFeatures]]
        classLabelVector.append(int(listFromLine[-1]))  # last column is the label
    return returnMat, classLabelVector


def _safeRanges(ranges):
    """Return *ranges* with zero entries replaced by 1 so it is a safe divisor."""
    return where(ranges == 0, 1, ranges)


def autoNorm(dataSet):
    """Rescale every feature column of *dataSet* to the interval [0, 1].

    A column whose values are all equal has range 0; it is mapped to 0
    instead of producing NaN from a division by zero.

    Returns:
        (normDataSet, ranges, maxVals, minVals) where ranges is the true
        per-column max - min (it may contain zeros).
    """
    minVals = dataSet.min(0)  # axis 0: minimum of each column
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    normDataSet = (dataSet - minVals) / _safeRanges(ranges)
    return normDataSet, ranges, maxVals, minVals


def datingClassTest(filename='datingTestSet2.txt', hoRatio=0.10, k=3):
    """Hold-out test of the kNN classifier on the dating data set.

    The first hoRatio fraction of the rows is classified against the
    remaining rows, then the error rate and the accuracy are printed.

    Raises:
        ValueError: if the hold-out part contains no samples.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)[0]
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        raise ValueError("hold-out set is empty; increase hoRatio or add data")

    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        if classifierResult != datingLabels[i]:
            errorCount += 1.0

    errorRate = errorCount / float(numTestVecs)
    print("error_rate: %f" % errorRate)
    print("acc_rate: %f" % (1 - errorRate))


def classifyPerson():
    """Ask for three feature values on stdin and print the predicted class."""
    resultList = ['not', 'small', 'large']
    percentTats = float(input("percentage of time?"))
    ffMiles = float(input("frequent flier miles earned?"))
    iceCream = float(input("liters of ice consumed?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, maxVals, minVals = autoNorm(datingDataMat)
    # Same column order as the data file: miles, game time, ice cream.
    inArr = array([ffMiles, percentTats, iceCream])
    # Normalise the query exactly like the training data (zero-range safe).
    classifierResult = classify0((inArr - minVals) / _safeRanges(ranges),
                                 normMat, datingLabels, 3)
    # Class labels in the file are 1..3 while resultList is 0-indexed.
    print("you like person:", resultList[classifierResult - 1])
    print(classifierResult)


def plotDatingData(datingDataMat, datingLabels):
    """Scatter-plot columns 1 and 2, sizing and colouring points by label.

    Optional helper; it is not called by the demo in main().

    Raises:
        RuntimeError: if matplotlib could not be imported.
    """
    if plt is None:
        raise RuntimeError("matplotlib is required for plotting")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
               15.0 * array(datingLabels), 15.0 * array(datingLabels))
    plt.show()


def img2vector(filename):
    """Read a 32x32 text image of 0/1 characters into a (1, 1024) row vector."""
    returnVector = zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVector[0, 32 * i + j] = int(lineStr[j])
    return returnVector


def _digitFromFileName(fileNameStr):
    """Return the digit encoded in a file name such as '9_99.txt' (-> 9)."""
    fileStr = fileNameStr.split('.')[0]
    return int(fileStr.split('_')[0])


def _loadDigitDir(dirName):
    """Load every image file in *dirName*; return (matrix, labels)."""
    fileList = listdir(dirName)
    digitMat = zeros((len(fileList), 1024))
    digitLabels = []
    for i, fileNameStr in enumerate(fileList):
        digitLabels.append(_digitFromFileName(fileNameStr))
        digitMat[i, :] = img2vector(os.path.join(dirName, fileNameStr))
    return digitMat, digitLabels


def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits',
                         k=3):
    """Classify every image in testDir against trainingDir and print results.

    Prints one line per test image, then the error count and error rate.

    Raises:
        ValueError: if testDir or trainingDir contains no files.
    """
    trainingMat, hwLabels = _loadDigitDir(trainingDir)

    testFileList = listdir(testDir)
    mTest = len(testFileList)
    if mTest == 0:
        raise ValueError("no test files found in %r" % testDir)

    errorCount = 0.0
    for fileNameStr in testFileList:
        classNumStr = _digitFromFileName(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("classifier: %d, real_answer: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print("error_num:", errorCount)
    print("error_rate:", (errorCount / float(mTest)))


def main():
    """Run the chapter's demo: toy data, dating data, handwritten digits."""
    # Toy data set.
    group, labels = creatDataSet()
    print("group:\n", group)
    print("labels:\n", labels)
    classify0([0, 0], group, labels, 3)

    # Dating data: parse, normalise, hold-out test.
    datingDataMat, datingLabels = file2matrix("datingTestSet2.txt")
    print("datingDataMat:\n", datingDataMat)
    autoNorm(datingDataMat)
    datingClassTest()

    # Handwritten digits: convert one image, then time the full test.
    img2vector('testDigits/1_13.txt')
    star = time.perf_counter()
    handwritingClassTest()
    end = time.perf_counter()
    print("测试时间:%f s" % (end - star))


if __name__ == '__main__':
    main()
# KNN-机器学习实战-Peter Harrington
# 猜你喜欢
# 转载自blog.csdn.net/qq_34638161/article/details/81036983
# 今日推荐
# 周排行