from numpy import *
import operator
import os

# NOTE: the star import above may replace builtins such as sum/min/max/any/all
# with their NumPy namesakes, so the code below deliberately avoids calling
# those builtins.

# Default data locations, used when a function is called without a path.
DATING_DATA_FILE = 'D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/datingTestSet2.txt'
TRAINING_DIGITS_DIR = 'D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/digits/trainingDigits'
TEST_DIGITS_DIR = 'D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/digits/testDigits'


# Build a small data set for testing.
def createDataset():
    """Return a four-point toy data set and its class labels.

    Returns:
        tuple: ``(group, labels)`` where ``group`` is a 4x2 array of points
        and ``labels`` is the list of their class names ('A' or 'B').
    """
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


# k-nearest-neighbours algorithm.
def classifier0(inX, dataset, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest rows.

    Args:
        inX: the point to classify; a sequence or array that broadcasts
            against the rows of ``dataset``.
        dataset: 2-D array holding one training sample per row.
        labels: sequence with the label of each row of ``dataset``.
        k: number of nearest neighbours that take part in the vote.

    Returns:
        The label with the most votes. On a tie, the label that was seen
        first among the nearest neighbours wins (``sorted`` is stable).

    Raises:
        IndexError: if ``k`` is smaller than 1 or larger than the number of
            rows in ``dataset``.
    """
    # Broadcasting subtracts inX from every row; the sign of the difference
    # does not matter because it is squared on the next line.
    diffMat = dataset - inX
    distance = ((diffMat ** 2).sum(axis=1)) ** 0.5  # Euclidean distance per row
    sortedDistIndicies = distance.argsort()

    classcount = {}
    for i in range(k):
        voteILabel = labels[sortedDistIndicies[i]]
        classcount[voteILabel] = classcount.get(voteILabel, 0) + 1

    sortedClassCount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


# Notes on the functions used
# tile(A, (m, n)): repeats array A m times along the rows and n times along
#   the columns to build a new array. NumPy broadcasting yields the same
#   result for `dataset - inX` without materialising the repeated array.
# numpy.argsort(): returns the indices that sort the array in ascending order.
# dict.get(key, default=None): returns the value stored under key, or the
#   default (None) when the key is not in the dictionary.
# sorted(iterable[, cmp[, key[, reverse]]])
#   iterable: the iterable object to sort.
#   cmp: comparison function taking two items from the iterable; returns 1 if
#     the first is greater, -1 if smaller, 0 if equal (Python 2 only; it was
#     removed in Python 3).
#   key: one-argument function applied to each item of the iterable; its
#     result is the value the items are ordered by.
#   reverse: sort order, True for descending, False for ascending (default).

# Test
# group, labels = createDataset()
# clf = classifier0([0, 0], group, labels, 3)
# print(clf)


# [Example] Classifying Hellen's dating matches -- loading the data.
def file2matrix(filenname):
    """Load a tab-separated data file into a feature matrix and label list.

    The first three fields of every line become one row of features and the
    last field is parsed as the integer class label. Blank lines are skipped.

    Args:
        filenname: path of the text file to read.

    Returns:
        tuple: ``(returnMat, classLabelVector)`` where ``returnMat`` is an
        ``(n, 3)`` float array and ``classLabelVector`` a list of ``n`` ints.

    Raises:
        ValueError: if a feature is not a number or a label is not an integer.
    """
    # The context manager closes the file even when parsing fails.
    with open(filenname) as fr:
        records = [line.strip().split('\t') for line in fr if line.strip()]

    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, fields in enumerate(records):
        returnMat[index, :] = [float(value) for value in fields[0:3]]
        classLabelVector.append(int(fields[-1]))
    return returnMat, classLabelVector


# Data analysis -- visualization
# filename = 'D:/2. 数据分析/机器学习实战/machinelearninginaction/Ch02/datingTestSet2.txt'
# datingMat, datingLabels = file2matrix(filename)
# print(datingMat)
# print(datingLabels)
# import matplotlib.pyplot as plt
# plt.figure().add_subplot(111).scatter(datingMat[:, 1], datingMat[:, 2])
# plt.show()

# Notes on the functions used
# add_subplot(349)
#   Argument 349: split the canvas into 3 rows and 4 columns and draw in the
#   9th cell, counting left to right and top to bottom.
#   3410 does not work; use the tuple form (3, 4, 10) instead.


# Data normalization.
def autoNorm(dataset):
    """Rescale every column of ``dataset`` to the range [0, 1].

    Args:
        dataset: 2-D array with one sample per row.

    Returns:
        tuple: ``(normDataset, ranges, minVals)`` where ``normDataset`` is
        ``(dataset - minVals) / ranges``, ``ranges`` is the per-column
        ``max - min`` and ``minVals`` is the per-column minimum.

    Note:
        A column whose values are all equal has a range of 0, so its
        normalized values come out as NaN (0 / 0).
    """
    minVals = dataset.min(0)
    maxVals = dataset.max(0)
    ranges = maxVals - minVals
    # Broadcasting applies the per-column offset and scale to every row.
    normDataset = (dataset - minVals) / ranges
    return normDataset, ranges, minVals


# Evaluate the classifier.
def datingClassTest(filename=DATING_DATA_FILE, hoRatio=0.10, k=3):
    """Hold out the first rows of the dating data and print the error rate.

    The first ``int(m * hoRatio)`` rows (``m`` = number of samples) are used
    as test vectors; the remaining rows form the training set.

    Args:
        filename: data file in the format read by ``file2matrix``.
        hoRatio: fraction of the samples held out for testing.
        k: number of neighbours passed to ``classifier0``.
    """
    datingMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVal = autoNorm(datingMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    if numTestVecs == 0:
        # Nothing to evaluate; avoids dividing by zero below.
        print('no test vectors: the data set is too small for this hold-out ratio')
        return

    errorCount = 0.0
    for i in range(numTestVecs):
        clf = classifier0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], k)
        print('the classifier came back with: %d, the real answer is: %d' % (clf, datingLabels[i]))
        if clf != datingLabels[i]:
            errorCount += 1
    print('the total error rate is %f' % (errorCount / float(numTestVecs)))


# datingClassTest()
# the total error rate is 0.050000


# Help Hellen classify a new match.
def classifyPerson(filename=DATING_DATA_FILE, k=3):
    """Ask for a person's three features on stdin and print the prediction.

    The feature vector is built as (flier miles, game-time percentage,
    ice cream litres), normalized with the training set's minimum and range,
    and classified against the whole training set.

    Args:
        filename: data file in the format read by ``file2matrix``.
        k: number of neighbours passed to ``classifier0``.
    """
    # NOTE(review): indexing with clf - 1 assumes the labels in the data file
    # are 1, 2 and 3 -- confirm against the data.
    results = ['not at all', 'is small doses', 'is large doses']
    percentTat = float(input('percentage of time spent playing cideo games?'))
    ffMiles = float(input('frequent flier miles earned per year'))
    iceCream = float(input('liters of ice cream consumed per year'))
    datingMat, datingLabel = file2matrix(filename)
    normMat, ranges, minVal = autoNorm(datingMat)
    # NOTE(review): assumes the data file's columns are in this same order.
    inArr = array([ffMiles, percentTat, iceCream])
    clf = classifier0((inArr - minVal) / ranges, normMat, datingLabel, k)
    print('you will probably like this person', results[clf-1])


# [Example] Handwritten digit recognition
# First attempt at the loader; the model's error rate was very high,
# the error rate is: 0.821353
# def img2vector(filename):
#     returnVect = zeros((1, 1024))
#     for i in range(32):
#         line = open(filename).readline()
#         for j in range(32):
#             returnVect[0, 32*i+j] = int(line[j])
#     return returnVect

# Second attempt, following the textbook (successive lines are read from one
# open file); the error rate dropped sharply, the error rate is: 0.010571
def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 vector.

    Args:
        filename: path of a text file whose first 32 lines each start with
            at least 32 digit characters.

    Returns:
        A ``(1, 1024)`` float array; the character at row ``i``, column ``j``
        is stored at index ``32 * i + j``.

    Raises:
        IndexError: if the file has fewer than 32 lines or a line is shorter
            than 32 characters.
        ValueError: if a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # One handle for all 32 reads, closed automatically on exit.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


# The key difference between the two loaders is the object readline() is
# called on inside the loop: the first attempt reopened the file on every
# iteration, so it always read the first line and produced wrong data.

# Loader from the book "Python Machine Learning Applications"; the data is a
# one-dimensional array, which is more efficient.
# def img2vector(fileName):
#     retMat = zeros([1024], int)
#     lines = open(fileName).readlines()
#     for i in range(32):
#         for j in range(32):
#             retMat[i*32+j] = lines[i][j]
#     return retMat


def handwritingClassTest(train_path=TRAINING_DIGITS_DIR, test_path=TEST_DIGITS_DIR, k=3):
    """Classify every test digit image and print the error count and rate.

    The label of an image is the integer before the first underscore in its
    file name.

    Args:
        train_path: directory holding the training images.
        test_path: directory holding the test images.
        k: number of neighbours passed to ``classifier0``.
    """
    trainingFileList = os.listdir(train_path)
    trainingMat = zeros((len(trainingFileList), 1024))
    labels = []
    for i, filename in enumerate(trainingFileList):
        labels.append(int(filename.split('_')[0]))
        trainingMat[i, :] = img2vector(os.path.join(train_path, filename))

    testFileList = os.listdir(test_path)
    mTest = len(testFileList)
    if mTest == 0:
        # Nothing to evaluate; avoids dividing by zero below.
        print('no test files found in', test_path)
        return

    errorCount = 0
    for filename_test in testFileList:
        label_test = int(filename_test.split('_')[0])
        vector_test = img2vector(os.path.join(test_path, filename_test))
        clf = classifier0(vector_test, trainingMat, labels, k)
        # print('the classifier came back with: %d, the real answer is: %d' % (clf, label_test))
        if clf != label_test:
            errorCount += 1
    print('the total nummer of error is: ', errorCount)
    print('the error rate is: %f' % float(errorCount / mTest))


# handwritingClassTest()
# the total nummer of error is: 10
# the error rate is: 0.010571
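# Machine Learning in Action -- k-nearest neighbors algorithm (code)
# You may also like
# Reposted from blog.csdn.net/Hu_Pengxue/article/details/80894218
# Today's recommendations
# Weekly ranking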