from numpy import *
import operator
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# k-nearest-neighbors classifier
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest neighbors.

    inX      -- the query feature vector
    dataSet  -- training feature matrix, one sample per row
    labels   -- class label for each training row
    k        -- number of neighbors consulted
    Returns the winning label.
    """
    n_samples = dataSet.shape[0]
    # Euclidean distance from inX to every training row: replicate inX
    # across n_samples rows, subtract, square, sum per row, square-root.
    deltas = tile(inX, (n_samples, 1)) - dataSet
    dists = ((deltas ** 2).sum(axis=1)) ** 0.5
    # Indices of training rows ordered from nearest to farthest.
    nearest = dists.argsort()
    votes = {}
    for rank in range(k):
        neighbor_label = labels[nearest[rank]]
        votes[neighbor_label] = votes.get(neighbor_label, 0) + 1
    # On a tie the first-inserted (i.e. nearest-seen) label wins, matching
    # the stable reverse sort used by the original implementation.
    return max(votes, key=votes.get)
# Convert the data in a file into a feature matrix and label list
def file2matrix(filename):
    """Parse a tab-separated data file into features and labels.

    Each line holds three numeric feature fields followed by an
    integer class label.

    Returns:
        (numpy array of shape (n, 3), list of int labels)
    """
    # FIX: the original opened the file without ever closing it;
    # `with` guarantees the handle is released even if parsing fails.
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        # strip the trailing newline / surrounding whitespace, then split
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Visualize the data distribution
def pic(datingDataMat, datingLabels):
    """Scatter-plot the first two feature columns, styled per class label."""
    figure = plt.figure()
    axes = figure.add_subplot(111)  # 1x1 grid, first subplot
    label_arr = array(datingLabels)
    # (label value, marker, color, legend text, point size) for each class
    class_styles = [
        (1, '+', 'r', '1', 10),
        (2, '*', 'b', '2', 20),
        (3, 'd', 'y', '3', 30),
    ]
    for value, marker, color, legend, size in class_styles:
        # np.where yields the indices of all samples carrying this label
        picked = np.where(label_arr == value)
        axes.scatter(datingDataMat[picked, 0], datingDataMat[picked, 1],
                     marker=marker, c=color, label=legend, s=size)
    # NOTE(review): axis labels may not match the actual column meanings
    # of the dataset — confirm against datingTestSet2.txt.
    plt.ylabel('ice cream')
    plt.xlabel('play')
    plt.show()
# Min-max normalization
def autoNorm(dataSet):
    """Scale every column of dataSet linearly into [0, 1].

    Returns:
        (normalized matrix, per-column range, per-column minimum)
    """
    col_min = dataSet.min(0)   # column-wise minimum
    col_max = dataSet.max(0)   # column-wise maximum
    col_range = col_max - col_min
    # NumPy broadcasting applies the per-column shift and scale to every
    # row, replacing the explicit tile() copies of the original.
    normalized = (dataSet - col_min) / col_range
    return normalized, col_range, col_min
# Measure the classifier's error rate on a hold-out split
def datingClassTest():
    """Estimate the k-NN error rate on a 10% hold-out of the dating data.

    The first 10% of samples are tested against a classifier trained on
    the remaining 90%; the running error rate is printed.
    """
    hoRatio = 0.1  # fraction of samples held out for testing
    datingDataMat, datingLabels = file2matrix('E:/BaiduNetdiskDownload/machinelearninginaction/Ch02/datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # rows [0, numTestVecs) tested, the rest train
    errorCount = 0  # number of misclassified test samples
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print('the test result is %d,the real result is %d' % (classifierResult, datingLabels[i]))
        # BUG FIX: the original used `is not`, which compares object
        # identity, not value — it only appeared to work because CPython
        # caches small integers. `!=` is the correct value comparison.
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print('the total error rate is :', errorCount / float(numTestVecs))
# Interactive dating-site prediction
def classifyPerson():
    """Prompt for a person's three features and predict the liking level.

    Reads the three feature values from stdin, normalizes them with the
    training data's statistics, and prints the predicted category.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    datingDataMat, datingLabels = file2matrix('E:/BaiduNetdiskDownload/machinelearninginaction/Ch02/datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # BUG FIX: the book's datingTestSet2.txt columns are
    # (flier miles, game-time %, ice cream), so the query vector must use
    # that order; the original put percentTats first, swapping the first
    # two features. (Matches the original textbook listing — confirm
    # against the dataset file.)
    inX = array([ffMiles, percentTats, iceCream])
    resultLabel = classify0((inX - minVals) / ranges, normMat, datingLabels, 3)
    print('You will probably like this person : ', resultList[resultLabel - 1])
# Run the interactive prediction when executed as a script.
if __name__ == '__main__':
    classifyPerson()
# k-NN in action 1 — dating-site example
# Adapted from blog.csdn.net/lwycc2333/article/details/81489474