from numpy import *
import operator
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# k-nearest-neighbors classifier
def classify0(inX, dataSet, labels, k):
    """Classify inX by majority vote among its k nearest neighbors.

    inX      -- the query feature vector
    dataSet  -- training feature matrix, one sample per row
    labels   -- class label for each training row
    k        -- number of neighbors consulted
    Returns the winning label.
    """
    n_samples = dataSet.shape[0]
    # Euclidean distance from inX to every training row: replicate inX
    # across n_samples rows, subtract, square, sum per row, square-root.
    deltas = tile(inX, (n_samples, 1)) - dataSet
    dists = ((deltas ** 2).sum(axis=1)) ** 0.5
    # Indices of training rows ordered from nearest to farthest.
    nearest = dists.argsort()
    votes = {}
    for rank in range(k):
        neighbor_label = labels[nearest[rank]]
        votes[neighbor_label] = votes.get(neighbor_label, 0) + 1
    # On a tie the first-inserted (i.e. nearest-seen) label wins, matching
    # the stable reverse sort used by the original implementation.
    return max(votes, key=votes.get)
# Convert the data in a file into a feature matrix and label list
def file2matrix(filename):
    """Parse a tab-separated data file into features and labels.

    Each line holds three numeric feature fields followed by an
    integer class label.

    Returns:
        (numpy array of shape (n, 3), list of int labels)
    """
    # FIX: the original opened the file without ever closing it;
    # `with` guarantees the handle is released even if parsing fails.
    with open(filename) as fr:
        arrayOLines = fr.readlines()
    returnMat = zeros((len(arrayOLines), 3))
    classLabelVector = []
    for index, line in enumerate(arrayOLines):
        # strip the trailing newline / surrounding whitespace, then split
        listFromLine = line.strip().split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector
# Visualize the data distribution
def pic(datingDataMat, datingLabels):
    """Scatter-plot the first two feature columns, styled per class label."""
    figure = plt.figure()
    axes = figure.add_subplot(111)  # 1x1 grid, first subplot
    label_arr = array(datingLabels)
    # (label value, marker, color, legend text, point size) for each class
    class_styles = [
        (1, '+', 'r', '1', 10),
        (2, '*', 'b', '2', 20),
        (3, 'd', 'y', '3', 30),
    ]
    for value, marker, color, legend, size in class_styles:
        # np.where yields the indices of all samples carrying this label
        picked = np.where(label_arr == value)
        axes.scatter(datingDataMat[picked, 0], datingDataMat[picked, 1],
                     marker=marker, c=color, label=legend, s=size)
    # NOTE(review): axis labels may not match the actual column meanings
    # of the dataset — confirm against datingTestSet2.txt.
    plt.ylabel('ice cream')
    plt.xlabel('play')
    plt.show()
# Min-max normalization
def autoNorm(dataSet):
    """Scale every column of dataSet linearly into [0, 1].

    Returns:
        (normalized matrix, per-column range, per-column minimum)
    """
    col_min = dataSet.min(0)   # column-wise minimum
    col_max = dataSet.max(0)   # column-wise maximum
    col_range = col_max - col_min
    # NumPy broadcasting applies the per-column shift and scale to every
    # row, replacing the explicit tile() copies of the original.
    normalized = (dataSet - col_min) / col_range
    return normalized, col_range, col_min
# Measure the classifier's error rate on a hold-out split
def datingClassTest():
    """Estimate the k-NN error rate on a 10% hold-out of the dating data.

    The first 10% of samples are tested against a classifier trained on
    the remaining 90%; the running error rate is printed.
    """
    hoRatio = 0.1  # fraction of samples held out for testing
    datingDataMat, datingLabels = file2matrix('E:/BaiduNetdiskDownload/machinelearninginaction/Ch02/datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # rows [0, numTestVecs) tested, the rest train
    errorCount = 0  # number of misclassified test samples
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :], datingLabels[numTestVecs:m], 3)
        print('the test result is %d,the real result is %d' % (classifierResult, datingLabels[i]))
        # BUG FIX: the original used `is not`, which compares object
        # identity, not value — it only appeared to work because CPython
        # caches small integers. `!=` is the correct value comparison.
        if classifierResult != datingLabels[i]:
            errorCount += 1
    print('the total error rate is :', errorCount / float(numTestVecs))
# Interactive dating-site prediction
def classifyPerson():
    """Prompt for a person's three features and predict the liking level.

    Reads the three feature values from stdin, normalizes them with the
    training data's statistics, and prints the predicted category.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(input('percentage of time spent playing video games?'))
    ffMiles = float(input('frequent flier miles earned per year?'))
    iceCream = float(input('liters of ice cream consumed per year?'))
    datingDataMat, datingLabels = file2matrix('E:/BaiduNetdiskDownload/machinelearninginaction/Ch02/datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # BUG FIX: the book's datingTestSet2.txt columns are
    # (flier miles, game-time %, ice cream), so the query vector must use
    # that order; the original put percentTats first, swapping the first
    # two features. (Matches the original textbook listing — confirm
    # against the dataset file.)
    inX = array([ffMiles, percentTats, iceCream])
    resultLabel = classify0((inX - minVals) / ranges, normMat, datingLabels, 3)
    print('You will probably like this person : ', resultList[resultLabel - 1])
# Run the interactive prediction when executed as a script.
if __name__ == '__main__':
    classifyPerson()
# k-NN in action 1 — dating-site example
# Adapted from blog.csdn.net/lwycc2333/article/details/81489474