#!/usr/bin/env python
# -*- Coding:UTF-8 -*-
import numpy as np
import operator
import os
def create_dataset():
    """Build the tiny hard-coded sample set used to try out the classifier.

    :return: tuple ``(group, labels)``; ``group`` is a 4x2 float array with
        one point per row and ``labels`` is the list of class labels, one
        per row of ``group``.
    """
    labels = ['A', 'A', 'B', 'B']
    points = [
        [1.0, 1.1],
        [1.0, 1.0],
        [0.0, 0.0],
        [0.0, 0.1],
    ]
    return np.array(points), labels
def class_dataset(test_data, train_data, train_label, k):
    """Classify one sample with the k-nearest-neighbours rule.

    The Euclidean distance from ``test_data`` to every training row is
    computed, the ``k`` closest rows vote with their labels, and the label
    with the most votes wins.  When several labels share the top count, the
    one met first while walking the neighbours from nearest to farthest is
    returned.

    :param test_data: feature vector to classify; a 1-D sequence/array or a
        1xN array whose width matches ``train_data``
    :param train_data: 2-D array-like with one training sample per row
    :param train_label: sequence of labels indexed like the rows of
        ``train_data``
    :param k: number of neighbours that vote; values larger than the number
        of training samples are clamped to that number
    :return: the winning label (an element of ``train_label``)
    :raises ValueError: if ``k`` is smaller than 1 or ``train_data`` is empty
    """
    train_data = np.asarray(train_data)
    num_train = len(train_data)
    if num_train == 0:
        raise ValueError('train_data must contain at least one sample')
    if k < 1:
        raise ValueError('k must be a positive integer')

    # Broadcasting subtracts the sample from every row without first
    # materialising a tiled copy of it.
    diff = np.asarray(test_data) - train_data
    distance = ((diff ** 2).sum(axis=1)) ** 0.5

    # Never ask for more neighbours than there are training samples.
    nearest = distance.argsort()[:min(k, num_train)]

    votes = {}
    for idx in nearest:
        label = train_label[idx]
        votes[label] = votes.get(label, 0) + 1

    # max() keeps the first of several equally large counts, which matches a
    # stable descending sort over the insertion-ordered vote table.
    return max(votes.items(), key=operator.itemgetter(1))[0]
def read_file(filename, num_feature=3):
    """Load a tab-separated text file into a feature matrix and label vector.

    Every non-blank line must hold at least ``num_feature + 1`` tab-separated
    numeric fields: the first ``num_feature`` are the features and the next
    one is the label.  Blank lines are skipped.

    :param filename: path of the UTF-8 encoded text file to read
    :param num_feature: number of leading columns treated as features
        (defaults to 3)
    :return: tuple ``(feature_mat, label_mat)``; ``feature_mat`` is a float
        array of shape ``(rows, num_feature)`` and ``label_mat`` is a float
        array of length ``rows``
    :raises ValueError: if a field cannot be converted to ``float``
    :raises IndexError: if a line has no label column
    """
    # The context manager closes the file even if parsing fails later.
    with open(filename, encoding='utf-8') as handle:
        stripped = (line.strip() for line in handle)
        rows = [line.split('\t') for line in stripped if line]

    feature_mat = np.zeros((len(rows), num_feature))
    label_mat = np.zeros(len(rows))
    for index, fields in enumerate(rows):
        feature_mat[index, :] = [float(value) for value in fields[:num_feature]]
        label_mat[index] = float(fields[num_feature])
    return feature_mat, label_mat
def normal_data(data_mat):
    """Min-max scale every column of ``data_mat`` into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of 0; it is scaled to
    all zeros instead of producing NaN/inf from a division by zero.

    :param data_mat: 2-D numeric array with one sample per row
    :return: tuple ``(scaled, ranges, min_value)``; ``scaled`` has the shape
        of ``data_mat``, ``ranges`` is the per-column ``max - min`` (0 for a
        constant column) and ``min_value`` is the per-column minimum
    """
    data_mat = np.asarray(data_mat)
    max_value = data_mat.max(axis=0)
    min_value = data_mat.min(axis=0)
    ranges = max_value - min_value

    # Divide constant columns by 1 so they come out as 0 rather than NaN;
    # the returned ``ranges`` still reports the true (zero) spread.
    safe_ranges = np.where(ranges == 0, 1, ranges)

    # Broadcasting applies the per-column vectors to every row.
    scaled = (data_mat - min_value) / safe_ranges
    return scaled, ranges, min_value
def class_test():
    """Run a hold-out evaluation of the classifier on ``datingTestSet1.txt``.

    The file is loaded and min-max scaled; the first 10% of the rows are
    classified (k = 3) against the remaining rows.  Each prediction is
    printed next to the true label, followed by the overall error rate.

    :return: None
    """
    hold_out_ratio = 0.10
    raw_features, labels = read_file('datingTestSet1.txt')
    scaled, _ranges, _mins = normal_data(raw_features)

    num_test = int(hold_out_ratio * scaled.shape[0])
    # Rows before num_test are the test samples, the rest are the reference set.
    reference_data = scaled[num_test:, :]
    reference_label = labels[num_test:]

    mistakes = 0
    for sample, truth in zip(scaled[:num_test], labels[:num_test]):
        predicted = class_dataset(sample, reference_data, reference_label, 3)
        print('''The classifier came back with: {}, the real answer is :{}'''
              .format(predicted, truth))
        if predicted != truth:
            mistakes += 1

    error_rate = mistakes / float(num_test)
    print('The total error rate is %2f' % error_rate)
def class_person():
    """Interactively classify one person from three features typed by the user.

    The training set is read from ``datingTestSet2.txt`` and min-max scaled.
    The three values entered at the prompts are scaled with the same
    minimum/range vectors and classified with k = 3.

    :return: a message string containing the description mapped to the
        predicted label (1, 2 or 3)
    """
    features, labels = read_file('datingTestSet2.txt')
    scaled, ranges, mins = normal_data(features)

    prompts = (
        '请输入每年获得的飞行常客里程数:',
        '请输入每天玩视频游戏所消耗时间百分比:',
        '请输入每周消费的冰淇淋公斤数:',
    )
    answers = [float(input(prompt)) for prompt in prompts]

    # Scale the query the same way the training data was scaled.
    query = (np.array(answers) - mins) / ranges
    predicted = class_dataset(query, scaled, labels, 3)

    descriptions = {
        1: 'not at all',
        2: 'in small doses',
        3: 'in large doses',
    }
    return '系统提示:你对该名男子的感觉是:{}'.format(descriptions[predicted])
def img_vector(filemane):
    """Flatten a 32x32 text image into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are read, and each
    character is converted with ``int()``.  Line ``i``, column ``j`` is
    stored at position ``32 * i + j`` of the result.

    :param filemane: path of the text file to read
    :return: float array of shape ``(1, 1024)``
    :raises IndexError: if one of the first 32 lines has fewer than 32
        characters
    :raises ValueError: if a character is not a valid integer digit
    """
    num = 32
    vector = np.zeros((1, num * num))
    # The context manager releases the file handle once the image is read.
    with open(filemane) as handle:
        for i in range(num):
            line = handle.readline()
            start = num * i
            vector[0, start:start + num] = [int(line[j]) for j in range(num)]
    return vector
def hand_writing():
    """Evaluate the classifier on the digit images stored on disk.

    Every file in ``trainingDigits`` is loaded as a 1024-element row of the
    training matrix, and its label is the integer before the first ``_`` in
    the file name.  Every file in ``testDigits`` is then classified with
    k = 3.  Each misclassified sample is printed, followed by the overall
    error rate.

    :return: None
    """
    train_files = os.listdir('trainingDigits')
    train_data = np.zeros((len(train_files), 1024))
    train_label = []
    for row, name in enumerate(train_files):
        train_data[row, :] = img_vector('trainingDigits/%s' % name)
        # File names start with "<digit>_", which encodes the true class.
        train_label.append(int(name.strip().split('_')[0]))

    test_files = os.listdir('testDigits')
    num_test = len(test_files)
    error = 0.0
    for name in test_files:
        test_data = img_vector('testDigits/%s' % name)
        test_label = int(name.strip().split('_')[0])
        ret = class_dataset(test_data, train_data, train_label, 3)
        if ret != test_label:
            error += 1
            # Only misclassified samples are reported.
            print('\nThe class result is: {}\nThe real answer is: {}'
                  .format(ret, test_label))

    print('The total error rate is:{}'.format(error / float(num_test)))
# Calling code (separate driver script that imports the module above as new_knn):
#!/usr/bin/env python
# -*- Coding:UTF-8 -*-
import new_knn
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
# Each commented-out snippet below is an independent experiment that was
# disabled; uncomment one to run it.  Only the final call is active.

# Classify a single point against the toy data set:
# group, labels = new_knn.create_dataset()
# ret = new_knn.class_dataset([2,3], group, labels,3)
# print(ret)

# Plotting: scatter plot of feature columns 1 and 2 of the dating data:
# mydata, labels = new_knn.read_file('datingTestSet2.txt')
# fig = plt.figure()
# ax = fig.add_subplot(1, 1, 1)
# ax.scatter(mydata[:, 1], mydata[:, 2], 15.0*np.array(labels), 15.0*np.array(labels))
# plt.show()

# Normalise the dating data:
# mydata, labels = new_knn.read_file('datingTestSet2.txt')
# norm_mat, ranges, min_vals = new_knn.normal_data(mydata)

# Testing: hold-out evaluation on the dating data:
# new_knn.class_test()

# Build the complete, usable system (interactive prediction):
# ret = new_knn.class_person()
# print(ret)

# Convert one digit image to a vector:
# ret = new_knn.img_vector('testDigits/0_13.txt')
# print(ret)

# Active experiment: evaluate the classifier on the handwritten digit files.
new_knn.hand_writing()