数据分类
对未知类别属性的数据集合中的每个点依次执行以下操作:
1. 计算已知类别数据集中的点与当前点之间的距离;
2. 按照距离递增依次排序;
3. 选取与当前点距离最小的k个点;
4. 确定前k个点所在类别出现的频率;
5. 返回前k个点出现频率最高的类别作为当前点的预测分类;
def classify0(in_x, data_set, labels, k):
    # k-nearest-neighbour vote: return the label that occurs most often among
    # the k rows of data_set closest (Euclidean distance) to in_x.
    row_total = data_set.shape[0]
    # Repeat the query vector once per training row, then subtract row-wise.
    offsets = tile(in_x, (row_total, 1)) - data_set
    squared_sums = (offsets ** 2).sum(axis=1)
    distances = squared_sums ** 0.5
    order = distances.argsort()  # indices of the rows, nearest first
    tally = {}
    for position in range(k):
        neighbour_label = labels[order[position]]
        tally[neighbour_label] = tally.get(neighbour_label, 0) + 1
    # Descending by vote count; the sort is stable, so on a tie the label
    # that was encountered first stays in front.
    by_votes = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    winner = by_votes[0]
    return winner[0]
in_x :用于分类的输入向量
data_set :输入的训练样本集
labels :标签向量
.sum(axis=1)
:对于二维数组,axis=1表示按行相加 , axis=0表示按列相加
tile(A,rep)
:重复A的各个维度
A: Array类的都可以
rep:A沿着各个维度重复的次数
argsort()函数是将x中的元素从小到大排列,提取其对应的index(索引),然后输出到y
D.get(key[,default=None])
key -- 字典中要查找的键
default -- 可选参数,如果指定键的值不存在时,返回该值,默认为 None。
sort 与 sorted 区别:sort 是应用在 list 上的方法,sorted 可以对所有可迭代的对象进行排序操作。
list 的 sort 方法返回的是对已经存在的列表进行操作,无返回值,而内建函数 sorted 方法返回的是一个新的 list,而不是在原来的基础上进行的操作。
sorted(iterable[, cmp[, key[, reverse]]])(这是 Python 2 的签名;Python 3 中为 sorted(iterable, *, key=None, reverse=False),cmp 参数已被移除)
iterable – 可迭代对象
cmp – 比较的函数,这个具有两个参数,参数的值都是从可迭代对象中取出,此函数必须遵守的规则为,大于则返回1,小于则返回-1,等于则返回0
key – 主要是用来进行比较的元素,只有一个参数,具体的函数的参数就是取自于可迭代对象中,指定可迭代对象中的一个元素来进行排序。
reverse – 排序规则,reverse = True 降序 , reverse = False 升序(默认)
operator模块提供的itemgetter函数用于获取对象的哪些维的数据,参数为一些序号
字典(Dictionary) items() 函数返回可遍历的(键, 值) 元组序列(Python 2 中返回列表,Python 3 中返回视图对象)
from numpy import *
import operator
import os
import sys
def creat_data_set():
    """Return a tiny hard-coded sample set for trying out the classifier.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 array of points and
        ``labels`` is the list of class labels, one per row of ``group``.
    """
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [label for _, label in samples]
    return group, labels
def classify0(in_x, data_set, labels, k):
    """Classify ``in_x`` by majority vote among its ``k`` nearest neighbours.

    The distance used is the Euclidean distance between ``in_x`` and every
    row of ``data_set``.

    Args:
        in_x: The vector to classify.
        data_set: Array of known samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``data_set``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The label with the highest vote count among the ``k`` nearest rows.
    """
    n_samples = data_set.shape[0]

    # Replicate the query once per training row so the subtraction is
    # element-wise against the whole data set.
    deltas = tile(in_x, (n_samples, 1)) - data_set
    distances = ((deltas ** 2).sum(axis=1)) ** 0.5
    nearest_first = distances.argsort()

    votes = {}
    for rank in range(k):
        label = labels[nearest_first[rank]]
        votes[label] = votes.get(label, 0) + 1

    # Stable descending sort by count: on a tie the label that entered the
    # tally first (i.e. came from the nearer neighbour) is returned.
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    top_label, _top_votes = ranked[0]
    return top_label
def file_to_matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is split on tab characters. The first three fields
    become one row of the feature matrix and the last field is parsed as an
    integer class label. Blank lines are skipped.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(return_mat, class_label_vector)`` tuple: ``return_mat`` is a
        float array with one row per record and three columns, and
        ``class_label_vector`` is the list of integer labels in file order.

    Raises:
        ValueError: If a feature field is not a valid float, the label field
            is not a valid integer, or a record has exactly two fields.
    """
    # Read the file a single time and let the context manager close it; the
    # matrix is sized from the records that actually carry data.
    with open(filename) as fr:
        stripped_lines = [line.strip() for line in fr]
    records = [record for record in stripped_lines if record]

    return_mat = zeros((len(records), 3))
    class_label_vector = []
    for index, record in enumerate(records):
        list_from_line = record.split('\t')
        return_mat[index, :] = [float(value) for value in list_from_line[0:3]]
        class_label_vector.append(int(list_from_line[-1]))
    return return_mat, class_label_vector
def auto_norm(data_set):
    """Rescale every column of ``data_set`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column is
    mapped to 0.0 instead of producing NaN from a division by zero.

    Args:
        data_set: 2-D array with one sample per row.

    Returns:
        A ``(norm_data_set, ranges, min_val)`` tuple: the normalised array,
        the per-column ``max - min`` (left as 0 for constant columns), and
        the per-column minimum.
    """
    min_val = data_set.min(0)
    max_val = data_set.max(0)
    ranges = max_val - min_val
    # Divide constant columns by 1 so they normalise to 0.0 rather than NaN;
    # the returned ``ranges`` still reports the true (zero) range.
    safe_ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row.
    norm_data_set = (data_set - min_val) / safe_ranges
    return norm_data_set, ranges, min_val
def dating_class_test():
    """Measure the classifier's error rate on a hold-out slice of the dating data.

    Loads ``datingTestSet2.txt``, normalises it, classifies the first 10% of
    the rows against the remaining rows with k = 3, prints each prediction
    next to the real label, and finally prints the overall error rate.
    """
    ho_ratio = 0.10
    raw_features, dating_labels = file_to_matrix('datingTestSet2.txt')
    norm_mat, _ranges, _min_val = auto_norm(raw_features)

    total_rows = norm_mat.shape[0]
    num_test_vecs = int(total_rows * ho_ratio)

    # Everything after the hold-out slice serves as the reference set.
    reference_rows = norm_mat[num_test_vecs:total_rows, :]
    reference_labels = dating_labels[num_test_vecs:total_rows]

    error_count = 0.0
    for i in range(num_test_vecs):
        expected = dating_labels[i]
        classifier_result = classify0(norm_mat[i, :], reference_rows, reference_labels, 3)
        print('the classifier came back with: %d, the real answer is: %d'%(classifier_result, expected))
        if not classifier_result == expected:
            error_count += 1.0
    print('the total error rate is : %f'%(error_count/float(num_test_vecs)))
def classify_person():
    """Ask for three feature values on stdin and print the predicted category.

    Loads and normalises ``datingTestSet2.txt``, scales the entered values
    with the same minimum and range, classifies them with k = 3 and prints
    the matching entry of ``result_list``.
    """
    result_list = ['not at all', 'in small doses', 'in large doses']
    tats = float(input('Time in video games: '))
    miles = float(input('frequent filer miles: '))
    cream = float(input('liters of ice cream: '))
    dating_data_mat, dating_labels = file_to_matrix('datingTestSet2.txt')
    norm_mat, ranges, min_val = auto_norm(dating_data_mat)
    # NOTE(review): this ordering presumably matches the column order of the
    # data file -- confirm against datingTestSet2.txt.
    in_arr = array([miles, tats, cream])
    classifier_result = classify0((in_arr - min_val) / ranges, norm_mat, dating_labels, 3)
    # NOTE(review): assumes the labels in datingTestSet2.txt are 1, 2, 3 --
    # confirm against the data file. The list is 0-based, so shift by one;
    # indexing with the raw label picks the wrong text and raises IndexError
    # for label 3.
    print('Classifier result : ', result_list[classifier_result - 1])
def img_to_vector(filename):
    """Read a 32x32 grid of digit characters into a 1x1024 row vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row, so the character at line ``i``, column
    ``j`` lands at position ``32 * i + j``.

    Args:
        filename: Path of the text file holding the image.

    Returns:
        A float array of shape ``(1, 1024)``.

    Raises:
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character is not a decimal digit.
    """
    return_vector = zeros((1, 1024))
    # The context manager closes the handle even if parsing fails; this
    # function is called once per image file, so leaked handles add up.
    with open(filename) as fr:
        for i in range(32):
            line_str = fr.readline()
            for j in range(32):
                return_vector[0, 32 * i + j] = int(line_str[j])
    return return_vector
def hand_writing_class_test():
    """Run the classifier on the handwritten-digit files and report errors.

    Every file in ``trainingDigits`` becomes one row of the training matrix;
    its label is the integer before the first underscore of the file name.
    Each file in ``testDigits`` is then classified with k = 3, the prediction
    is printed next to the label taken from its file name, and the error
    count and error rate are printed at the end.
    """
    traing_file_list = os.listdir('trainingDigits')
    m = len(traing_file_list)
    training_mat = zeros((m, 1024))
    hand_writing_labels = []
    for row, file_name_str in enumerate(traing_file_list):
        # File names look like "<label>_<rest>.<ext>"; keep the label part.
        stem = file_name_str.split('.')[0]
        hand_writing_labels.append(int(stem.split('_')[0]))
        training_mat[row, :] = img_to_vector('trainingDigits/{}'.format(file_name_str))

    test_file_list = os.listdir('testDigits')
    mtest = len(test_file_list)
    error_count = 0.0
    for file_name_str in test_file_list:
        stem = file_name_str.split('.')[0]
        expected = int(stem.split('_')[0])
        vector_under_test = img_to_vector('testDigits/{}'.format(file_name_str))
        classifier_result = classify0(vector_under_test, training_mat, hand_writing_labels, 3)
        print('thr classifier came back is %d, the real answer is %d'%(classifier_result, expected))
        if not classifier_result == expected:
            error_count += 1.0
    print('\nThe total number of errors is %d '%error_count)
    print('\nThe total error rate is %f'%(error_count/float(mtest)))
if __name__ == "__main__":
    # Script entry point: runs the handwritten-digit evaluation.
    # dating_class_test() and classify_person() are the other runnable
    # entry points in this file; call them here instead to try them.
    hand_writing_class_test()