判断
物以类聚,人以群分。
1. 创建坐标系衡量
2. 计算距离来表现相似度,大致判断出主体类型
就是这样了。
不过加入一个条件,选取周边的范围。因为范围太大的话,实际上就取决于样本中数据的比例了。
汉奸接触的太君不多,良民很少,不过他还是汉奸啊。
样式
总体流程就是这样,封装了之后就只剩下逻辑拼装了,细节全屏蔽。
from sklearn import datasets
from sklearn import neighbors

# Data: the iris data set that ships with scikit-learn.
iris = datasets.load_iris()

# Model: a k-nearest-neighbours classifier built with its default arguments.
classifier = neighbors.KNeighborsClassifier()

# Training: fit the classifier on the feature rows and their targets.
classifier.fit(iris.data, iris.target)

# Testing: predict the target of a single four-feature sample.
predicted = classifier.predict([[0.1, 0.2, 0.3, 0.4]])

# Result: translate the predicted target index into its readable name.
print(iris.target_names[predicted])
自定义
还是自己动下手。
import csv
import math
from collections import Counter
from functools import reduce


# knn model
class knn_model(object):
    """Small k-nearest-neighbours classifier that votes inside a distance radius.

    Typical use: ``load_data(path)``, optionally ``boundary(radius)``, then
    ``predict(sample)``.
    """

    def __init__(self):
        """Create a model with no data loaded and no radius configured."""
        # Neighbourhood radius; None means every training row takes part.
        self.around = None
        # Parsed training data (the dict built by load_data).
        self.data = None
        # Distance table sorted by ascending distance.
        self.sort_distance = None
        # Distance table in training-row order.
        self.distance_table = None

    def load_data(self, path):
        """Read a CSV file and store its parsed content in ``self.data``.

        The first row is treated as the header. In every following row all
        columns but the last are converted to ``float`` features and the last
        column is the class label. Empty rows are skipped.

        :param path: path of the CSV file to read.
        :raises ValueError: if a feature cell cannot be converted to float.
        """
        # The context manager closes the file even if parsing fails.
        with open(path, 'r', newline='') as handle:
            rows = list(csv.reader(handle))

        header = rows[0] if rows else []
        feature = []
        target = []
        target_name = []
        # label -> index in target_name, so lookups do not rescan the list.
        label_index = {}
        for line in rows[1:]:
            if not line:
                continue
            feature.append([float(part) for part in line[:-1]])
            label = line[-1]
            if label not in label_index:
                label_index[label] = len(target_name)
                target_name.append(label)
            target.append(label_index[label])

        self.data = {
            "target": target,
            "target_name": target_name,
            "feature": feature,
            # Names come from the header row, minus the label column.
            "feature_name": header[:-1],
        }

    @staticmethod
    def distance(one, two):
        """Return the Euclidean distance between two points.

        Only the first ``len(one)`` coordinates of ``two`` are used.

        :raises IndexError: if ``two`` has fewer coordinates than ``one``.
        """
        total = 0
        for index, value in enumerate(one):
            total += (value - two[index]) ** 2
        return math.sqrt(total)

    def make_distance_table(self, test):
        """Store ``(distance, label)`` for every training row in ``self.distance_table``.

        Entries keep the order of the training rows.

        :param test: feature vector of the sample to classify.
        """
        target_name = self.data['target_name']
        self.distance_table = [
            (self.distance(train, test), target_name[label_id])
            for train, label_id in zip(self.data['feature'], self.data['target'])
        ]

    def data_filter(self):
        """Return the most frequent label among the rows inside the radius.

        Rows whose distance is ``<= self.around`` vote; when ``self.around``
        is ``None`` every row votes. Ties go to the label whose first vote
        comes from the nearest row.

        :raises ValueError: if no training row lies inside the radius.
        """
        self.sort_distance = sorted(self.distance_table, key=lambda x: x[0])
        # Count labels only; counting (distance, label) pairs would make
        # nearly every entry unique and defeat the majority vote.
        votes = Counter(
            label
            for dis, label in self.sort_distance
            if self.around is None or dis <= self.around
        )
        if not votes:
            raise ValueError(
                'no training data within boundary {}'.format(self.around))
        # max() returns the first maximal key, i.e. the earliest-seen label.
        return max(votes, key=votes.get)

    def boundary(self, boundary):
        """Set the neighbourhood radius and return ``self`` for chaining.

        :param boundary: maximum distance of a voting row, or ``None`` to
            compare against all data.
        """
        self.around = boundary
        return self

    def predict(self, test):
        """Classify ``test`` and return the predicted label.

        :param test: feature vector of the sample to classify.
        """
        self.make_distance_table(test)
        return self.data_filter()


def example():
    """Show how to use the model."""
    # build a model
    model = knn_model()
    # load the data
    model.load_data('knn.csv')
    # set the boundary
    model.boundary(10)
    # test the data
    result = model.predict([5.1, 3.5, 1.4, 0.2])
    # you can do this as : model.boundary(value).predict(data)
    print(result)


if __name__ == '__main__':
    # run example
    example()
数据还是iris,不想整理的话这里有:https://download.csdn.net/download/wait_for_eva/10366588
可以的话自己从datasets.load_iris中整理出来,更多锻炼。
而且没有零分资源了,自己动手最好了。
import csv
import math
from collections import Counter
from functools import reduce
from random import choice


# knn model
class knn_model(object):
    """Small k-nearest-neighbours classifier with selectable filtering and voting.

    Neighbours are selected either as the N nearest rows (``'top'``) or as
    all rows inside a distance (``'distance'``). The result is either the
    most frequent label (``'normal'``) or a random label weighted by its
    frequency (``'weight'``).
    """

    def __init__(self):
        """Create a model using the 10 nearest rows and majority voting."""
        # How neighbours are selected: 'top' or 'distance'.
        self.filter_type = "top"
        # Number of rows for 'top', maximum distance for 'distance'.
        self.filter_boundary = 10
        # Parsed training data (the dict built by load_data).
        self.data = None
        # Distance table sorted by ascending distance.
        self.sort_distance = None
        # Distance table in training-row order.
        self.distance_table = None
        # How the result is picked: 'normal' or 'weight'.
        self.filter_model = "normal"
        # label -> number of selected neighbours carrying that label.
        self.boundary_count = None

    def load_data(self, path):
        """Read a CSV file and store its parsed content in ``self.data``.

        The first row is treated as the header. In every following row all
        columns but the last are converted to ``float`` features and the last
        column is the class label. Empty rows are skipped.

        :param path: path of the CSV file to read.
        :raises ValueError: if a feature cell cannot be converted to float.
        """
        # The context manager closes the file even if parsing fails.
        with open(path, 'r', newline='') as handle:
            rows = list(csv.reader(handle))

        header = rows[0] if rows else []
        feature = []
        target = []
        target_name = []
        # label -> index in target_name, so lookups do not rescan the list.
        label_index = {}
        for line in rows[1:]:
            if not line:
                continue
            feature.append([float(part) for part in line[:-1]])
            label = line[-1]
            if label not in label_index:
                label_index[label] = len(target_name)
                target_name.append(label)
            target.append(label_index[label])

        self.data = {
            "target": target,
            "target_name": target_name,
            "feature": feature,
            # Names come from the header row, minus the label column.
            "feature_name": header[:-1],
        }

    @staticmethod
    def distance(one, two):
        """Return the Euclidean distance between two points.

        Only the first ``len(one)`` coordinates of ``two`` are used.

        :raises IndexError: if ``two`` has fewer coordinates than ``one``.
        """
        total = 0
        for index, value in enumerate(one):
            total += (value - two[index]) ** 2
        return math.sqrt(total)

    def make_distance_table(self, test):
        """Store ``(distance, label)`` for every training row in ``self.distance_table``.

        Entries keep the order of the training rows.

        :param test: feature vector of the sample to classify.
        """
        target_name = self.data['target_name']
        self.distance_table = [
            (self.distance(train, test), target_name[label_id])
            for train, label_id in zip(self.data['feature'], self.data['target'])
        ]

    def data_filter(self):
        """Select the neighbours, count their labels and pick the result.

        The label counts are kept in ``self.boundary_count``.

        :raises ValueError: if the configured filter selects no row.
        """
        self.sort_distance = sorted(self.distance_table, key=lambda x: x[0])
        # A real conditional: `a and b or c` would silently fall through to
        # the other filter whenever the chosen one returned an empty list.
        if self.filter_type == 'top':
            filter_data = self.top_filter()
        else:
            filter_data = self.distance_filter()
        if not filter_data:
            raise ValueError(
                'no training data selected by filter {} with boundary {}'.format(
                    self.filter_type, self.filter_boundary))
        self.boundary_count = dict(Counter(filter_data))
        if self.filter_model == "normal":
            return self.normal_choose()
        return self.weight_choose()

    def top_filter(self):
        """Return the labels of the ``filter_boundary`` nearest rows."""
        return [x[1] for x in self.sort_distance[:self.filter_boundary]]

    def distance_filter(self):
        """Return the labels of all rows whose distance is ``<= filter_boundary``."""
        return [x[1] for x in self.distance_table if x[0] <= self.filter_boundary]

    def filter_by(self, filter_type, filter_boundary):
        """Choose how neighbours are selected and return ``self`` for chaining.

        :param filter_type: ``'top'`` or ``'distance'``.
        :param filter_boundary: row count for ``'top'``, maximum distance for
            ``'distance'``.
        :raises ValueError: if ``filter_type`` is not a supported value.
        """
        if filter_type not in ('top', 'distance'):
            raise ValueError(
                'filter_type {} was not supported'.format(filter_type))
        self.filter_type = filter_type
        self.filter_boundary = filter_boundary
        return self

    def normal_choose(self):
        """Return the label with the highest count in ``self.boundary_count``.

        On a tie the label that was counted first wins.
        """
        # max() returns the first maximal key, matching the `>=` tie-break of
        # the former reduce() without misbehaving on falsy labels.
        return max(self.boundary_count, key=self.boundary_count.get)

    def weight_choose(self):
        """Return a random label, each label weighted by its count."""
        result_pool = [
            label
            for label, weight in self.boundary_count.items()
            for _ in range(weight)
        ]
        return choice(result_pool)

    def choose_mode(self, model):
        """Choose how the result is picked and return ``self`` for chaining.

        :param model: ``'normal'`` (most frequent label) or ``'weight'``
            (random label weighted by frequency).
        :raises ValueError: if ``model`` is not a supported value.
        """
        if model not in ('normal', 'weight'):
            raise ValueError('model {} was not supported'.format(model))
        self.filter_model = model
        return self

    def boundary(self, boundary):
        """Set ``filter_boundary`` and return ``self`` for chaining.

        :param boundary: row count for the ``'top'`` filter, maximum distance
            for the ``'distance'`` filter.
        """
        self.filter_boundary = boundary
        return self

    def predict(self, test):
        """Classify ``test`` and return the predicted label.

        :param test: feature vector of the sample to classify.
        """
        self.make_distance_table(test)
        return self.data_filter()


def example():
    """Show how to use the model."""
    # build a model
    model = knn_model()
    # load the data
    model.load_data('knn.csv')
    # set the boundary
    model.boundary(10)
    # model.choose_mode("weight")
    # test the data
    result = model.predict([5.1, 3.5, 1.4, 0.2])
    # you can do this as : model.boundary(value).predict(data)
    # choose a result mode : model.boundary(value).choose_mode(mode).predict(data)
    # choose a filter : model.filter_by(filter_type, value).predict(data)
    print(result)


if __name__ == '__main__':
    # run example
    example()
出于两点考虑,在代码中加入了数据筛选和结果选择两部分
1. 距离把控
2. 结果随机
对于距离的长短没有一个准确的把握的话,容易造成参数设定的笑话。
如果数据集中的最长距离是5,如果设置成了10,那选取的对比的数据集就是整体数据,所谓部分判定也就不存在了。
相对的,如果最小距离是10,你设置成了5,周围根本没有参考的点,这样就毫无意义。
所以多了一个数据过滤模式filter_type,默认情况下选择的是top,filter_boundary表示取最近的几个点。
想要自己选择的话选择distance,filter_boundary表示距离的范围。
对于结果而言,结果集中经常不止一个结果。如果只是单凭数量来进行判断,那些比重较小的可能就变成完全没可能了。
同时,如果数量都一致的话,先出现的那个必定胜出,后面的候选死的毫无声息。
因此添加了按照权重进行结果输出的选择方式。
前面两点弥补了少许的缺陷,同时,参数设定返回self,这样就可以进行链式编程,简化代码了----用的时候。