def classify0(inX, dataSet, labels, k):
    """Classify one sample with the k-nearest-neighbors rule.

    Parameters
    ----------
    inX : array-like, shape (n_features,) — the sample to classify.
    dataSet : ndarray, shape (n_samples, n_features) — training features.
    labels : sequence of length n_samples — training labels; each entry may
        be a scalar or a length-1 array/row (a (n,1) column vector works).
    k : int — number of neighbors to vote.

    Returns
    -------
    The majority label among the k nearest neighbors (ties broken by
    first-seen order of `sorted`, as in the original implementation).
    """
    dataSetSize = dataSet.shape[0]
    # Euclidean distance from inX to every training row.
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        # np.ravel(...)[0] extracts a scalar label whether `labels` holds
        # plain scalars or length-1 rows — replaces the brittle
        # `voteIlabel[0]` indexing that crashed on 1-D label arrays.
        voteIlabel = np.ravel(labels[sortedDistIndicies[i]])[0]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def autoNorm(dataSet):
    """Min-max normalize every column of `dataSet` into [0, 1].

    Parameters
    ----------
    dataSet : ndarray, shape (n_samples, n_features).

    Returns
    -------
    (normDataSet, ranges, minVals) — the normalized array, the per-column
    range (max - min, unmodified), and the per-column minimum.
    """
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    # Guard constant columns (range == 0) so we don't divide by zero and
    # produce NaNs; such columns normalize to all zeros. The returned
    # `ranges` keeps the true (possibly zero) values.
    safeRanges = np.where(ranges == 0, 1, ranges)
    # Broadcasting replaces the original np.tile copies (the original also
    # allocated an np.zeros array that was immediately overwritten).
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals
def TrafficClassTest():
    """Evaluate the kNN classifier on the held-out split and print the error rate.

    Reads module globals `data_xnew`/`data_ynew` (training features/labels)
    and `Data_xnew`/`Data_ynew` (test features/labels).

    Bug fix: the original normalized the test set with its OWN min/max
    (`autoNorm(Data_xnew)`), so train and test features lived on different
    scales. The test set is now scaled with the TRAINING set's
    minVals/ranges, as min-max normalization requires.
    """
    normMat, ranges, minVals = autoNorm(data_xnew)
    # Apply the training normalization parameters to the test features.
    normMat1 = (Data_xnew - minVals) / ranges
    numTestVecs = Data_ynew.shape[0]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat1[i, :], normMat, data_ynew, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, Data_ynew[i]))
        if classifierResult != Data_ynew[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
表1 识别率分布图（混淆矩阵：行为真实标签，列为预测标签，各行比例之和为 1）

| 标签\预测 | 1:步行 | 2:自行车 | 3:公交车 | 4:小汽车 | 求和 |
|-----------|--------|----------|----------|----------|--------|
| 1:步行    | 0.9767 | 0.0021   | 0.0163   | 0.0049   | 1.0000 |
| 2:自行车  | 0.0024 | 0.9904   | 0.0072   | 0.0000   | 1.0000 |
| 3:公交车  | 0.0230 | 0.0000   | 0.9641   | 0.0129   | 1.0000 |
| 4:小汽车  | 0.0138 | 0.0000   | 0.0453   | 0.9409   | 1.0000 |
附录程序源代码:
# -*- coding=utf-8 -*-
## single exponential smoothing used for prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import operator


def flat(l):
    """Yield the leaves of an arbitrarily nested list/tuple, in order."""
    for k in l:
        if not isinstance(k, (list, tuple)):
            yield k
        else:
            yield from flat(k)


SCALE = 60  # half-window length; a full rolling window spans 2*SCALE samples

data_x = []
data_y = []  # training set
Data_x = []
Data_y = []  # test set


def choose_file(filenumber):
    """Read train<filenumber>.xlsx and append one windowed feature row
    per sample to the module-level data_x / data_y lists.

    Each feature row is [instant speed, firstplant flag, window mean,
    window max, window std]; the label row is the sample's `means` value.
    """
    traindata = pd.read_excel('train%s.xlsx' % filenumber)
    tempspeed = list(flat([list(traindata['speed'])]))
    tempmeans = list(flat([list(traindata['means'])]))
    tempfirstplant = list(flat([list(traindata['firstplant'])]))
    n = len(tempspeed)

    def record(idx, window):
        # Build one feature/label pair from a speed window around `idx`.
        speed = np.array(window)
        data_x.append([tempspeed[idx], tempfirstplant[idx],
                       speed.mean(), speed.max(), speed.std()])
        data_y.append([tempmeans[idx]])

    # Head of the file: window truncated on the left.
    for i in range(SCALE):
        record(i, tempspeed[0:i + SCALE])
    # Middle: full 2*SCALE window centered on i.
    for i in range(SCALE, n - SCALE):
        record(i, tempspeed[i - SCALE:i + SCALE])
    # Tail: window truncated on the right.
    for i in range(n - SCALE, n):
        record(i, tempspeed[i - SCALE:n])
    return data_x, data_y  # accumulated training x and y


## read the training files of the travelling individuals
l = [1, 2, 3, 4, 5, 6]
for i in l:
    choose_file(i)
data_y = np.array(data_y)

import random

a = range(len(data_x))
b = random.sample(a, int(0.8 * len(data_x)))  # 80% of indices -> training
c = list(set(a).difference(set(b)))           # remaining 20% -> testing
data_xnew = []
data_ynew = []
Data_xnew = []
Data_ynew = []
# Materialize the random 80/20 split chosen above (index lists b and c).
for i in b:
    data_xnew.append(data_x[i])
    data_ynew.append(data_y[i])
for i in c:
    Data_xnew.append(data_x[i])
    Data_ynew.append(data_y[i])
data_xnew = np.array(data_xnew)
data_ynew = np.array(data_ynew)
Data_xnew = np.array(Data_xnew)
Data_ynew = np.array(Data_ynew)


def classify0(inX, dataSet, labels, k):
    """Classify one sample by majority vote of its k nearest neighbors
    (Euclidean distance). `labels` rows may be scalars or length-1 arrays.
    """
    dataSetSize = dataSet.shape[0]
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        # Extract a scalar label whether labels is 1-D or an (n,1) column —
        # replaces the brittle voteIlabel[0] indexing.
        voteIlabel = np.ravel(labels[sortedDistIndicies[i]])[0]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def autoNorm(dataSet):
    """Min-max normalize each column to [0,1]; returns (norm, ranges, minVals)."""
    minVals = dataSet.min(axis=0)
    maxVals = dataSet.max(axis=0)
    ranges = maxVals - minVals
    # Broadcasting replaces np.tile; the original also built an unused
    # np.zeros array that was immediately overwritten.
    normDataSet = (dataSet - minVals) / ranges  # element-wise divide
    return normDataSet, ranges, minVals


CLASSFIERRESULT = []  # predictions for every test sample, in order


def TrafficClassTest():
    """Run the classifier over the test split, record predictions in
    CLASSFIERRESULT, and print the overall error rate.

    Bug fix: the test features are now normalized with the TRAINING set's
    minVals/ranges (the original re-derived min/max from the test set
    itself, putting the two sets on inconsistent scales).
    """
    normMat, ranges, minVals = autoNorm(data_xnew)
    normMat1 = (Data_xnew - minVals) / ranges  # training-stat normalization
    numTestVecs = Data_ynew.shape[0]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat1[i, :], normMat, data_ynew, 3)
        CLASSFIERRESULT.append(classifierResult)
        #print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, Data_ynew[i]))
        if classifierResult != Data_ynew[i]:
            errorCount += 1.0
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount, numTestVecs)


TrafficClassTest()
CLASSFIERRESULT = np.array(CLASSFIERRESULT)
# Data_ynew holds the true labels; CLASSFIERRESULT holds the predictions.
# Build the per-class confusion-matrix rows (fraction of each true class
# predicted as 1:walk, 2:bike, 3:bus, 4:car). The original repeated the
# same counting loop four times; it is factored into one helper.
def _row_rates(true_label):
    """Return the fractions of samples with this true label that were
    predicted as class 1, 2, 3 and 4 respectively."""
    idx = np.argwhere(Data_ynew == true_label)[:, 0]
    total = idx.shape[0]
    counts = [0, 0, 0, 0]
    for o in idx:
        p = CLASSFIERRESULT[o]
        if p in (1, 2, 3, 4):
            counts[int(p) - 1] += 1
        # predictions outside 1..4 are ignored, as in the original
    return tuple(cnt / total for cnt in counts)


w_w, w_b, w_s, w_c = _row_rates(1)  # true walk
b_w, b_b, b_s, b_c = _row_rates(2)  # true bike
s_w, s_b, s_s, s_c = _row_rates(3)  # true bus
c_w, c_b, c_s, c_c = _row_rates(4)  # true car
print('line1', w_w, w_b, w_s, w_c)
print('line2', b_w, b_b, b_s, b_c)
print('line3', s_w, s_b, s_s, s_c)
print('line4', c_w, c_b, c_s, c_c)