#outs:
D:\Anaconda3\envs\pcd\python.exe D:/机器学习/01K_Nearest.py
['爱情片']
Process finished with exit code 0
import pandas as pd
import numpy as np
# 1. Build the data set: six films, each described by its number of fight
#    scenes and kissing scenes, together with its known genre.
rowdata = {
    '电影名称': ['无问西东', '后来的我们', '前任3', '红海行动', '唐人街探案', '战狼'],
    '打斗镜头': [1, 5, 12, 108, 112, 115],
    '接吻镜头': [101, 89, 97, 5, 9, 8],
    '电影类型': ['爱情片', '爱情片', '爱情片', '动作片', '动作片', '动作片'],
}
movie_data = pd.DataFrame(rowdata)

# 2. Compute the Euclidean distance from the new sample to every film.
new_data = [24, 67]
feature_cols = movie_data[['打斗镜头', '接吻镜头']]
squared_diff = (feature_cols - new_data) ** 2
dist = squared_diff.sum(axis=1) ** 0.5

# 3. Sort the distances in ascending order and keep the k closest films.
k = 4
labelled_dist = pd.DataFrame({'dist': dist, 'labels': movie_data['电影类型']})
nearest = labelled_dist.sort_values(by='dist').iloc[:k]

# Count how often each genre occurs among the k closest films; the most
# frequent one is the prediction.
label_counts = nearest['labels'].value_counts()
result = [label_counts.index[0]]

# 4. Wrap the steps above in a function: classify0(inX, dataSet, k)
#    Purpose: KNN classifier
#    Parameters:
#      inX:     the sample to classify
#      dataSet: data with known labels (the training set)
#      k:       KNN parameter, the number of closest points to select
#    Returns:
#      result:  the classification result
def classify0(inX, dataSet, k):
    """Classify one sample by majority vote among its k nearest neighbours.

    Args:
        inX: Feature values of the sample to classify, given in the same
            order as the two feature columns of ``dataSet``.
        dataSet: Labelled training DataFrame. Column 0 is not used,
            columns 1 and 2 hold the numeric features and column 3 holds
            the class label.
        k: Number of closest training rows that take part in the vote.

    Returns:
        A one-element list containing the most frequent label among the
        ``k`` training rows closest to ``inX``.
    """
    # Use the arguments rather than module-level data, and use every row of
    # the training set instead of a fixed number of rows.
    features = dataSet.iloc[:, 1:3]
    labels = dataSet.iloc[:, 3]

    # Euclidean distance between inX and each training row.
    dist = ((features - inX) ** 2).sum(1) ** 0.5

    # Keep the k rows with the smallest distance.
    neighbours = pd.DataFrame({'dist': dist, 'labels': labels})
    nearest = neighbours.sort_values(by='dist')[:k]

    # value_counts() sorts by frequency, so index[0] is the winning label.
    votes = nearest.loc[:, 'labels'].value_counts()
    result = [votes.index[0]]
    return result
if __name__ == '__main__':
    # Classify the sample point against the movie data set using the
    # 4 nearest neighbours and print the predicted label list.
    inX = new_data
    dataSet = movie_data
    k = 4
    result = classify0(inX, dataSet, k)
    print(result)
约会网站 KNN分类
散点图数据分析
代码片
# This snippet uses pyplot before the file's own matplotlib import, so bring
# it into scope here.
import matplotlib.pyplot as plt

# 1. Load the data set (no header row in the file).
datingTest = pd.read_table('dataset/datingTestSet.txt', header=None)
datingTest.head()  # result is only shown in an interactive session

# 2. Explore the data: choose one marker colour per row from the label in
#    the last column. Rows with any other label get no colour entry.
label_colors = {
    'didntLike': 'black',
    'smallDoses': 'orange',
    'largeDoses': 'red',
}
Colors = [label_colors[m] for m in datingTest.iloc[:, -1] if m in label_colors]

# Draw pairwise scatter plots of the three features.
plt.rcParams['font.sans-serif'] = ['Simhei']  # use the SimHei font in figures
pl = plt.figure(figsize=(12, 8))

fig1 = pl.add_subplot(221)
plt.scatter(datingTest.iloc[:, 1], datingTest.iloc[:, 2], marker='.', c=Colors)
plt.xlabel('玩游戏视频所占时间比')
plt.ylabel('每周消费冰淇淋的公升数')

fig2 = pl.add_subplot(222)
plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 1], marker='.', c=Colors)
plt.xlabel('每年飞行常客里程')
plt.ylabel('玩游戏视频所占时间比')

fig3 = pl.add_subplot(223)
plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 2], marker='.', c=Colors)
plt.xlabel('每年飞行常客里程')
plt.ylabel('每周消费冰淇淋的公升数')
全部代码
D:\Anaconda3\envs\pcd\python.exe D:/机器学习/01K_Nearest.py
模型预测准确率为0.92
D:/机器学习/01K_Nearest.py:26: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]= value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
test['predict']= result
Process finished with exit code 0
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
def datingClass(train, test, k):
    """Predict a label for every test row with k-nearest-neighbours voting.

    Both frames are expected to hold the feature columns first and the
    class label in the last column, with matching column labels.

    Args:
        train: Labelled training DataFrame.
        test: DataFrame to classify; its last column holds the true label
            and is used to measure accuracy. It is not modified.
        k: Number of closest training rows that take part in each vote.

    Returns:
        A copy of ``test`` with an extra ``'predict'`` column holding the
        predicted label of each row. The accuracy is printed as a side
        effect.
    """
    n = train.shape[1] - 1
    train_features = train.iloc[:, :n]
    train_labels = train.iloc[:, n]
    test_features = test.iloc[:, :n]

    result = []
    for i in range(test.shape[0]):
        # Euclidean distance from test row i to every training row.
        dist = (((train_features - test_features.iloc[i]) ** 2).sum(1)) ** 0.5
        dist_l = pd.DataFrame({'dist': dist.to_numpy(), 'labels': train_labels})
        nearest = dist_l.sort_values(by='dist')[:k]  # the k closest rows
        votes = nearest.loc[:, 'labels'].value_counts()  # count per label
        result.append(votes.index[0])  # most frequent label wins

    # Work on a copy so the caller's frame (possibly a slice of a larger
    # frame) is not written to, and assign the plain list so predictions are
    # matched by position rather than by index label.
    test = test.copy()
    test['predict'] = result

    # Last column is the prediction, the one before it the true label.
    acc = (test.iloc[:, -1] == test.iloc[:, -2]).mean()
    print(f'模型预测准确率为{acc}')
    return test
def minmax(dataSet):
    """Rescale the data to the range [0, 1] with min-max normalisation.

    For a DataFrame each column is rescaled independently as
    ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: Numeric DataFrame (or Series) to normalise.

    Returns:
        An object of the same shape as ``dataSet`` holding the rescaled
        values. A column whose values are all equal maps to 0 instead of
        NaN.
    """
    minDf = dataSet.min()
    maxDf = dataSet.max()
    span = maxDf - minDf

    # A constant column has a zero range; divide by 1 instead so the result
    # is 0 rather than 0/0 = NaN.
    if isinstance(span, pd.Series):
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1

    normSet = (dataSet - minDf) / span
    return normSet
def randSplit(dataset, rate=0.9):
    """Split a DataFrame into a leading training part and a trailing test part.

    The rows are taken in their existing order; no shuffling is performed.

    Args:
        dataset: DataFrame to split.
        rate: Fraction of rows (from the top) that go into the training set.

    Returns:
        A ``(train, test)`` tuple. ``train`` holds the first
        ``int(n * rate)`` rows with their original index; ``test`` holds the
        remaining rows, re-indexed from 0. Both are independent copies.
    """
    n = dataset.shape[0]
    m = int(n * rate)
    # Copy the slices so that later column assignments on the results do
    # not target a slice of ``dataset``.
    train = dataset.iloc[:m, :].copy()
    test = dataset.iloc[m:, :].reset_index(drop=True).copy()
    return train, test
if __name__ == '__main__':
    # 1. Load the data set (no header row in the file).
    datingTest = pd.read_table('dataset/datingTestSet.txt', header=None)
    datingTest.head()  # result is only shown in an interactive session

    # 2. Explore the data: choose one marker colour per row from the label
    #    in the last column. Rows with any other label get no colour entry.
    label_colors = {
        'didntLike': 'black',
        'smallDoses': 'orange',
        'largeDoses': 'red',
    }
    Colors = [label_colors[m] for m in datingTest.iloc[:, -1] if m in label_colors]

    # Draw pairwise scatter plots of the three features.
    plt.rcParams['font.sans-serif'] = ['Simhei']  # use the SimHei font in figures
    pl = plt.figure(figsize=(12, 8))

    fig1 = pl.add_subplot(221)
    plt.scatter(datingTest.iloc[:, 1], datingTest.iloc[:, 2], marker='.', c=Colors)
    plt.xlabel('玩游戏视频所占时间比')
    plt.ylabel('每周消费冰淇淋的公升数')

    fig2 = pl.add_subplot(222)
    plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 1], marker='.', c=Colors)
    plt.xlabel('每年飞行常客里程')
    plt.ylabel('玩游戏视频所占时间比')

    fig3 = pl.add_subplot(223)
    plt.scatter(datingTest.iloc[:, 0], datingTest.iloc[:, 2], marker='.', c=Colors)
    plt.xlabel('每年飞行常客里程')
    plt.ylabel('每周消费冰淇淋的公升数')
    plt.show()

    # 3. Normalise the three feature columns so that no single feature
    #    dominates the distance, then re-attach the label column.
    datingT = pd.concat(
        [minmax(datingTest.iloc[:, :3]), datingTest.iloc[:, 3]],
        axis=1,
    )

    # First 90% of the rows for training, the remaining 10% for testing.
    train, test = randSplit(datingT)
    datingClass(train=train, test=test, k=4)