前几天做数学建模时用到了马氏距离,参考了网上的代码,最终成功实现。虽然建模效果不佳,但还是先把代码放出来,方便以后查找。
import re
import linecache
import numpy as np
def merge(file1, file2, file3, file4):
    """Concatenate three UTF-8 text files into a fourth file.

    Args:
        file1: Path of the first input file.
        file2: Path of the second input file.
        file3: Path of the third input file.
        file4: Path of the output file; it is created or overwritten.

    The inputs are read in the order given and written back to back,
    with no separator inserted between them.
    """
    parts = []
    for path in (file1, file2, file3):
        # A context manager closes each input even if read() raises.
        with open(path, "r", encoding="utf-8") as src:
            parts.append(src.read())

    with open(file4, "w", encoding="utf-8") as dst:
        dst.write("".join(parts))

    # Kept from the original: echoes the (now closed) output file object
    # so the run log shows which file was produced.
    print(dst)
def sort_sample(filename):
    """Print and return the centre (mean x, mean y) of the pairs in a file.

    Each usable line holds at least two whitespace-separated numbers; the
    first two are taken as x and y. Lines that do not yield two floats
    (blank lines, headers, single-column rows) are skipped.

    Args:
        filename: Path of the text file. The part after the last '/' and
            before the first '.' is printed as the title.

    Returns:
        A tuple ``(x_mean, y_mean)``.

    Raises:
        ValueError: If the file contains no valid (x, y) pair.
    """
    title = filename.split('/')[-1].split('.')[0].strip()
    print('\n' + title + ':')

    xs, ys = [], []
    # Decode independently of the platform locale; undecodable bytes are
    # replaced so a stray non-numeric line is skipped instead of crashing.
    with open(filename, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            fields = line.split()
            try:
                # Parse both columns before storing either, so xs and ys
                # can never get out of step.
                x_val, y_val = float(fields[0]), float(fields[1])
            except (IndexError, ValueError):
                continue
            xs.append(x_val)
            ys.append(y_val)

    if not xs:
        raise ValueError('no valid (x, y) pairs found in ' + filename)

    count = len(xs)
    print('The number of values: ', count)

    xave = sum(xs) / count
    yave = sum(ys) / count
    print("The center is: (" + str(xave) + ' , ' + str(yave) + ")")
    return xave, yave
def sort_test(filename):
    """Label a sample file A, B or C by Mahalanobis distance to fixed centres.

    The file is parsed like a training file (first two whitespace-separated
    numbers per line are x and y; unparseable lines are skipped) and its
    centre is computed. The three hard-coded class centres and this centre
    form four 2-D points; their covariance matrix is inverted and used to
    measure the distance from each class centre to the file's centre. The
    nearest class wins.

    Side effects:
        Prints the intermediate values and appends the file's short name to
        the module-level list ``d2`` and the chosen label to ``d3``.

    Args:
        filename: Path of the text file to classify.

    Returns:
        The chosen label: 'A', 'B' or 'C'.

    Raises:
        ValueError: If the file contains no valid (x, y) pair.
        numpy.linalg.LinAlgError: If the covariance matrix is singular.
    """
    # Class centres (label, x, y); the values are hard-coded in this script.
    class_centers = (
        ('A', 27.5, 1188.6),
        ('B', 42.5, 727.7),
        ('C', 27.5, 2038.1),
    )

    name = filename.split('/')[-1].split('.')[0].strip()
    print('\n', name, ':')

    xs, ys = [], []
    # Decode independently of the platform locale; undecodable bytes are
    # replaced so a stray non-numeric line is skipped instead of crashing.
    with open(filename, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            fields = line.split()
            try:
                # Parse both columns before storing either, so xs and ys
                # can never get out of step.
                x_val, y_val = float(fields[0]), float(fields[1])
            except (IndexError, ValueError):
                continue
            xs.append(x_val)
            ys.append(y_val)

    if not xs:
        raise ValueError('no valid (x, y) pairs found in ' + filename)

    count = len(xs)
    print('The number of values: ', count)
    xave = sum(xs) / count
    yave = sum(ys) / count
    print("The center is: (" + str(xave) + ' , ' + str(yave) + ")")

    # Row 0 holds the x coordinates, row 1 the y coordinates; the last
    # column is the centre of the file being classified (called X4 below).
    t1 = [cx for _, cx, _ in class_centers] + [xave]
    t2 = [cy for _, _, cy in class_centers] + [yave]
    X = np.vstack([t1, t2])
    print('Xi-Xj: \n', X)
    XT = X.T  # one point per row
    print('(Xi-Xj)T: \n', XT)

    # The covariance needs more samples (4 points) than dimensions (2).
    # NOTE(review): the covariance is estimated from the three class centres
    # plus the test point itself, not from the raw class samples - confirm
    # this is the intended model.
    S = np.cov(X)
    SI = np.linalg.inv(S)

    # Only the distances from each class centre to X4 are needed.
    target_row = len(class_centers)
    distances = []
    for i in range(target_row):
        delta = XT[i] - XT[target_row]
        d = np.sqrt(np.dot(np.dot(delta, SI), delta.T))
        print('The distance from X' + str(i + 1) + ' to X'
              + str(target_row + 1) + ' is: ', d)
        distances.append(d)

    # On a tie the first (lowest-index) class is chosen.
    index = distances.index(min(distances))
    label = class_centers[index][0]
    print("X4 is closest to X" + str(index + 1) + ".")
    print("So this is " + label + ".")

    # d2 / d3 are module-level result lists created by the driver code.
    d2.append(name)
    d3.append(label)
    return label
# Result accumulators filled by sort_test(): short file names and the label
# chosen for each. They must stay at module level because sort_test()
# appends to them by name.
d2, d3 = [], []

# Directory holding the data files. This is where the original author kept
# them; change it to the real location on your machine. A.txt, B.txt and
# C.txt are generated into this directory by merge().
DATA_DIR = 'D:/PYTHON学习/数模/'


def main():
    """Build the class files, print their centres, then classify the rest."""
    # Each class file is the concatenation of three consecutive samples:
    # A <- sample1..3, B <- sample4..6, C <- sample7..9.
    for label, first in (('A', 1), ('B', 4), ('C', 7)):
        sources = [DATA_DIR + 'sample' + str(first + k) + '.txt'
                   for k in range(3)]
        merge(sources[0], sources[1], sources[2], DATA_DIR + label + '.txt')

    for label in ('A', 'B', 'C'):
        sort_sample(DATA_DIR + label + '.txt')

    # Files to classify: sample10..sample14 followed by test1..test14.
    filenames = [DATA_DIR + 'sample' + str(k) + '.txt' for k in range(10, 15)]
    filenames += [DATA_DIR + 'test' + str(k) + '.txt' for k in range(1, 15)]
    for filename in filenames:
        sort_test(filename)

    # One row per classified file: [name, label].
    d4 = np.vstack([d2, d3])
    d4T = d4.T
    print('\nFinal result:')
    print(d4T)


if __name__ == "__main__":
    main()
以下是结果输出:
A:
The number of values: 6750
The center is: (27.5 , 1188.6226666666666)
B:
The number of values: 11250
The center is: (42.5 , 727.7841777777778)
C:
The number of values: 6750
The center is: (27.5 , 2038.1854814814815)
sample10 :
The number of values: 2250
The center is: (27.5 , 2063.2786666666666)
Xi-Xj:
[[ 27.5 42.5 27.5 27.5 ]
[1188.6 727.7 2038.1 2063.27866667]]
(Xi-Xj)T:
[[ 27.5 1188.6 ]
[ 42.5 727.7 ]
[ 27.5 2038.1 ]
[ 27.5 2063.27866667]]
The distance from X1 to X4 is: 2.1516104268159877
The distance from X2 to X4 is: 2.1317648071932003
The distance from X3 to X4 is: 0.06193666748473441
X4 is closest to X3.
So this is C.
(其余文件的输出省略……)
————————————————————————
Final result:
[['sample10' 'C']
['sample11' 'A']
['sample12' 'B']
['sample13' 'B']
['sample14' 'C']
['test1' 'A']
['test2' 'C']
['test3' 'A']
['test4' 'B']
['test5' 'C']
['test6' 'C']
['test7' 'A']
['test8' 'C']
['test9' 'A']
['test10' 'C']
['test11' 'A']
['test12' 'C']
['test13' 'C']
['test14' 'A']]