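# Copyright notice: this is an original article by the blog author and may not be reproduced without the author's permission. Source: https://blog.csdn.net/u014539580/article/details/78264423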
from numpy import *
import matplotlib.pyplot as plt
# Helper functions
# Load the data set
def loadDataSet(filename):
    """Load a tab-separated numeric text file into a list of rows.

    Every non-blank line is stripped, split on tab characters and each
    field is converted with ``float``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats in column order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager closes the file even when parsing raises.
    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file);
            # float('') would otherwise raise ValueError.
            if not stripped:
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
# Return the Euclidean distance between two points
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two points.

    Args:
        vecA: Coordinates of the first point (array, matrix row or plain
            sequence of numbers).
        vecB: Coordinates of the second point, same length as ``vecA``.

    Returns:
        The distance as a NumPy float scalar.
    """
    # Converting to float arrays lets plain lists through and prevents
    # unsigned-integer inputs from wrapping around on subtraction.
    diff = asarray(vecA, dtype=float) - asarray(vecB, dtype=float)
    # Use the array method so the result does not depend on whether the
    # name `sum` refers to the builtin or to NumPy's version.
    return sqrt((diff ** 2).sum())
# Build a set of k random centroids
def randCent(dataSet, k):
    """Build k random centroids inside the bounding box of the data.

    For every column j the centroid coordinates are drawn uniformly from
    ``[min_j, max_j)`` of that column.

    Args:
        dataSet: Two-dimensional data, one sample per row (ndarray,
            matrix or nested sequence of numbers).
        k: Number of centroids to generate.

    Returns:
        A ``(k, n)`` NumPy matrix, where n is the number of columns of
        ``dataSet``.
    """
    # asanyarray accepts nested lists and passes ndarray / matrix inputs
    # through without changing their class.
    dataSet = asanyarray(dataSet, dtype=float)
    # Number of features (columns).
    n = shape(dataSet)[1]
    # asmatrix is used instead of the `mat` alias, which newer NumPy
    # releases no longer export from the main namespace.
    centroids = asmatrix(zeros((k, n)))
    # Fill one column at a time so the sequence of random draws matches
    # the column order.
    for j in range(n):
        column = dataSet[:, j]
        minJ = column.min()
        rangeJ = float(column.max() - minJ)
        centroids[:, j] = minJ + rangeJ * random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, dist=distEclud, createCent=randCent):
    """Cluster the rows of ``dataSet`` into ``k`` groups with k-means.

    Points are repeatedly assigned to their nearest centroid and every
    centroid is moved to the mean of its points, until a full pass leaves
    all assignments unchanged. The current centroids are printed once per
    iteration.

    Args:
        dataSet: Two-dimensional data, one sample per row (ndarray,
            matrix or nested sequence of numbers).
        k: Number of clusters.
        dist: Callable ``dist(vecA, vecB)`` returning the distance between
            a centroid row and a data row.
        createCent: Callable ``createCent(dataSet, k)`` returning the
            initial centroids as an object indexable with ``[j, :]``.

    Returns:
        A tuple ``(centroids, label)``: the final centroids (the object
        returned by ``createCent``, updated in place) and a float array of
        length m holding the cluster index of every row.
    """
    # Accept nested lists while leaving ndarray / matrix inputs untouched.
    dataSet = asanyarray(dataSet)
    m = shape(dataSet)[0]
    # Start every label at -1 rather than 0: 0 is a valid cluster index, so
    # points whose first nearest centroid is 0 would not register as a
    # change and the loop could stop before assignments are re-checked
    # against the moved centroids.
    label = full(m, -1.0)
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assign every point to its nearest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = dist(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if label[i] != minIndex:
                clusterChanged = True
                label[i] = minIndex
        # Per-iteration trace of the centroid positions.
        print(centroids)
        # Move each centroid to the mean of the points assigned to it.
        for cent in range(k):
            ptsInClust = dataSet[nonzero(label == cent)[0]]
            # The mean of an empty cluster is NaN, and a NaN centroid can
            # never win a point again; keep the previous position instead.
            if len(ptsInClust) > 0:
                centroids[cent, :] = mean(ptsInClust, axis=0)
    return centroids, label
if __name__ == '__main__':
    # Demo: cluster the two-column samples in testSet.txt into k groups
    # and draw every cluster with its own colour and marker.
    k = 4
    filename = 'testSet.txt'
    dataSet = loadDataSet(filename)
    dataArray = array(dataSet)
    centroids, label = kMeans(dataArray, k)
    # Named `markers` / `colors` so the builtin `str` is not shadowed.
    markers = 'o*s^'
    colors = 'bgrc'
    # One plot call per cluster instead of one per point; a format string
    # such as 'bo' draws markers only, with no connecting line.
    for cluster in range(k):
        members = dataArray[label == cluster]
        # Cycle through the styles so a larger k cannot index past the end.
        style = colors[cluster % len(colors)] + markers[cluster % len(markers)]
        plt.plot(members[:, 0], members[:, 1], style)
    # Centroids may be a matrix; convert so column slices are 1-D.
    centers = asarray(centroids)
    plt.plot(centers[:, 0], centers[:, 1], 'k+')
    # Without show() a plain script run exits before any window appears.
    plt.show()