# KMeans 的非 sklearn 实现代码
# -*- coding: utf-8 -*-
"""
Created on Thu May 10 10:12:53 2018
@author: lizihua
"""
from numpy import *
import matplotlib.pyplot as plt
# 加载数据
def loadDataSet(fileName):
    """Load a tab-separated numeric text file into a list of float rows.

    Args:
        fileName: Path of the text file. Every non-blank line must hold
            tab-separated values that ``float()`` can parse.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        floats parsed from that line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even when a
    # malformed field makes float() raise part-way through the file.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing newline at end of file)
                # carry no data; float('') would raise on them.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
# 计算欧式距离
def distEclud(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``.

    Both arguments must support element-wise subtraction with each other
    (the script passes single-row NumPy matrices).
    """
    delta = vecA - vecB
    squared = power(delta, 2)
    # `sum` is NumPy's sum (star import), so every element is added up.
    return sqrt(sum(squared))
# 随机选取k个簇的质心
def randCent(dataSet, k):
    """Create ``k`` random centroids inside the bounding box of ``dataSet``.

    Args:
        dataSet: 2-D collection of samples (rows) by features (columns);
            a NumPy matrix, an ndarray, or a nested list.
        k: Number of centroids to generate.

    Returns:
        A ``k x n`` NumPy matrix, where ``n`` is the number of columns of
        ``dataSet``. Column ``j`` of every centroid lies between the minimum
        and maximum of column ``j`` of the data.

    Raises:
        ValueError: If ``dataSet`` has no rows (min/max of an empty column).
    """
    # Work on a plain 2-D ndarray so that column slices are 1-D and
    # .min()/.max() return scalars instead of 1x1 matrices.
    data = asarray(dataSet)
    n = data.shape[1]
    # `asmatrix` is used instead of the `mat` alias, which NumPy 2.0 removed.
    centroids = asmatrix(zeros((k, n)))
    for j in range(n):
        column = data[:, j]
        col_min = float(column.min())
        col_range = float(column.max()) - col_min
        # random.rand(k, 1) draws k values in [0, 1); scaling by the column
        # range and shifting by the minimum keeps each centroid coordinate
        # within the data bounds.
        centroids[:, j] = col_min + col_range * random.rand(k, 1)
    return centroids
# K-means聚类算法
def kMeans(dataSet, k, distM=distEclud, createCent=randCent, *, verbose=True):
    """Cluster the rows of ``dataSet`` into ``k`` groups with K-means.

    Args:
        dataSet: NumPy matrix with one sample per row.
        k: Number of clusters.
        distM: Callable ``distM(centroid_row, sample_row)`` returning the
            distance between a centroid and a sample.
        createCent: Callable ``createCent(dataSet, k)`` returning the initial
            ``k x n`` centroid matrix.
        verbose: Keyword-only. When true (the default, matching the previous
            behaviour) the centroids are printed after every assignment pass.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``centroids`` is the matrix
        returned by ``createCent`` updated in place. ``clusterAssment`` is an
        ``m x 2`` matrix whose first column is the cluster index assigned to
        each sample and whose second column is the squared distance from the
        sample to that cluster's centroid.
    """
    m = shape(dataSet)[0]
    # Column 0: assigned cluster index, column 1: squared distance.
    # `asmatrix` replaces the `mat` alias that NumPy 2.0 removed.
    clusterAssment = asmatrix(zeros((m, 2)))
    # Start every sample as "unassigned" (-1). Starting at 0 would make a
    # first-pass assignment to cluster 0 look like "no change" and could end
    # the loop before the assignments were re-checked against the updated
    # centroids.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: attach every sample to its nearest centroid.
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distM(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        if verbose:
            print("centroids:\n", centroids)
        # Update step: move every centroid to the mean of its members.
        for cent in range(k):
            members = dataSet[nonzero(clusterAssment[:, 0].A == cent)[0]]
            if members.shape[0] == 0:
                # An empty cluster has no mean; taking it would turn the
                # centroid into NaN for good, so keep the previous position.
                continue
            centroids[cent, :] = mean(members, axis=0)
    return centroids, clusterAssment
def kMeansPlot(dataMat, centroids, clusterAssment):
    """Show a 2-D scatter plot of the clustered samples and the centroids.

    Only the first two columns of ``dataMat`` and ``centroids`` are plotted.

    Args:
        dataMat: NumPy matrix with one sample per row.
        centroids: Matrix with one centroid per row; its length is taken as
            the number of clusters.
        clusterAssment: NumPy matrix whose first column holds the cluster
            index of each row of ``dataMat``.
    """
    k = len(centroids)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Centroids are drawn as red '+' markers.
    ax.scatter(centroids[:, 0].tolist(), centroids[:, 1].tolist(), marker='+', c='r')
    markers = ['o', 's', 'v', '*']
    colors = ['blue', 'green', 'yellow', 'red']
    for i in range(k):
        data_class = dataMat[nonzero(clusterAssment[:, 0].A == i)[0]]
        # Cycle through the style tables so more than four clusters do not
        # raise IndexError. The first four clusters keep their original
        # marker/colour; shifting the colour by one on each full pass through
        # the markers keeps later clusters visually distinct.
        marker = markers[i % len(markers)]
        color = colors[(i + i // len(markers)) % len(colors)]
        ax.scatter(data_class[:, 0].tolist(), data_class[:, 1].tolist(), marker=marker, c=color)
    plt.show()
# 测试
if __name__ == "__main__":
    # Demo run: load the sample file, cluster it into 4 groups and plot.
    # `asmatrix` replaces the `mat` alias that NumPy 2.0 removed.
    dataMat = asmatrix(loadDataSet('testset.txt'))
    # Debugging aids, kept for reference:
    # print("centroids:\n", randCent(dataMat, 2))
    # print("distance:\n", distEclud(dataMat[0], dataMat[1]))  # 5.18463281668
    myCentroids, clustAssing = kMeans(dataMat, 4)
    # print("final centroids:\n", myCentroids)
    # print("assignments:\n", clustAssing)
    kMeansPlot(dataMat, myCentroids, clustAssing)