- 题目描述
- 编程实现K-means算法对waveform数据进行聚类,并对无噪声的图像进行分割;
- 编程实现PAM算法对有20%高斯噪声的waveform数据聚类,并对有噪声的图像进行分割。
- 算法描述
(1) K -means
(2) PAM
- 结果展示
1.K-means算法对waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)
2.K-means算法对有20%高斯噪声的waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)
3.PAM算法对waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)
4.PAM算法对有20%高斯噪声的waveform数据进行聚类(选取数据集的第7维和第10维作为x轴y轴进行可视化)
• 图像分割部分
无噪声原图 有噪声原图
5.K-means算法对无噪声的图像进行分割
6.K-means算法对有噪声的图像进行分割
7.PAM算法对无噪声的图像进行分割
8.PAM算法对有噪声的图像进行分割
- 实验代码
-
# -*- coding:utf-8 -*-
from numpy import *
import pandas as pd
import matplotlib.pyplot as plt
import random
import numpy as np
from PIL import Image


def image_gauss_noise(image):
    """Add per-pixel Gaussian noise (mu=0, sigma=10) to a grayscale image.

    Works on an int16 copy so intermediate sums may fall below 0 or exceed
    255, then clips back into the valid uint8 range.
    """
    img = image.astype(np.int16)  # avoid uint8 wrap-around during addition
    for i in range(img.shape[0]):
        for j in range(img.shape[1]):
            img[i, j] += random.gauss(mu=0, sigma=10)
    img[img > 255] = 255
    img[img < 0] = 0
    return img.astype(np.uint8)


def data_gauss_noise(data):
    """Add Gaussian noise (mu=0, sigma=0.1) to ~20% of the rows of `data`.

    NOTE: mutates `data` in place and also returns it.  The duplicate-removing
    set() means slightly fewer than 20% of rows may actually be noised.
    """
    m, n = shape(data)
    # sample ~20% of row indices (comment used to say 10% — the code does 20%)
    msample = set((m * np.random.rand(int(m * 0.2))).astype(int))
    for i in msample:
        for j in range(n):
            data[i, j] += random.gauss(mu=0, sigma=0.1)
    return data


def disMea(vecA, vecB):
    """Euclidean distance between two row vectors."""
    return sqrt(sum(power(vecA - vecB, 2)))


def createCent(dataSet, k):
    """Draw k random centroids, each coordinate uniform within the data range."""
    n = shape(dataSet)[1]
    centriods = mat(zeros((k, n)))
    for j in range(n):
        minJ = min(dataSet[:, j])
        rangeJ = float(max(array(dataSet)[:, j]) - minJ)
        centriods[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centriods


def kmeans(dataSet, k):
    """K-means clustering, capped at 10 iterations.

    Returns (centroids k x n matrix, assignment m x 1 matrix of cluster ids).
    """
    m = shape(dataSet)[0]
    clusterA = mat(zeros((m, 1)))
    centriods = createCent(dataSet, k)
    clusterC = True
    itr = 10
    while clusterC and itr:
        clusterC = False
        # assignment step: nearest centroid for every sample
        for i in range(m):
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = disMea(centriods[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterA[i, 0] != minIndex:
                clusterC = True
                clusterA[i, 0] = int(minIndex)
        # update step: move each centroid to the mean of its members
        for cent in range(k):
            ptsInClust = dataSet[nonzero(clusterA[:, 0].A == cent)[0]]
            # BUG FIX: an empty cluster used to yield NaN centroids via mean([]);
            # keep the previous centroid instead.
            if len(ptsInClust):
                centriods[cent, :] = mean(ptsInClust, axis=0)
        itr -= 1
    return centriods, clusterA


def show1(dataSet, k, centriods, clusterA, count):
    """Scatter-plot dims 7 and 10 of dataSet colored by cluster; save Figure_<count>.png."""
    plt.figure()
    m, n = shape(dataSet)
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for i in range(m):
        markIndex = int(clusterA[i, 0])
        plt.plot(dataSet[i, 6], dataSet[i, 9], mark[markIndex])
    # NOTE(review): centroid overlay kept disabled — it references an undefined
    # `showindex` and would raise NameError if re-enabled as written.
    # mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    # for i in range(k):
    #     plt.plot(centriods[i, showindex[6]], centriods[i, showindex[9]], mark[i], markersize=12)
    plt.savefig("Figure_" + str(count) + ".png")


def pearson_distance(vector1, vector2):
    """Distance between two vectors via scipy pdist.

    Despite the name, scipy's pdist default metric is Euclidean.
    """
    from scipy.spatial.distance import pdist
    X = vstack([vector1, vector2])
    return pdist(X)


def totalcost(blogwords, medoids_idx):
    """Assign each sample to its nearest medoid.

    Returns (total assignment cost, dict medoid_index -> list of member indices).
    Distances are memoized in a per-call cache.
    """
    distances_cache = {}
    size = shape(blogwords)[0]
    total_cost = 0.0
    medoids = {}
    for idx in medoids_idx:
        medoids[idx] = []
    for i in range(size):
        choice = None
        min_cost = inf
        for m in medoids:
            tmp = distances_cache.get((m, i), None)
            if tmp is None:  # FIX: identity check — `== None` is fragile on arrays
                tmp = pearson_distance(blogwords[m], blogwords[i])
                distances_cache[(m, i)] = tmp
            if tmp < min_cost:
                choice = m
                min_cost = tmp
        medoids[choice].append(i)
        total_cost += min_cost
    return total_cost, medoids


def PAM(dataSet, k):
    """k-medoids (Partitioning Around Medoids), capped at 5 outer iterations.

    Returns (medoid k x n matrix, assignment m x 1 matrix of cluster ids).
    """
    m, n = shape(dataSet)
    # start from k distinct random sample indices as medoids
    CenterIndex = random.sample([i for i in range(m)], k)
    pre_cost, medoids = totalcost(dataSet, CenterIndex)
    current_cost = inf
    best_choice = []
    best_res = {}
    itr = 5
    while itr:
        # try swapping every medoid with every non-medoid member of its cluster
        for m in medoids:  # NOTE: deliberately shadows the row count (original behavior)
            for item in medoids[m]:
                if item != m:
                    idx = CenterIndex.index(m)
                    swap_temp = CenterIndex[idx]  # remember medoid being replaced
                    CenterIndex[idx] = item
                    tmp, medoids_ = totalcost(dataSet, CenterIndex)
                    if tmp < current_cost:
                        # best swap so far: record medoids, clusters and cost
                        best_choice = list(CenterIndex)
                        best_res = dict(medoids_)
                        current_cost = tmp
                    # restore: pick the overall best swap only after trying them all
                    CenterIndex[idx] = swap_temp
        # converged: the best configuration equals the current one
        if best_choice == CenterIndex:
            break
        # otherwise accept the improvement and iterate again
        if current_cost <= pre_cost:
            pre_cost = current_cost
            medoids = best_res
            CenterIndex = best_choice
        itr -= 1
        print(itr)
    # materialize medoid matrix and per-sample cluster labels
    m, n = shape(dataSet)
    centriods = mat(zeros((k, n)))
    for i in range(k):
        centriods[i, :] = dataSet[best_choice[i], :]
    clusterA = mat(zeros((m, 1)))
    n = 0
    for i in list(best_res.keys()):
        for j in best_res[i]:
            clusterA[j, 0] = n
        n += 1
    return centriods, clusterA


def _cluster_waveform(cluster_fn, count, noisy=False):
    """Load waveform.csv, optionally noise it, cluster with `cluster_fn`, plot."""
    dataset = pd.read_csv('waveform.csv', header=None)
    data = mat(dataset)[:, 1:22]
    if noisy:
        data = data_gauss_noise(data)
    myCentroids, clustAssing = cluster_fn(data, 3)
    show1(data, 3, myCentroids, clustAssing, count)


def _segment_image(q, cluster_fn, count, k=3):
    """Cluster the pixels of grayscale PIL image `q` with `cluster_fn` and
    save the centroid-valued segmentation as Figure_<count>.png."""
    m, n = q.size
    q1 = array(q).reshape((m * n, 1))
    Centroids, clustAssing = cluster_fn(q1, k)
    y_new = array(clustAssing).reshape((n, m)).astype(int16)
    pic_new = Image.new("L", (m, n))
    for i in range(m):
        for j in range(n):
            # paint each pixel with its cluster's centroid gray level
            pic_new.putpixel((i, j), tuple([int(x) for x in Centroids[y_new[j][i]]]))
    pic_new.save("Figure_" + str(count) + ".png")


def fun1(count):
    # waveform + kmeans
    _cluster_waveform(kmeans, count)


def fun2(count):
    # gauss_noise + waveform + kmeans
    _cluster_waveform(kmeans, count, noisy=True)


def fun3(count):
    # lena + kmeans
    q = Image.open('lena.jpg').convert('L')
    q.save("lena_1.png")
    _segment_image(q, kmeans, count)


def fun4(count):
    # gauss_noise + lena + kmeans
    q = Image.open('lena.jpg').convert('L')
    q = Image.fromarray(image_gauss_noise(np.array(q)))
    q.save("lena_2.png")
    _segment_image(q, kmeans, count)


def fun5(count):
    # waveform + PAM  (comment fixed: this uses PAM, not kmeans)
    _cluster_waveform(PAM, count)


def fun6(count):
    # gauss_noise + waveform + PAM
    _cluster_waveform(PAM, count, noisy=True)


def fun7(count):
    # lena + PAM
    q = Image.open('lena.jpg').convert('L')
    _segment_image(q, PAM, count)


def fun8(count):
    # gauss_noise + lena + PAM
    q = Image.open('lena.jpg').convert('L')
    q = Image.fromarray(image_gauss_noise(np.array(q)))
    _segment_image(q, PAM, count)


if '__main__' == __name__:
    fun1(1)
    fun2(2)
    fun3(3)
    fun4(4)
    fun5(5)
    fun6(6)
    fun7(7)
    fun8(8)