Please build a Gaussian mixture model (GMM) to model the data in the file TrainingData_GMM.csv. Note that the data is composed of 4 clusters, and the model should be trained with the expectation-maximization (EM) algorithm.
Based on the GMM learned above, assign each training data point to one of the 4 clusters.
代码运行效果
原始数据
GMM参数估计
对数似然函数值随训练步数的变化情况
代码
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import math
def get_pdf(sample, mu, sigma):
    """Evaluate the multivariate normal density N(mu, sigma) at `sample`."""
    return stats.multivariate_normal.pdf(sample, mean=mu, cov=sigma)
def get_log_likelihood(data, k, mu, sigma, gama=None):
    """Return the GMM log-likelihood of `data` under the current parameters.

    log L = sum_i log( sum_j k[j] * N(data[i]; mu[j], sigma[j]) )

    Parameters
    ----------
    data : sequence of samples.
    k : mixing weights, one per Gaussian component.
    mu, sigma : component means and covariance matrices.
    gama : ignored; kept (with a default) so existing callers that pass the
        responsibility matrix positionally keep working.  The original code
        incorrectly weighted each component density by the responsibilities
        ``gama[j][i]`` instead of the mixing weights ``k[j]``, which is not
        the mixture log-likelihood.
    """
    res = 0.0
    for i in range(len(data)):
        cur = 0.0
        for j in range(len(k)):
            # Weighted component density k_j * N(x_i; mu_j, sigma_j).
            cur += k[j] * stats.multivariate_normal(mu[j], sigma[j]).pdf(data[i])
        res += math.log(cur)
    return res
def em(data, k, mu, sigma, steps=1000):
    """Fit a Gaussian mixture model to `data` with the EM algorithm.

    Parameters
    ----------
    data : (num_data, dim) ndarray of samples.
    k : list of mixing weights, one per component (updated in place).
    mu : list of component means (entries replaced in place).
    sigma : list of component covariance matrices (entries replaced in place).
    steps : number of EM iterations to run.

    Returns
    -------
    (k, mu, sigma, gama, likelihood_record) where ``gama[j][i]`` is the
    responsibility of component j for sample i, and ``likelihood_record``
    holds the log-likelihood evaluated at the start of each iteration.
    """
    num_gau = len(k)                      # number of Gaussian components
    num_data = data.shape[0]              # number of samples
    dim = data.shape[1]                   # data dimension (was hard-coded to 2)
    gama = np.zeros((num_gau, num_data))  # responsibilities
    likelihood_record = []                # log-likelihood per iteration

    for step in range(steps):
        # E step.  Evaluate each component's weighted density once per
        # iteration; the original recomputed every pdf inside a nested loop
        # (the denominator was rebuilt per sample per component), an
        # O(num_gau^2 * num_data) blow-up in pdf evaluations.
        weighted = np.zeros((num_gau, num_data))
        for i in range(num_gau):
            weighted[i] = k[i] * stats.multivariate_normal(mu[i], sigma[i]).pdf(data)
        total = weighted.sum(axis=0)      # marginal density of each sample
        gama = weighted / total

        # Log-likelihood of the data under the *current* (pre-update)
        # parameters, reusing the marginals computed above.
        cur_likelihood = float(np.sum(np.log(total)))
        likelihood_record.append(cur_likelihood)

        # M step: update means, covariances and mixing weights in place.
        # sigma[i] depends only on gama[i] and the new mu[i], so the
        # per-component interleaving matches the original update order.
        for i in range(num_gau):
            nk = np.sum(gama[i])          # effective sample count of component i
            mu[i] = np.dot(gama[i], data) / nk
            diff = data - mu[i]           # (num_data, dim) deviations
            # Responsibility-weighted scatter matrix, shape (dim, dim).
            sigma[i] = np.dot(gama[i] * diff.T, diff) / nk
            k[i] = nk / num_data

        print('step: {}\tlikelihood:{}'.format(step + 1, cur_likelihood))
    return k, mu, sigma, gama, likelihood_record
def main():
    """Load the training data, fit a 4-component GMM with EM, and plot results."""
    csv_data = pd.read_csv('TrainingData_GMM.csv')
    csv_data = np.array(csv_data)

    # Scatter plot of the raw data.
    plt.scatter(csv_data[:, 0], csv_data[:, 1], s=5, c='r')
    plt.title('Raw Data')
    plt.show()

    # Initial GMM parameters: 4 components with uniform mixing weights.
    k_init = [0.25, 0.25, 0.25, 0.25]
    mu_init = [[-0.5, -0.5], [-0.4, 0.4], [0.1, 0.1], [1.2, -1.2]]
    sigma_init = [[[1, 0.5], [0.5, 1]], [[0.5, 0], [0, 0.5]],
                  [[0.3, 0], [0, 0.3]], [[0.5, -0.1], [-0.1, 0.5]]]
    k_res, mu_res, sigma_res, gama, likelihood_record = em(
        csv_data, k_init, mu_init, sigma_init, steps=30)

    # Hard-assign each sample to the component with the largest responsibility.
    classify = gama.argmax(axis=0)
    four_gau = [[], [], [], []]
    for i in range(len(classify)):
        four_gau[classify[i]].append(csv_data[i])
    # BUG FIX: np.array() over a ragged list of per-cluster lists raises
    # ValueError on modern NumPy; convert each cluster to its own array.
    four_gau = [np.array(cluster) for cluster in four_gau]

    colors = ['r', 'g', 'k', 'b']
    for i in range(len(k_res)):
        # Guard against an empty cluster (indexing [:, 0] on an empty
        # array would raise).
        if len(four_gau[i]):
            plt.scatter(four_gau[i][:, 0], four_gau[i][:, 1], s=5, c=colors[i])
    plt.title('After EM')
    plt.show()

    # Log-likelihood vs. iteration.
    plt.plot(range(len(likelihood_record)), likelihood_record)
    plt.title('step-likelihood')
    plt.xlabel('step')
    plt.ylabel('log-likelihood')
    plt.show()
    print('result:\nk: {}\nmu: {}\nsigma: {}'.format(k_res, mu_res, sigma_res))


if __name__ == '__main__':
    main()