Please build a Gaussian mixture model (GMM) to model the data in the file TrainingData_GMM.csv. Note that the data is composed of 4 clusters, and the model should be trained with the expectation-maximization (EM) algorithm.
Based on the GMM learned above, assign each training data point to one of the 4 clusters.
代码运行效果
原始数据
GMM参数估计
对数似然函数值随训练步数的变化情况
代码
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import math
def get_pdf(sample, mu, sigma):
    """Evaluate the multivariate normal density N(mu, sigma) at `sample`."""
    return stats.multivariate_normal.pdf(sample, mean=mu, cov=sigma)
def get_log_likelihood(data, k, mu, sigma, gama=None):
    """Return the GMM log-likelihood of `data` under the current parameters.

    log L = sum_i log( sum_j k[j] * N(data[i]; mu[j], sigma[j]) )

    Parameters
    ----------
    data : sequence of samples.
    k : mixing weights, one per Gaussian component.
    mu, sigma : component means and covariance matrices.
    gama : ignored; kept (with a default) so existing callers that pass the
        responsibility matrix positionally keep working.  The original code
        incorrectly weighted each component density by the responsibilities
        ``gama[j][i]`` instead of the mixing weights ``k[j]``, which is not
        the mixture log-likelihood.
    """
    res = 0.0
    for i in range(len(data)):
        cur = 0.0
        for j in range(len(k)):
            # Weighted component density k_j * N(x_i; mu_j, sigma_j).
            cur += k[j] * stats.multivariate_normal(mu[j], sigma[j]).pdf(data[i])
        res += math.log(cur)
    return res
def em(data, k, mu, sigma, steps=1000):
    """Fit a Gaussian mixture model to `data` with the EM algorithm.

    Parameters
    ----------
    data : (num_data, dim) ndarray of samples.
    k : list of mixing weights, one per component (updated in place).
    mu : list of component means (entries replaced in place).
    sigma : list of component covariance matrices (entries replaced in place).
    steps : number of EM iterations to run.

    Returns
    -------
    (k, mu, sigma, gama, likelihood_record) where ``gama[j][i]`` is the
    responsibility of component j for sample i, and ``likelihood_record``
    holds the log-likelihood evaluated at the start of each iteration.
    """
    num_gau = len(k)                      # number of Gaussian components
    num_data = data.shape[0]              # number of samples
    dim = data.shape[1]                   # data dimension (was hard-coded to 2)
    gama = np.zeros((num_gau, num_data))  # responsibilities
    likelihood_record = []                # log-likelihood per iteration

    for step in range(steps):
        # E step.  Evaluate each component's weighted density once per
        # iteration; the original recomputed every pdf inside a nested loop
        # (the denominator was rebuilt per sample per component), an
        # O(num_gau^2 * num_data) blow-up in pdf evaluations.
        weighted = np.zeros((num_gau, num_data))
        for i in range(num_gau):
            weighted[i] = k[i] * stats.multivariate_normal(mu[i], sigma[i]).pdf(data)
        total = weighted.sum(axis=0)      # marginal density of each sample
        gama = weighted / total

        # Log-likelihood of the data under the *current* (pre-update)
        # parameters, reusing the marginals computed above.
        cur_likelihood = float(np.sum(np.log(total)))
        likelihood_record.append(cur_likelihood)

        # M step: update means, covariances and mixing weights in place.
        # sigma[i] depends only on gama[i] and the new mu[i], so the
        # per-component interleaving matches the original update order.
        for i in range(num_gau):
            nk = np.sum(gama[i])          # effective sample count of component i
            mu[i] = np.dot(gama[i], data) / nk
            diff = data - mu[i]           # (num_data, dim) deviations
            # Responsibility-weighted scatter matrix, shape (dim, dim).
            sigma[i] = np.dot(gama[i] * diff.T, diff) / nk
            k[i] = nk / num_data

        print('step: {}\tlikelihood:{}'.format(step + 1, cur_likelihood))
    return k, mu, sigma, gama, likelihood_record
def main():
    """Load the training data, fit a 4-component GMM with EM, and plot results."""
    csv_data = pd.read_csv('TrainingData_GMM.csv')
    csv_data = np.array(csv_data)

    # Scatter plot of the raw data.
    plt.scatter(csv_data[:, 0], csv_data[:, 1], s=5, c='r')
    plt.title('Raw Data')
    plt.show()

    # Initial GMM parameters: 4 components with uniform mixing weights.
    k_init = [0.25, 0.25, 0.25, 0.25]
    mu_init = [[-0.5, -0.5], [-0.4, 0.4], [0.1, 0.1], [1.2, -1.2]]
    sigma_init = [[[1, 0.5], [0.5, 1]], [[0.5, 0], [0, 0.5]],
                  [[0.3, 0], [0, 0.3]], [[0.5, -0.1], [-0.1, 0.5]]]
    k_res, mu_res, sigma_res, gama, likelihood_record = em(
        csv_data, k_init, mu_init, sigma_init, steps=30)

    # Hard-assign each sample to the component with the largest responsibility.
    classify = gama.argmax(axis=0)
    four_gau = [[], [], [], []]
    for i in range(len(classify)):
        four_gau[classify[i]].append(csv_data[i])
    # BUG FIX: np.array() over a ragged list of per-cluster lists raises
    # ValueError on modern NumPy; convert each cluster to its own array.
    four_gau = [np.array(cluster) for cluster in four_gau]

    colors = ['r', 'g', 'k', 'b']
    for i in range(len(k_res)):
        # Guard against an empty cluster (indexing [:, 0] on an empty
        # array would raise).
        if len(four_gau[i]):
            plt.scatter(four_gau[i][:, 0], four_gau[i][:, 1], s=5, c=colors[i])
    plt.title('After EM')
    plt.show()

    # Log-likelihood vs. iteration.
    plt.plot(range(len(likelihood_record)), likelihood_record)
    plt.title('step-likelihood')
    plt.xlabel('step')
    plt.ylabel('log-likelihood')
    plt.show()
    print('result:\nk: {}\nmu: {}\nsigma: {}'.format(k_res, mu_res, sigma_res))


if __name__ == '__main__':
    main()