# Environment: Windows 10 64-bit, Python 3.6, PyCharm 2018.1.1
import numpy as np
import matplotlib.pyplot as plt
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; import from the public location first and fall back for
# very old installations so the script keeps running either way.
try:
    from sklearn.datasets import make_blobs
except ImportError:  # scikit-learn < 0.22
    from sklearn.datasets.samples_generator import make_blobs
from sklearn import cluster
from sklearn.metrics import adjusted_rand_score
from sklearn import mixture
def create_data(centers, num=100, std=0.7):
    """Generate isotropic Gaussian blobs for the clustering demos.

    :param centers: list of cluster centre coordinates.
    :param num: total number of samples to draw.
    :param std: standard deviation of each blob.
    :return: tuple ``(samples, true_labels)`` as produced by ``make_blobs``.
    """
    samples, true_labels = make_blobs(
        n_samples=num, centers=centers, cluster_std=std)
    return samples, true_labels
def plot_data(*data):
    """Scatter-plot 2-D samples, one colour per ground-truth cluster.

    :param data: tuple ``(X, labels_true)`` — sample matrix and true labels.
    """
    points, true_labels = data
    fig = plt.figure()
    axis = fig.add_subplot(1, 1, 1)
    palette = 'rgbyckm'  # cycled when there are more clusters than colours
    for idx, lab in enumerate(np.unique(true_labels)):
        mask = true_labels == lab
        axis.scatter(points[mask, 0], points[mask, 1],
                     label='cluster %d' % lab,
                     color=palette[idx % len(palette)])
    axis.legend(loc='best', framealpha=0.5)
    axis.set_xlabel('x[0]')
    axis.set_ylabel('x[1]')
    axis.set_title('data')
    plt.show()
# Build the demo dataset (three overlapping blobs near the origin plus one
# distant blob) and visualise it with its ground-truth labels.
X,labels_true = create_data([[1,1],[2,2],[1,2],[10,20]],1000,0.5)
plot_data(X,labels_true)
def test_Kmeans(*data):
    """Fit KMeans with default parameters and report ARI and inertia.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    estimator = cluster.KMeans()
    estimator.fit(points)
    pred = estimator.predict(points)
    print('ARI:%s' % adjusted_rand_score(true_labels, pred))
    print('sum center distance%s' % estimator.inertia_)
# Regenerate the dataset and run the default-parameter KMeans demo.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_Kmeans(X, labels_true)
def test_Kmeans_nclusters(*data):
    """Sweep KMeans ``n_clusters`` over 1..49 and plot ARI and inertia.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    cluster_counts = range(1, 50)
    ari_scores = []
    inertias = []
    for k in cluster_counts:
        estimator = cluster.KMeans(n_clusters=k)
        estimator.fit(points)
        pred = estimator.predict(points)
        ari_scores.append(adjusted_rand_score(true_labels, pred))
        inertias.append(estimator.inertia_)
    fig = plt.figure()
    left = fig.add_subplot(1, 2, 1)
    left.plot(cluster_counts, ari_scores, marker='+')
    left.set_xlabel('n_clusters')
    left.set_ylabel('ARI')
    right = fig.add_subplot(1, 2, 2)
    right.plot(cluster_counts, inertias, marker='o')
    right.set_xlabel('n_clusters')
    right.set_ylabel('inertia_')
    fig.suptitle('KMeans')
    plt.show()
# Regenerate the dataset and sweep KMeans over the number of clusters.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_Kmeans_nclusters(X, labels_true)
def test_Kmeans_n_init(*data):
    """Compare 'k-means++' vs 'random' initialisation while sweeping n_init.

    For each ``n_init`` in 1..49 both initialisation schemes are fitted and
    their ARI / inertia curves are plotted side by side.
    NOTE(review): this function is defined but never called in this file.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    init_counts = range(1, 50)
    fig = plt.figure()
    ari_kpp = []
    inertia_kpp = []
    ari_rand = []
    inertia_rand = []
    for n in init_counts:
        # fit k-means++ first, then random, exactly once per n (same order
        # as the original per-iteration sequence)
        for scheme, aris, inertias in (('k-means++', ari_kpp, inertia_kpp),
                                       ('random', ari_rand, inertia_rand)):
            estimator = cluster.KMeans(n_init=n, init=scheme)
            estimator.fit(points)
            pred = estimator.predict(points)
            aris.append(adjusted_rand_score(true_labels, pred))
            inertias.append(estimator.inertia_)
    left = fig.add_subplot(1, 2, 1)
    left.plot(init_counts, ari_kpp, marker='+', label='k-means++')
    left.plot(init_counts, ari_rand, marker='+', label='random')
    left.set_xlabel('n_init')
    left.set_ylabel('ARI')
    left.legend(loc='best')
    right = fig.add_subplot(1, 2, 2)
    right.plot(init_counts, inertia_kpp, marker='o', label='k-means++')
    right.plot(init_counts, inertia_rand, marker='o', label='random')
    right.set_xlabel('n_init')
    right.set_ylabel('inertia_')
    right.legend(loc='best')
    fig.suptitle('Kmeans')
    plt.show()
def test_DBSCAN(*data):
    """Run DBSCAN with default parameters; report ARI and core-sample count.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    estimator = cluster.DBSCAN()
    pred = estimator.fit_predict(points)
    print("ARI:%s" % adjusted_rand_score(true_labels, pred))
    print("Core sample num:%d" % len(estimator.core_sample_indices_))
# Regenerate the dataset and run the default-parameter DBSCAN demo.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_DBSCAN(X, labels_true)
def test_DBSCAN_epsilon(*data):
    """Sweep DBSCAN's ``eps`` over a log grid; plot ARI and core-sample count.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    eps_grid = np.logspace(-1, 1.5)  # 50 values from 10**-1 to 10**1.5
    ari_scores = []
    core_counts = []
    for eps in eps_grid:
        estimator = cluster.DBSCAN(eps=eps)
        pred = estimator.fit_predict(points)
        ari_scores.append(adjusted_rand_score(true_labels, pred))
        core_counts.append(len(estimator.core_sample_indices_))
    fig = plt.figure()
    left = fig.add_subplot(1, 2, 1)
    left.plot(eps_grid, ari_scores, marker='+')
    left.set_xscale('log')
    left.set_xlabel(r'$\epsilon$')
    left.set_ylim(0, 1)
    left.set_ylabel('ARI')
    right = fig.add_subplot(1, 2, 2)
    right.plot(eps_grid, core_counts, marker='o')
    right.set_xscale('log')
    right.set_xlabel(r'$\epsilon$')
    right.set_ylabel('Core_Nums')
    fig.suptitle('DBSCAN')
    plt.show()
# Regenerate the dataset and sweep DBSCAN's eps parameter.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_DBSCAN_epsilon(X, labels_true)
def test_DBSCAN_min_samples(*data):
    """Sweep DBSCAN's ``min_samples`` over 1..99; plot ARI and core count.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    sample_thresholds = range(1, 100)
    ari_scores = []
    core_counts = []
    for threshold in sample_thresholds:
        estimator = cluster.DBSCAN(min_samples=threshold)
        pred = estimator.fit_predict(points)
        ari_scores.append(adjusted_rand_score(true_labels, pred))
        core_counts.append(len(estimator.core_sample_indices_))
    fig = plt.figure()
    left = fig.add_subplot(1, 2, 1)
    left.plot(sample_thresholds, ari_scores, marker='+')
    left.set_xlabel('min_samples')
    left.set_ylim(0, 1)
    left.set_ylabel('ARI')
    right = fig.add_subplot(1, 2, 2)
    right.plot(sample_thresholds, core_counts, marker='o')
    right.set_xlabel('min_samples')
    right.set_ylabel("Core_Nums")
    fig.suptitle("DBSCAN")
    plt.show()
# Regenerate the dataset and sweep DBSCAN's min_samples parameter.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_DBSCAN_min_samples(X, labels_true)
def test_AgglomerativeClustering(*data):
    """Run AgglomerativeClustering with default parameters; report the ARI.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    estimator = cluster.AgglomerativeClustering()
    pred = estimator.fit_predict(points)
    print("ARI:%s" % adjusted_rand_score(true_labels, pred))
# Regenerate the dataset and run the default AgglomerativeClustering demo.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_AgglomerativeClustering(X, labels_true)
def test_AgglomerativeClustering_nclusters(*data):
    """Sweep AgglomerativeClustering ``n_clusters`` over 1..49; plot the ARI.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    cluster_counts = range(1, 50)
    ari_scores = []
    for k in cluster_counts:
        estimator = cluster.AgglomerativeClustering(n_clusters=k)
        pred = estimator.fit_predict(points)
        ari_scores.append(adjusted_rand_score(true_labels, pred))
    fig = plt.figure()
    axis = fig.add_subplot(1, 1, 1)
    axis.plot(cluster_counts, ari_scores, marker='+')
    axis.set_xlabel("n_clusters")
    axis.set_ylabel('ARI')
    fig.suptitle("AgglomerativeClustering")
    plt.show()
# Regenerate the dataset and sweep the agglomerative cluster count.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_AgglomerativeClustering_nclusters(X, labels_true)
def test_AgglomerativeClustering_linkage(*data):
    """Compare ward/complete/average linkage across ``n_clusters`` 1..49.

    One ARI-vs-n_clusters curve is drawn per linkage strategy.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    cluster_counts = range(1, 50)
    fig = plt.figure()
    axis = fig.add_subplot(1, 1, 1)
    marker_cycle = '+o*'
    for idx, link in enumerate(['ward', 'complete', 'average']):
        scores = []
        for k in cluster_counts:
            estimator = cluster.AgglomerativeClustering(
                n_clusters=k, linkage=link)
            pred = estimator.fit_predict(points)
            scores.append(adjusted_rand_score(true_labels, pred))
        axis.plot(cluster_counts, scores, marker=marker_cycle[idx],
                  label='linkage:%s' % link)
    axis.set_xlabel('n_clusters')
    axis.set_ylabel('ARI')
    axis.legend(loc='best')
    fig.suptitle('AgglomerativeClustering')
    plt.show()
# Regenerate the dataset and compare agglomerative linkage strategies.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_AgglomerativeClustering_linkage(X, labels_true)
def test_GMM(*data):
    """Fit a GaussianMixture with default parameters and report the ARI.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    X, labels_true = data
    clst = mixture.GaussianMixture()
    # fit() returns the estimator itself; the original bound it to an unused
    # local (clst_m), which has been removed.
    clst.fit(X)
    predicted_labels = clst.predict(X)
    print("ARI:%s" % adjusted_rand_score(labels_true, predicted_labels))
# Regenerate the dataset and run the default GaussianMixture demo.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_GMM(X, labels_true)
def test_GMM_n_components(*data):
    """Sweep GaussianMixture ``n_components`` over 1..49 and plot the ARI.

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    component_counts = range(1, 50)
    ari_scores = []
    for k in component_counts:
        estimator = mixture.GaussianMixture(n_components=k)
        estimator.fit(points)
        pred = estimator.predict(points)
        ari_scores.append(adjusted_rand_score(true_labels, pred))
    fig = plt.figure()
    axis = fig.add_subplot(1, 1, 1)
    axis.plot(component_counts, ari_scores, marker='+')
    axis.set_xlabel('n_components')
    axis.set_ylabel('ARI')
    fig.suptitle('GMM')
    plt.show()
# Regenerate the dataset and sweep the GMM component count.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_GMM_n_components(X, labels_true)
# NOTE(review): this is an exact duplicate of test_GMM_n_components defined
# above; this second definition shadows the first at import time, and the
# sweep below runs a second time for no benefit. One copy should be deleted.
def test_GMM_n_components(*data):
    """Sweep GaussianMixture n_components over 1..49 and plot the ARI."""
    X, labels_true = data
    nums = range(1, 50)
    ARIs = []
    for num in nums:
        clst = mixture.GaussianMixture(n_components=num)
        clst.fit(X)
        predicted_labels = clst.predict(X)
        ARIs.append(adjusted_rand_score(labels_true,predicted_labels))
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1)
    ax.plot(nums,ARIs,marker='+')
    ax.set_xlabel('n_components')
    ax.set_ylabel('ARI')
    fig.suptitle('GMM')
    plt.show()
# Repeats the n_components sweep (mirrors the duplicated definition above).
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_GMM_n_components(X, labels_true)
def test_GMM_cov_type(*data):
    """Compare GMM covariance types across ``n_components`` 1..49.

    One ARI curve is drawn per covariance type
    (spherical / tied / diag / full).

    :param data: tuple ``(X, labels_true)`` — samples and true labels.
    """
    points, true_labels = data
    component_counts = range(1, 50)
    marker_cycle = '+o*s'
    fig = plt.figure()
    axis = fig.add_subplot(1, 1, 1)
    for idx, cov in enumerate(['spherical', 'tied', 'diag', 'full']):
        scores = []
        for k in component_counts:
            estimator = mixture.GaussianMixture(
                n_components=k, covariance_type=cov)
            estimator.fit(points)
            pred = estimator.predict(points)
            scores.append(adjusted_rand_score(true_labels, pred))
        axis.plot(component_counts, scores, marker=marker_cycle[idx],
                  label='covariance_type:%s' % cov)
    axis.set_xlabel("n_components")
    axis.legend(loc='best')
    axis.set_ylabel("ARI")
    fig.suptitle('GMM')
    plt.show()
# Regenerate the dataset and compare GMM covariance types.
centers = [[1,1],[2,2],[1,2],[10,20]]
X, labels_true = create_data(centers,1000,0.5)
test_GMM_cov_type(X, labels_true)