import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift,estimate_bandwidth
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
'''
Data source: http://archive.ics.uci.edu/ml/datasets/Dishonest+Internet+users+Dataset
'''
# Read the data: one sample per line, attributes separated by single spaces.
dataset = []
with open('clustering.txt', 'r') as f:
    # Iterate the file object directly instead of materializing every line
    # with readlines().
    for line in f:
        # Strip only the line terminator. Slicing with line[0:-1] would also
        # chop the last real character when the final line has no newline.
        line = line.rstrip('\n')
        if not line:
            # A blank line carries no sample and would make the rows ragged.
            continue
        attrs = line.split(' ')
        dataset.append(attrs)
# 2-D array of strings (rows = samples, columns = attributes).
dataset = np.array(dataset)
# Preprocessing: turn every column into integers. The first row decides the
# column kind: all-digit text is parsed as numbers, anything else is treated
# as categorical and label-encoded.
# The converted columns are collected in a separate integer matrix. Writing
# them back into the string array would re-stringify them, and a code with
# more digits than the array's fixed string width can get truncated.
encoded = np.empty(dataset.shape, dtype=int)
for index, value in enumerate(dataset[0]):
    if value.isdigit():
        encoded[:, index] = dataset[:, index].astype(int)
    else:
        encoder = preprocessing.LabelEncoder()
        encoded[:, index] = encoder.fit_transform(dataset[:, index])
dataset = encoded
# K-means clustering with a fixed number of clusters and a fixed seed.
model = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=10,
    random_state=5,
).fit(dataset)  # fit() returns the estimator itself
predict_labels = model.predict(dataset)
print('----------KMEANS----------')
print('Clustering center:\n', model.cluster_centers_)
# Silhouette score as the clustering-quality metric, computed over as many
# samples as the dataset has rows.
score = metrics.silhouette_score(
    dataset,
    predict_labels,
    metric='euclidean',
    sample_size=dataset.shape[0],
)
print('silhouette_score:', score)
# Mean-shift clustering. The kernel bandwidth is estimated from the data
# itself, using every sample.
bw = estimate_bandwidth(
    dataset,
    quantile=0.1,
    n_samples=dataset.shape[0],
)
model = MeanShift(bandwidth=bw, bin_seeding=True).fit(dataset)
predict_labels = model.labels_
print('----------MEANSHIFT----------')
# Mean-shift picks the number of clusters on its own; count distinct labels.
clusters = len(set(predict_labels.tolist()))
print('Clustering num:', clusters)
print('Clustering center:\n', model.cluster_centers_)
# Silhouette score as the clustering-quality metric.
score = metrics.silhouette_score(
    dataset,
    predict_labels,
    metric='euclidean',
    sample_size=dataset.shape[0],
)
print('silhouette_score:', score)
# Agglomerative (hierarchical) clustering with Ward linkage.
model = AgglomerativeClustering(linkage='ward', n_clusters=3).fit(dataset)
predict_labels = model.labels_
# Silhouette score as the clustering-quality metric; computed before the
# section header is printed, as in the original ordering.
score = metrics.silhouette_score(
    dataset,
    predict_labels,
    metric='euclidean',
    sample_size=dataset.shape[0],
)
print('----------AgglomerativeClustering----------')
print('silhouette_score:', score)
# DBSCAN clustering: sweep the neighbourhood radius (eps) and keep the
# clustering with the highest silhouette score.
print('----------DBSCAN----------')
eps = np.linspace(0.1, 1.5, num=12)
# One entry per eps value, NaN where no score could be computed, so the list
# stays aligned with `eps`.
scores = []
best_eps, best_score = eps[0], -1
best_model, best_labels = None, None
for e in eps:
    model = DBSCAN(eps=e, min_samples=5, metric='euclidean')
    model.fit(dataset)
    predict_labels = model.labels_
    n_labels = len(np.unique(predict_labels))
    if not 2 <= n_labels <= len(dataset) - 1:
        # silhouette_score raises ValueError outside this label-count range
        # (e.g. when every point is labelled as noise). Skip this eps instead
        # of aborting the whole sweep.
        scores.append(float('nan'))
        print('eps = ', e, ' score = undefined, distinct labels = ', n_labels)
        continue
    score = round(
        metrics.silhouette_score(
            dataset,
            predict_labels,
            metric='euclidean',
            sample_size=len(dataset),
        ),
        4,
    )
    scores.append(score)
    print('eps = ', e, ' score = ', score)
    if score > best_score:
        best_eps = e
        best_score = score
        best_model = model
        best_labels = predict_labels
if best_model is None:
    print('No eps value produced a clustering with a valid silhouette score')
else:
    print('Best eps = ', best_eps, 'Best score = ', best_score)
    # Reuse the estimator fitted during the sweep instead of refitting it, and
    # report the labels of the BEST run. The original printed the labels of
    # the last eps tried.
    model = best_model
    predict_labels = best_labels
    # As in the original, the count includes DBSCAN's noise label (-1) when
    # noise points are present.
    print('Best clusters = ', len(np.unique(best_labels)))