利用python内置K-Means聚类算法实现鸢尾花数据的聚类_python中的kmeans函数_木槿、的博客-CSDN博客
https://www.cnblogs.com/wangxiaowu/p/13292156.html
把上面两个参考链接的代码拼一下就出来了
#############K-means-鸢尾花聚类############
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score # 导入f1_score
# Load the iris data set. The [:] slice keeps ALL feature columns
# (no feature sub-selection happens here) together with the true species labels.
iris = load_iris()
X, Y = iris.data[:], iris.target

# Build a 3-cluster K-Means model, fit it, and read back one cluster id per sample.
estimator = KMeans(n_clusters=3)
label_pred = estimator.fit(X).labels_

# Score the raw cluster ids directly against the true labels.
# NOTE: f1_score compares label VALUES, so the result depends on how the
# clusters happen to be numbered in this particular run.
micro, macro = (f1_score(Y, label_pred, average=mode) for mode in ('micro', 'macro'))
print(micro, macro)
print(label_pred)
但是结果不太对:K-Means 给出的簇编号是任意的(每次运行都可能不同),而 f1_score 是直接比较标签数值的,所以聚类标签的编号顺序会影响评价结果。我原以为评价函数已经考虑到这个问题了,实际并没有,于是继续查资料。下面两次运行的聚类划分相同,只是簇编号不同,得分却相差很大:
(venv) D:\code2021\experiment\GraphEmbedding-master\examples>python kmeans_p.py
0.24 0.2727272727272727
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
2 0]
(venv) D:\code2021\experiment\GraphEmbedding-master\examples>python kmeans_p.py
0.8933333333333333 0.8917748917748917
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2 2 2 1 2 2 2 1 2
2 1]
#######################################################
这是很久以前的草稿了。实际上,先用最优匹配算法(例如匈牙利算法)求出聚类标签与真实标签之间的对应关系,再运行评分算法就行。
下面给出 MATLAB 的聚类评价方法,包括 8 个评价指标:
result = [Fscore Precision Recall nmi AR Entropy ACC Purity];
%Clustering8Measure.m
function result = Clustering8Measure(Y, predY)
%CLUSTERING8MEASURE Eight external evaluation scores for a clustering result.
%
%result = Clustering8Measure(Y, predY)
%Y      - ground-truth class label of every sample (vector).
%predY  - predicted cluster label of every sample (vector, same length as Y).
%result - row vector [Fscore Precision Recall nmi AR Entropy ACC Purity].
%
% Both labelings are first renumbered to consecutive integers 1..k, so any
% numeric label values (0-based, negative, with gaps) are accepted.
% Raises an error when Y and predY do not have the same number of elements.

% Work on column vectors.
Y = Y(:);
predY = predY(:);
if length(Y) ~= length(predY)
    error('Clustering8Measure: Y and predY must have the same number of elements');
end
n = length(Y);

% Always renumber to 1..k. The third output of unique is the index of each
% element in the sorted list of distinct values, which is exactly the
% renumbered label. Renumbering unconditionally matters because the helper
% functions below index matrices with the labels, so labels must be
% positive integers.
[uY, firstPos, Y] = unique(Y);
Y = Y(:);
[uPred, firstPos, predY] = unique(predY);
predY = predY(:);
pred_classnum = length(uPred);

% Purity: for every predicted cluster count its most frequent true class.
correnum = 0;
for ci = 1:pred_classnum
    incluster = Y(predY == ci);
    % accumarray on positive integer labels yields the per-class counts.
    correnum = correnum + max(accumarray(incluster, 1));
end
Purity = correnum / n;

% Accuracy: map cluster ids onto class ids with the best one-to-one
% assignment, then count the agreeing samples.
res = bestMap(Y, predY);
ACC = sum(Y == res(:)) / n;

% Pair-counting F-score / precision / recall, NMI and average entropy,
% adjusted Rand index.
[Fscore, Precision, Recall] = compute_f(Y, predY);
[nmi, Entropy] = compute_nmi(Y, predY);
AR = RandIndex(Y, predY);

result = [Fscore Precision Recall nmi AR Entropy ACC Purity];
%%
function [newL2, c] = bestMap(L1,L2)
%BESTMAP Permute the labels of L2 so that they match L1 as well as possible.
%
%[newL2, c] = bestMap(L1,L2)
%L1    - reference labels (vector).
%L2    - labels to be renamed (vector with the same number of elements).
%newL2 - column vector, L2 with every label replaced by its matched label.
%        The returned labels are expressed in the shifted numbering
%        L1 - min(L1) + 1 that is used internally.
%c     - assignment vector returned by hungarian: shifted label i of L2 is
%        mapped to label c(i).
%
% Raises an error when L1 and L2 differ in size.
%===========
L1 = L1(:);
L2 = L2(:);
% Comparing the two size vectors element-wise inside "if" is only true when
% EVERY element differs, which can never happen for two column vectors
% (the second element is always 1). isequal performs the intended check.
if ~isequal(size(L1), size(L2))
    error('size(L1) must == size(L2)');
end
if isempty(L1)
    % Nothing to match.
    newL2 = zeros(0,1);
    c = [];
    return;
end
L1 = L1 - min(L1) + 1; % min (L1) <- 1;
L2 = L2 - min(L2) + 1; % min (L2) <- 1;
%=========== make bipartition graph ============
% G(i,j) = number of samples with label i in L1 and label j in L2.
nClass = max(max(L1), max(L2));
G = zeros(nClass);
for i=1:nClass
    for j=1:nClass
        G(i,j) = sum(L1 == i & L2 == j);
    end
end
%=========== assign with hungarian method ======
% hungarian minimises cost, so negate the overlap counts to maximise them.
c = hungarian(-G);
% One output entry per sample (not per class), so the result has the right
% length even when the label values contain gaps and nClass exceeds the
% number of samples.
newL2 = zeros(size(L2));
for i=1:nClass
    newL2(L2 == i) = c(i);
end
%%
function MIhat = MutualInfo(L1,L2)
%MUTUALINFO Normalised mutual information between two labelings.
%
%MIhat = MutualInfo(L1,L2)
%L1, L2 - label vectors with the same number of elements.
%MIhat  - mutual information divided by max(H(L1), H(L2)).
%
% Raises an error when L1 and L2 differ in size.
%===========
L1 = L1(:);
L2 = L2(:);
% An element-wise "~=" on the size vectors is never all-true for two column
% vectors, so the original test could not fire; isequal does the real check.
if ~isequal(size(L1), size(L2))
    error('size(L1) must == size(L2)');
end
L1 = L1 - min(L1) + 1; % min (L1) <- 1;
L2 = L2 - min(L2) + 1; % min (L2) <- 1;
%=========== make bipartition graph ============
% Joint count table; eps is added to every cell so that no probability is
% exactly zero when the logarithm is taken.
nClass = max(max(L1), max(L2));
G = zeros(nClass);
for i=1:nClass
    for j=1:nClass
        G(i,j) = sum(L1 == i & L2 == j) + eps;
    end
end
sumG = sum(G(:));
%=========== calculate MIhat
% Marginal distributions and their entropies.
P1 = sum(G,2)/sumG;
P2 = sum(G,1)/sumG;
H1 = sum(-P1.*log2(P1));
H2 = sum(-P2.*log2(P2));
% Joint distribution and the ratio p(i,j) / (p(i) p(j)).
P12 = G/sumG;
PPP = P12./repmat(P2,nClass,1)./repmat(P1,1,nClass);
% Treat vanishing ratios as 1 so that they contribute log2(1) = 0.
PPP(abs(PPP) < 1e-12) = 1;
MI = sum(P12(:) .* log2(PPP(:)));
MIhat = MI / max(H1,H2);
% Keep only the real part (the original author observed complex results
% here and discarded the imaginary component).
MIhat = real(MIhat);
%%
function [C,T]=hungarian(A)
%HUNGARIAN Solve the Assignment problem using the Hungarian method.
%
%[C,T]=hungarian(A)
%A - a square cost matrix.
%C - the optimal assignment. C(J)=I means row I is assigned to column J.
%T - the cost of the optimal assignment.
%s.t. T = trace(A(C,:)) is minimized over all possible assignments.
%
% Raises an error when A is not square.
%
% Internally the reduced matrix carries one extra column (n+1). The free
% (unassigned) zeros of every row are chained into a linked list: column
% n+1 holds minus the column of the first free zero, every free zero holds
% minus the column of the next one, and 0 terminates the list.
% Adapted from the FORTRAN IV code in Carpaneto and Toth, "Algorithm 548:
% Solution of the assignment problem [H]", ACM Transactions on
% Mathematical Software, 6(1):104-111, 1980.
% v1.0 96-06-14. Niclas Borlin, [email protected].
% Department of Computing Science, Umeå University,
% Sweden.
% All standard disclaimers apply.
% A substantial effort was put into this code. If you use it for a
% publication or otherwise, please include an acknowledgement or at least
% notify me by email. /Niclas
[m,n]=size(A);
if (m~=n)
error('HUNGARIAN: Cost matrix must be square!');
end
% Save original cost matrix (needed at the end to compute the total cost).
orig=A;
% Reduce matrix.
A=hminired(A);
% Do an initial assignment.
[A,C,U]=hminiass(A);
% Repeat while we have unassigned rows.
% U is a linked list of unassigned rows; U(n+1) is its head (0 = empty).
while (U(n+1))
% Start with no path, no unchecked zeros, and no unexplored rows.
% LR/LC - row/column labels of the current path.
% CH    - column of the next unexplored zero in each row.
% RH    - linked list of unexplored rows; RH(n+1) is its head, -1 = empty.
LR=zeros(1,n);
LC=zeros(1,n);
CH=zeros(1,n);
RH=[zeros(1,n) -1];
% No labelled columns.
SLC=[];
% Start path in first unassigned row.
r=U(n+1);
% Mark row with end-of-path label.
LR(r)=-1;
% Insert row first in labelled row set.
SLR=r;
% Repeat until we manage to find an assignable zero.
while (1)
% If there are free zeros in row r
if (A(r,n+1)~=0)
% ...get column of first free zero.
l=-A(r,n+1);
% If there are more free zeros in row r and row r is not
% yet marked as unexplored..
if (A(r,l)~=0 & RH(r)==0)
% Insert row r first in unexplored list.
RH(r)=RH(n+1);
RH(n+1)=r;
% Mark in which column the next unexplored zero in this row
% is.
CH(r)=-A(r,l);
end
else
% If all rows are explored..
if (RH(n+1)<=0)
% Reduce matrix.
[A,CH,RH]=hmreduce(A,CH,RH,LC,LR,SLC,SLR);
end
% Re-start with first unexplored row.
r=RH(n+1);
% Get column of next free zero in row r.
l=CH(r);
% Advance "column of next free zero".
CH(r)=-A(r,l);
% If this zero is last in the list..
if (A(r,l)==0)
% ...remove row r from unexplored list.
RH(n+1)=RH(r);
RH(r)=0;
end
end
% While the column l is labelled, i.e. in path.
while (LC(l)~=0)
% If row r is explored..
if (RH(r)==0)
% If all rows are explored..
if (RH(n+1)<=0)
% Reduce cost matrix.
[A,CH,RH]=hmreduce(A,CH,RH,LC,LR,SLC,SLR);
end
% Re-start with first unexplored row.
r=RH(n+1);
end
% Get column of next free zero in row r.
l=CH(r);
% Advance "column of next free zero".
CH(r)=-A(r,l);
% If this zero is last in list..
if(A(r,l)==0)
% ...remove row r from unexplored list.
RH(n+1)=RH(r);
RH(r)=0;
end
end
% If the column found is unassigned..
if (C(l)==0)
% Flip all zeros along the path in LR,LC.
[A,C,U]=hmflip(A,C,LC,LR,U,l,r);
% ...and exit to continue with next unassigned row.
break;
else
% ...else add zero to path.
% Label column l with row r.
LC(l)=r;
% Add l to the set of labelled columns.
SLC=[SLC l];
% Continue with the row assigned to column l.
r=C(l);
% Label row r with column l.
LR(r)=l;
% Add r to the set of labelled rows.
SLR=[SLR r];
end
end
end
% Calculate the total cost: build a logical mask that is true at the
% assigned positions (C(j), j) and sum the original costs found there.
T=sum(orig(logical(sparse(C,1:size(orig,2),1))));
function A=hminired(A)
%HMINIRED Initial reduction of cost matrix for the Hungarian method.
%
%B=hminired(A)
%A - the unreduced (square) cost matrix.
%B - the reduced cost matrix, widened by one column, with the zeros of
%    each row chained into a linked list: B(k,n+1) holds minus the column
%    of the first zero in row k, every zero holds minus the column of the
%    next zero, and the last zero of the row holds 0.
% v1.0 96-06-13. Niclas Borlin, [email protected].
n = size(A,2);
% Column reduction: subtract every column's minimum from that column.
A = A - repmat(min(A,[],1), n, 1);
% Row reduction: subtract every row's minimum from that row.
A = A - repmat(min(A,[],2), 1, n);
% Row and column subscripts of all zeros.
[zeroRow, zeroCol] = find(A==0);
% Grow A by one column that will hold the list header of each row.
A(1,n+1) = 0;
for row = 1:n
    % Columns of the zeros located in this row, as a row vector.
    colsHere = zeroCol(zeroRow==row)';
    % Header points at the first zero, each zero at the next, last one at 0.
    A(row,[n+1 colsHere]) = [-colsHere 0];
end
function [A,C,U]=hminiass(A)
%HMINIASS Initial assignment of the Hungarian method.
%
%[B,C,U]=hminiass(A)
%A - the reduced cost matrix (n rows, with the zero-list header column).
%B - the reduced cost matrix, with assigned zeros removed from lists.
%C - a vector. C(J)=I means row I is assigned to column J,
% i.e. there is an assigned zero in position I,J.
%U - a vector with a linked list of unassigned rows. U(n+1) is the head
% of the list and 0 terminates it.
% v1.0 96-06-14. Niclas Borlin, [email protected].
[n,np1]=size(A);
% Initialize return vectors.
C=zeros(1,n);
U=zeros(1,n+1);
% Initialize last/next zero "pointers".
% For an assigned row, NZ holds the column of the zero that followed the
% assigned zero in the row's list and LZ the list position that preceded it.
LZ=zeros(1,n);
NZ=zeros(1,n);
for i=1:n
% Set j to first unassigned zero in row i.
lj=n+1;
j=-A(i,lj);
% Repeat until we have no more zeros (j==0) or we find a zero
% in an unassigned column (c(j)==0).
% NOTE(review): C(j) is evaluated before j is tested against 0, so this
% assumes every row has at least one zero in its list on entry -
% presumably guaranteed by the reduction step; confirm against caller.
while (C(j)~=0)
% Advance lj and j in zero list.
lj=j;
j=-A(i,lj);
% Stop if we hit end of list.
if (j==0)
break;
end
end
if (j~=0)
% We found a zero in an unassigned column.
% Assign row i to column j.
C(j)=i;
% Remove A(i,j) from unassigned zero list.
A(i,lj)=A(i,j);
% Update next/last unassigned zero pointers.
NZ(i)=-A(i,j);
LZ(i)=lj;
% Indicate A(i,j) is an assigned zero.
A(i,j)=0;
else
% We found no zero in an unassigned column.
% Check all zeros in this row.
lj=n+1;
j=-A(i,lj);
% Check all zeros in this row for a suitable zero in another row.
while (j~=0)
% Look at the row currently assigned to this column.
r=C(j);
% Pick up last/next pointers.
lm=LZ(r);
m=NZ(r);
% Check all unchecked zeros in free list of this row.
while (m~=0)
% Stop if we find an unassigned column.
if (C(m)==0)
break;
end
% Advance one step in list.
lm=m;
m=-A(r,lm);
end
if (m==0)
% We failed on row r. Continue with next zero on row i.
lj=j;
j=-A(i,lj);
else
% We found a zero in an unassigned column.
% Replace zero at (r,m) in unassigned list with zero at (r,j)
A(r,lm)=-j;
A(r,j)=A(r,m);
% Update last/next pointers in row r.
NZ(r)=-A(r,m);
LZ(r)=j;
% Mark A(r,m) as an assigned zero in the matrix . . .
A(r,m)=0;
% ...and in the assignment vector.
C(m)=r;
% Remove A(i,j) from unassigned list.
A(i,lj)=A(i,j);
% Update last/next pointers in row i.
NZ(i)=-A(i,j);
LZ(i)=lj;
% Mark A(i,j) as an assigned zero in the matrix . . .
A(i,j)=0;
% ...and in the assignment vector.
C(j)=i;
% Stop search.
break;
end
end
end
end
% Create vector with list of unassigned rows.
% Mark all rows that have an assignment.
r=zeros(1,n);
rows=C(C~=0);
r(rows)=rows;
empty=find(r==0);
% Create vector with linked list of unassigned rows.
% The head U(n+1) points at the first unassigned row, each unassigned row
% points at the next one, and the last one holds 0.
U=zeros(1,n+1);
U([n+1 empty])=[empty 0];
function [A,C,U]=hmflip(A,C,LC,LR,U,l,r)
%HMFLIP Flip assignment state of all zeros along a path.
%
%[A,C,U]=hmflip(A,C,LC,LR,U,l,r)
%Input:
%A - the cost matrix.
%C - the assignment vector.
%LC - the column label vector.
%LR - the row label vector.
%U - the unassigned row list vector.
%r,l - position of last zero in path.
%Output:
%A - updated cost matrix.
%C - updated assignment vector.
%U - updated unassigned row list vector.
% v1.0 96-06-14. Niclas Borlin, [email protected].
n=size(A,1);
while (1)
    % Move assignment in column l to row r.
    C(l)=r;
    % Find zero to be removed from zero list..
    % Find zero before this, i.e. the list entry in row r pointing at l.
    m=find(A(r,:)==-l);
    % Link past this zero.
    A(r,m)=A(r,l);
    A(r,l)=0;
    % If this was the first zero of the path (LR(r) is negative there)..
    if (LR(r)<0)
        % ...remove row from unassigned row list and return.
        % (This remark used to be written after a "..." line continuation
        % instead of "%"; it is now an ordinary comment.)
        U(n+1)=U(r);
        U(r)=0;
        return;
    else
        % Move back in this row along the path and get column of next zero.
        l=LR(r);
        % Insert zero at (r,l) first in zero list.
        A(r,l)=A(r,n+1);
        A(r,n+1)=-l;
        % Continue back along the column to get row of next zero in path.
        r=LC(l);
    end
end
function [A,CH,RH]=hmreduce(A,CH,RH,LC,LR,SLC,SLR)
%HMREDUCE Reduce parts of cost matrix in the Hungarian method.
%
%[A,CH,RH]=hmreduce(A,CH,RH,LC,LR,SLC,SLR)
%Input:
%A - Cost matrix.
%CH - vector of column of 'next zeros' in each row.
%RH - vector with list of unexplored rows.
%LC - column labels.
%LR - row labels.
%SLC - set of column labels (not referenced in this function).
%SLR - set of row labels.
%
%Output:
%A - Reduced cost matrix.
%CH - Updated vector of 'next zeros' in each row.
%RH - Updated vector of unexplored rows.
% v1.0 96-06-14. Niclas Borlin, [email protected].
n=size(A,1);
% Find which rows are covered, i.e. unlabelled.
coveredRows=LR==0;
% Find which columns are covered, i.e. labelled.
coveredCols=LC~=0;
r=find(~coveredRows);
c=find(~coveredCols);
% Get minimum of uncovered elements.
m=min(min(A(r,c)));
% Subtract minimum from all uncovered elements.
A(r,c)=A(r,c)-m;
% Check all uncovered columns..
for j=c
% ...and uncovered rows in path order..
for i=SLR
% If this is a (new) zero..
if (A(i,j)==0)
% If the row is not in unexplored list..
if (RH(i)==0)
% ...insert it first in unexplored list.
RH(i)=RH(n+1);
RH(n+1)=i;
% Mark this zero as "next free" in this row.
CH(i)=j;
end
% Find last unassigned zero on row I.
% Negative entries of the row are list pointers; negating them gives the
% columns that are currently linked into the row's zero list.
row=A(i,:);
colsInList=-row(row<0);
if (length(colsInList)==0)
% No zeros in the list.
l=n+1;
else
% The list member whose own entry is 0 is the tail of the list.
l=colsInList(row(colsInList)==0);
end
% Append this zero to end of list.
A(i,l)=-j;
end
end
end
% Add minimum to all doubly covered elements.
r=find(coveredRows);
c=find(coveredCols);
% Take care of the zeros we will remove.
[i,j]=find(A(r,c)<=0);
% Translate the subscripts of the sub-matrix back to subscripts of A.
i=r(i);
j=c(j);
for k=1:length(i)
% Find zero before this in this row.
% NOTE(review): lj is empty when no list entry points at this column, in
% which case the next assignment changes nothing - presumably the case of
% an assigned zero; confirm.
lj=find(A(i(k),:)==-j(k));
% Link past it.
A(i(k),lj)=A(i(k),j(k));
% Mark it as assigned.
A(i(k),j(k))=0;
end
A(r,c)=A(r,c)+m;
function [f,p,r] = compute_f(T,H)
%COMPUTE_F Pair-counting F-score, precision and recall of a clustering.
%
%[f,p,r] = compute_f(T,H)
%T - ground-truth labels (vector).
%H - predicted cluster labels (vector with the same number of elements).
%f - F-score, 2*p*r/(p+r), or 0 when p+r is 0.
%p - precision: among the sample pairs that share a label in H, the
%    fraction that also share a label in T (1 when H has no such pair).
%r - recall: among the sample pairs that share a label in T, the
%    fraction that also share a label in H (1 when T has no such pair).
%
% Raises an error when T and H differ in length.

% Use column vectors so that a row/column mix cannot broadcast into a
% matrix in the element-wise operations below.
T = T(:);
H = H(:);
if length(T) ~= length(H)
    error('compute_f: T and H must have the same number of elements');
end
N = length(T);
numT = 0;   % pairs with equal labels in T
numH = 0;   % pairs with equal labels in H
numI = 0;   % pairs with equal labels in both
for n = 1:N
    % Compare sample n with every later sample, so each pair is seen once.
    Tn = (T(n+1:end) == T(n));
    Hn = (H(n+1:end) == H(n));
    numT = numT + sum(Tn);
    numH = numH + sum(Hn);
    numI = numI + sum(Tn & Hn);
end
p = 1;
r = 1;
if numH > 0
    p = numI / numH;
end
if numT > 0
    r = numI / numT;
end
if (p+r) == 0
    f = 0;
else
    f = 2 * p * r / (p + r);
end
function [nmi, avgent] = compute_nmi (T, H)
%COMPUTE_NMI Normalised mutual information and average cluster entropy.
%
%[nmi, avgent] = compute_nmi(T, H)
%T      - ground-truth class labels (vector).
%H      - predicted cluster labels (vector with the same number of elements).
%nmi    - 2*MI(T,H) / (entropy(H) + entropy(T)), logarithms to base 2.
%         When both labelings consist of a single group the denominator
%         is zero; the two partitions are then identical and nmi is
%         defined as 1 instead of the NaN that 0/0 would give.
%avgent - class entropy inside each cluster, weighted by cluster size.
%
% Raises an error when T and H differ in length.

T = T(:);
H = H(:);
N = length(T);
if length(H) ~= N
    error('compute_nmi: T and H must have the same number of elements');
end
classes = unique(T);
clusters = unique(H);
num_class = length(classes);
num_clust = length(clusters);

%%compute number of points in each class
D = zeros(1, num_class);
for j = 1:num_class
    D(j) = sum(T == classes(j));
end

%%mutual information and average entropy
mi = 0;
avgent = 0;
B = zeros(1, num_clust);
for i = 1:num_clust
    %number of points in cluster 'i'
    index_clust = (H == clusters(i));
    B(i) = sum(index_clust);
    for j = 1:num_class
        %%number of points in class 'j' that end up in cluster 'i'
        nij = sum(index_clust & (T == classes(j)));
        % Empty cells contribute nothing (and would give log2(0)).
        if nij ~= 0
            mi = mi + nij/N * log2(N*nij/(B(i)*D(j)));
            avgent = avgent - (B(i)/N) * (nij/B(i)) * log2(nij/B(i));
        end
    end
end

%%class entropy and clustering entropy
class_ent = sum(D/N .* log2(N./D));
clust_ent = sum(B/N .* log2(N./B));

%%normalized mutual information
denom = clust_ent + class_ent;
if denom > 0
    nmi = 2*mi / denom;
elseif N > 0
    % Both labelings put every sample in one group: identical partitions.
    nmi = 1;
else
    % No samples at all: the score is undefined.
    nmi = NaN;
end
function Cont=Contingency(Mem1,Mem2)
%CONTINGENCY Contingency (cross-count) table of two labelings.
%
%Cont=Contingency(Mem1,Mem2)
%Mem1, Mem2 - vectors of positive integer labels with equally many elements.
%Cont       - max(Mem1)-by-max(Mem2) matrix; Cont(a,b) is the number of
%             positions i with Mem1(i)==a and Mem2(i)==b.
%
% Raises an error when fewer than two arguments are given, when an
% argument is not a vector, or when the two vectors differ in length.
if nargin < 2 || min(size(Mem1)) > 1 || min(size(Mem2)) > 1
    error('Contingency: Requires two vector arguments')
end
% Without this check, surplus entries of a longer Mem2 would be ignored
% silently while still enlarging the table through max(Mem2).
if length(Mem1) ~= length(Mem2)
    error('Contingency: The two vectors must have the same length')
end
if isempty(Mem1)
    Cont = zeros(0,0);
    return
end
% Every (Mem1(i), Mem2(i)) pair adds one to its cell.
Cont = accumarray([Mem1(:) Mem2(:)], 1, [max(Mem1) max(Mem2)]);
function [AR,RI,MI,HI]=RandIndex(c1,c2)
%RANDINDEX - calculates Rand Indices to compare two partitions
% ARI=RANDINDEX(c1,c2), where c1,c2 are vectors listing the
% class membership, returns the "Hubert & Arabie adjusted Rand index".
% [AR,RI,MI,HI]=RANDINDEX(c1,c2) returns the adjusted Rand index,
% the unadjusted Rand index, "Mirkin's" index and "Hubert's" index.
%
% Raises an error when fewer than two arguments are given or when an
% argument is not a vector.
% With fewer than two samples there are no pairs to compare; the result
% is then defined as AR=0, RI=1, MI=0, HI=1 (no pair disagrees).
%
% See L. Hubert and P. Arabie (1985) "Comparing Partitions" Journal of
% Classification 2:193-218
%(C) David Corney (2000) [email protected]
if nargin < 2 || min(size(c1)) > 1 || min(size(c2)) > 1
    error('RandIndex: Requires two vector arguments')
end
C=Contingency(c1,c2); %form contingency matrix
n=sum(C(:));
nis=sum(sum(C,2).^2); %sum of squares of sums of rows
njs=sum(sum(C,1).^2); %sum of squares of sums of columns
% Total number of pairs of entities. The closed form equals nchoosek(n,2)
% but, unlike nchoosek, does not raise an error when n < 2.
t1=n*(n-1)/2;
if t1 == 0
    % No pairs at all: avoid the 0/0 divisions below.
    AR=0;
    RI=1;
    MI=0;
    HI=1;
    return
end
t2=sum(C(:).^2); %sum over rows & columns of nij^2
t3=.5*(nis+njs);
%Expected index (for adjustment)
nc=(n*(n^2+1)-(n+1)*nis-(n+1)*njs+2*(nis*njs)/n)/(2*(n-1));
A=t1+t2-t3; %no. agreements
D= -t2+t3; %no. disagreements
if t1==nc
    AR=0; %avoid division by zero; if k=1, define Rand = 0
else
    AR=(A-nc)/(t1-nc); %adjusted Rand - Hubert & Arabie 1985
end
RI=A/t1; %Rand 1971 %Probability of agreement
MI=D/t1; %Mirkin 1970 %p(disagreement)
HI=(A-D)/t1; %Hubert 1977 %p(agree)-p(disagree)