k_means算法基本原理

1.算法首先随机地选择k个对象，每个对象分别作为一个初始簇的中心（或平均值）。
2.对剩余的每个对象根据其与各个簇中心的距离，将它赋给最近的簇。
3.重新计算每个簇的簇中心。
4.重复第2，3步，直到簇中心收敛（不收敛则达到最大迭代次数后结束）。

package k_means;
import java.util.ArrayList;
import java.util.List;
import java.util.function.BiFunction;
/**
 * Generic k-means clustering.
 *
 * <p>The caller supplies the distance formula ({@link #setDistance}) and the
 * cluster-center formula ({@link #setClusterCenter}); this class only drives the
 * assign / re-center loop until the centers stop moving or the iteration cap is hit.
 *
 * @param <K> type of the data points (cluster centers are of the same type)
 */
public class K_Means <K> {
	// Current clusters; rebuilt by initialCluster on every clustering run.
	List<Cluster<K>> clusterList;
	// Distance formula between two points / a point and a center.
	Distance<K> distance;
	// Formula that derives a cluster center from the cluster's members.
	ClusterCenter<K> clusterCenter;
	// Convergence threshold: iteration stops once the summed center movement drops below it.
	double accuracy=0.001;
	// Number of iterations performed in the current run.
	int count=0;
	// Upper bound on the number of iterations.
	int MAXCOUNT=1000;
	// Data points of the current run.
	ArrayList<K> dataList;
	public K_Means() {
	}
	/**
	 * Clusters {@code list} into {@code num} clusters.
	 *
	 * <p>Prints every iteration via {@link #show()} and, at the end, the members of
	 * each cluster.
	 *
	 * @param list data points to cluster
	 * @param num  number of clusters; must satisfy {@code 1 < num < list.size()}
	 * @return the resulting clusters, or {@code null} when {@code list} is
	 *         {@code null} or {@code num} is out of range
	 * @throws IllegalStateException if the distance or cluster-center formula has not been set
	 */
	public List<Cluster<K>> clustering(ArrayList<K> list,int num ) {
		if(list==null||num<=1||num>=list.size()) {
			return null;
		}
		if(distance==null||clusterCenter==null) {
			throw new IllegalStateException("setDistance and setClusterCenter must be called before clustering");
		}
		this.dataList=list;
		// Reset so that a reused instance does not start with a stale iteration count.
		this.count=0;
		initialCluster(dataList, num);
		while(!criterionFunction()) {
			upCluster();
			count++;
			this.show();
		}
		for(Cluster<K> cluster:clusterList) {
			System.out.println(cluster.getItems());
		}
		return clusterList;
	}
	/**
	 * Performs one iteration: empties every cluster, reassigns each data point to
	 * its nearest cluster, then recomputes the cluster centers.
	 */
	public void upCluster() {
		for(Cluster<K> clu:clusterList) {
			clu.clearItems();
		}
		for(K item:dataList) {
			recentClusters(item).add(item);
		}
		for(Cluster<K> clu:clusterList) {
			if(clu.getItems().isEmpty()) {
				// A center formula cannot be evaluated on an empty member list
				// (a mean would divide by zero), so the cluster keeps its current
				// center; going through upCenter still records it as the old center.
				final K unchanged=clu.getCenter();
				clu.upCenter(items->unchanged);
			} else {
				clu.upCenter(clusterCenter);
			}
		}
	}
	/**
	 * Stop criterion of the iteration.
	 *
	 * @return {@code true} when {@code MAXCOUNT} iterations have been performed, or
	 *         when the summed distance between each cluster's old and current center
	 *         is below {@code accuracy}; {@code false} otherwise, including before
	 *         the first iteration (no old center recorded yet)
	 */
	public boolean criterionFunction() {
		if(count>=MAXCOUNT) {
			return true;
		}
		if(clusterList.get(0).getOldCenter()==null) {
			return false;
		}
		double moved=0.0;
		for(Cluster<K> cluster:clusterList) {
			moved+=distance.apply(cluster.getOldCenter(), cluster.getCenter()).doubleValue();
		}
		return moved<this.accuracy;
	}
	/**
	 * Finds the cluster whose center is nearest to {@code itme}.
	 *
	 * @param itme the data point to place
	 * @return the nearest cluster; {@code null} if no cluster yields a distance
	 *         smaller than {@code Double.MAX_VALUE}
	 */
	public Cluster<K> recentClusters(K itme) {
		Cluster<K> recent=null;
		double best=Double.MAX_VALUE;
		for(Cluster<K> cluster:clusterList) {
			double candidate=distance.apply(itme, cluster.getCenter()).doubleValue();
			if(candidate<best) {
				best=candidate;
				recent=cluster;
			}
		}
		return recent;
	}
	/**
	 * Creates {@code num} clusters whose initial centers are distinct, randomly
	 * chosen elements of {@code dataList}.
	 *
	 * @param dataList data points to pick the initial centers from
	 * @param num      number of clusters to create
	 * @throws IllegalArgumentException if {@code num} exceeds {@code dataList.size()}
	 */
	public void initialCluster(ArrayList<K> dataList,int num) {
		clusterList=new ArrayList<Cluster<K>>();
		// Valid list indices are 0 .. size-1, both inclusive.
		for(Integer index:K_Means.randomMumber(0,dataList.size()-1,num)) {
			Cluster<K> cluster=new Cluster<K>();
			cluster.setCenter(dataList.get(index));
			clusterList.add(cluster);
		}
	}
	/**
	 * Picks {@code num} distinct random integers from the closed range
	 * {@code [start, end]}.
	 *
	 * @param start smallest value that may be returned
	 * @param end   largest value that may be returned
	 * @param num   how many distinct values to pick; a non-positive value yields an empty list
	 * @return the picked values, in the order they were drawn
	 * @throws IllegalArgumentException if the range holds fewer than {@code num} values
	 */
	public static ArrayList<Integer> randomMumber(int start,int end,int num){
		// long arithmetic so that extreme bounds cannot overflow.
		long range=(long)end-start+1;
		if(num>range) {
			throw new IllegalArgumentException("cannot pick "+num+" distinct values from ["+start+", "+end+"]");
		}
		ArrayList<Integer> list =new ArrayList<Integer>();
		while(list.size()<num) {
			int tmp=(int)(start+(long)(Math.random()*range));
			// Rejection sampling: redraw on duplicates.
			if(!list.contains(tmp)) {
				list.add(tmp);
			}
		}
		return list;
	}
	/** Sets the distance formula used to compare points and centers. */
	public void setDistance(Distance<K> distance) {
		this.distance = distance;
	}
	/** Sets the formula used to recompute a cluster's center from its members. */
	public void setClusterCenter(ClusterCenter<K> clusterCenter) {
		this.clusterCenter = clusterCenter;
	}
	/** Prints the current iteration number followed by every cluster. */
	public void  show() {
		System.out.println("The ( "+ count+" )iteration");
		for(Cluster<K> clu:clusterList) {
			System.out.println(clu);
		}
	}
}
/**
 * A single cluster: its current center, the center it had before the most
 * recent update, and the items currently assigned to it.
 *
 * @param <C> type of the items (and of the center)
 */
class  Cluster<C> {
	// Items currently assigned to this cluster.
	private ArrayList<C> items =new ArrayList<C>();
	// Current cluster center.
	private C center;
	// Center before the most recent upCenter call; null until upCenter has run once.
	private C oldCenter;

	/** Returns the current center. */
	public C getCenter() {
		return this.center;
	}
	/** Replaces the current center without touching the old center. */
	public void setCenter(C center) {
		this.center=center;
	}
	/** Returns the center that was current before the last {@link #upCenter} call. */
	public C getOldCenter() {
		return this.oldCenter;
	}
	/** Returns the backing item list itself (not a copy). */
	public ArrayList<C> getItems() {
		return this.items;
	}
	/** Removes every item from this cluster. */
	public void  clearItems() {
		items.clear();
	}
	/** Appends one item to this cluster. */
	public void  add(C e) {
		items.add(e);
	}
	/** Appends all items of {@code cluster} to this cluster. */
	public void  addAll(Cluster<C> cluster) {
		items.addAll(cluster.items);
	}
	/**
	 * Remembers the current center as the old center, then recomputes the
	 * center from the current items using {@code clusterCenter}.
	 *
	 * @param clusterCenter formula applied to this cluster's items
	 * @return the newly computed center
	 */
	public  C upCenter(ClusterCenter<C> clusterCenter){
		oldCenter=center;
		C recomputed=clusterCenter.upCenter(items);
		center=recomputed;
		return recomputed;
	}
	/** Renders the center on the first line, followed by one tab-indented line per item. */
	@Override
	public String toString() {
		StringBuilder text=new StringBuilder("Cluster Center: ").append(center);
		for(C member:items) {
			text.append("\n\t").append(member);
		}
		return text.toString();
	}
}
/**
 * Distance formula: a {@link BiFunction} taking two values of type {@code D}
 * and returning their distance as a {@link Number}.
 *
 * @param <D> type of the values being compared
 */
abstract class Distance<D> implements BiFunction<D, D, Number> {
	// Intentionally empty: concrete subclasses supply apply(D, D).
}
/**
 * Cluster-center formula: derives the center of a cluster from its items.
 *
 * <p>Declared as a functional interface so that implementations can be
 * supplied as lambdas as well as anonymous classes.
 *
 * @param <T> type of the items (and of the resulting center)
 */
@FunctionalInterface
interface ClusterCenter<T> {
	/**
	 * Computes the center for the given items.
	 *
	 * @param items the items currently assigned to the cluster
	 * @return the center derived from {@code items}
	 */
	T upCenter(ArrayList<T> items);
}

puclic class TestK_Means{
 
	public static void main(String[] args) {
		ArrayList<Data> dataList =new ArrayList<Data>();
		dataList.add(new Data(1,1,1));
		dataList.add(new Data(2,2,1));
		dataList.add(new Data(3,1,2));
		dataList.add(new Data(4,2,2));
		dataList.add(new Data(5,4,3));
		dataList.add(new Data(6,5,3));
		dataList.add(new Data(7,4,4));
		dataList.add(new Data(8,5,4));
		System.out.print(randomMumber(1,8,5));
		K_Means<Data> k=new K_Means<Data>();

		k.setDistance(new Distance<Data>() {
			public Number apply(Data t, Data u) {
				//欧氏距离
				return Math.sqrt((t.x-u.x)*(t.x-u.x)+(t.y-u.y)*(t.y-u.y));
			}
		});
		k.setClusterCenter(new ClusterCenter<Data> () {
			public Data upCenter(ArrayList<Data> items) {
				double sumx=0.0,sumy=0.0;
				for(Data itme:items) {
					sumx+=itme.x;
					sumy+=itme.y;
				}
				//新的簇中心
				return new Data(0,sumx/items.size(), sumy/items.size());
			}
		});
		
		k.clustering(dataList, 2);
	}
}
/**
 * Test data point: an integer id plus 2-D coordinates.
 */
class Data{
	// Coordinates of the point.
	double x, y;
	// Identifier shown in the printed form.
	int id;

	/**
	 * @param id identifier of the point
	 * @param x  first coordinate
	 * @param y  second coordinate
	 */
	public Data(int id,double x,double y) {
		this.x=x;
		this.y=y;
		this.id=id;
	}
	/** Renders the point as {@code Data <id>:(<x>,<y>)}. */
	@Override
	public String toString() {
		StringBuilder text=new StringBuilder("Data ");
		text.append(id).append(":(").append(x).append(",").append(y).append(")");
		return text.toString();
	}
}

测试结果

The ( 1 )iteration
Cluster Center: Data 0:(2.7142857142857144,2.4285714285714284)
	Data 1:(1.0,1.0)
	Data 2:(2.0,1.0)
	Data 3:(1.0,2.0)
	Data 4:(2.0,2.0)
	Data 5:(4.0,3.0)
	Data 7:(4.0,4.0)
	Data 8:(5.0,4.0)
Cluster Center: Data 0:(5.0,3.0)
	Data 6:(5.0,3.0)
The ( 2 )iteration
Cluster Center: Data 0:(1.5,1.5)
	Data 1:(1.0,1.0)
	Data 2:(2.0,1.0)
	Data 3:(1.0,2.0)
	Data 4:(2.0,2.0)
Cluster Center: Data 0:(4.5,3.5)
	Data 5:(4.0,3.0)
	Data 6:(5.0,3.0)
	Data 7:(4.0,4.0)
	Data 8:(5.0,4.0)
The ( 3 )iteration
Cluster Center: Data 0:(1.5,1.5)
	Data 1:(1.0,1.0)
	Data 2:(2.0,1.0)
	Data 3:(1.0,2.0)
	Data 4:(2.0,2.0)
Cluster Center: Data 0:(4.5,3.5)
	Data 5:(4.0,3.0)
	Data 6:(5.0,3.0)
	Data 7:(4.0,4.0)
	Data 8:(5.0,4.0)

在第三次迭代时簇中心已经收敛，迭代结束。

k-means算法的性能分析

主要优点：
- 是解决聚类问题的一种经典算法，简单、快速。
- 对处理大数据集，该算法是相对可伸缩和高效率的。
- 当结果簇是密集的，它的效果较好。

主要缺点：
- 在簇的平均值被定义的情况下才能使用，可能不适用于某些应用。
- 必须事先给出k（要生成的簇的数目），而且对初值敏感，对于不同的初始值，可能会导致不同结果（初始值的选取也会影响收敛速度）。
- 不适合于发现非凸面形状的簇或者大小差别很大的簇。而且，它对于“噪声”和孤立点数据是敏感的。

k_means算法

k_means算法基本原理

测试结果

k-means算法的性能分析

猜你喜欢