继续使用4.1中的数据,聚类算法使用的数据结构是树。
4.3.1 树状图:基本的聚类数据结构
定义集合[d,k,{…}],d代表每个聚类中元素间相似度的最小值(距离的最大值),k代表聚类的数量,{…}是每个聚类集合的集合。
层次凝聚算法采用自底而上的方向:
1、初始树状图,每个元素独占一个聚类。
2、d++,形成新的聚类
3、树状图更新(长高一层)
4、重复2、3直到所有元素都属于一个大类
这个数据结构的代码:
package iweb2.ch4.clustering.hierarchical; import iweb2.ch4.model.Cluster; import java.util.ArrayList; import java.util.Collection; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; public class Dendrogram { /** * 分层聚类 */ private Map<Integer, ClusterSet> entryMap; /** * 每层聚类叫什么名字 */ private Map<Integer, String> levelLabels; /** * 下一层高度 */ private Integer nextLevel; /** * 这个树状图叫什么 */ private String levelLabelName; public Dendrogram(String levelLabelName) { entryMap = new LinkedHashMap<Integer, ClusterSet>(); levelLabels = new LinkedHashMap<Integer, String>(); nextLevel = 1; this.levelLabelName = levelLabelName; } public int addLevel(String label, Cluster cluster) { List<Cluster> values = new ArrayList<Cluster>(); values.add(cluster); return addLevel(label, values); } /** * * Creates a new dendrogram level using copies of provided clusters. * 使用提供的聚类创建一个新的树状图 * @param label * @param clusters * @return 新的高度 */ public int addLevel(String label, Collection<Cluster> clusters) { ClusterSet clusterSet = new ClusterSet(); for (Cluster c : clusters) { // copy cluster before adding - over time cluster elements may change // but for dendrogram we want to keep current state. // 在添加之前复制聚类——随着时间推移,聚类中的元素可能发生变化 // 但对于树状图,我们希望能保持当前的状态 clusterSet.add(c.copy()); } int level = nextLevel; entryMap.put(level, clusterSet); levelLabels.put(level, label); nextLevel++; return level; } /** * Replaces clusters in the specified level. If level doesn't exist it will * be created. * 替换某一层聚类 * * @param level dendrogram level. * @param label level description. * @param clusters clusters for the level. * @return */ public void setLevel(int level, String label, Collection<Cluster> clusters) { ClusterSet clusterSet = new ClusterSet(); for (Cluster c : clusters) { clusterSet.add(c.copy()); } System.out.println("Setting cluster level: " + level); entryMap.put(level, clusterSet); levelLabels.put(level, label); if (level >= nextLevel) { nextLevel = level + 1; } } public void printAll() { for (Map.Entry<Integer, ClusterSet> e : entryMap.entrySet()) { Integer level = e.getKey(); print(level); } } public void print(int level) { String label = levelLabels.get(level); ClusterSet clusters = entryMap.get(level); System.out.println("Clusters for: level=" + level + ", " + levelLabelName + "=" + label); for (Cluster c : clusters.getAllClusters()) { if (c.getElements().size() > 1) { System.out.println("____________________________________________________________\n"); System.out.println(c.getElementsAsString()); System.out.println("____________________________________________________________\n\n"); } } } public int getTopLevel() { return nextLevel - 1; } public List<Integer> getAllLevels() { return new ArrayList<Integer>(entryMap.keySet()); } public String getLabelForLevel(int level) { return levelLabels.get(level); } public List<Cluster> getClustersForLevel(int level) { ClusterSet cs = entryMap.get(level); return cs.getAllClusters(); } }
4.3.2 基于链接的算法概况
层次凝聚聚类算法的主程序:
package com.hankcs; import iweb2.ch4.clustering.hierarchical.AverageLinkAlgorithm; import iweb2.ch4.clustering.hierarchical.Dendrogram; import iweb2.ch4.clustering.hierarchical.MSTSingleLinkAlgorithm; import iweb2.ch4.clustering.hierarchical.SingleLinkAlgorithm; import iweb2.ch4.data.SFData; import iweb2.ch4.data.SFDataset; import iweb2.ch4.model.DataPoint; public class ch4_2_AverageLink { public static void main(String[] args) throws Exception { // 加载数据 SFDataset ds = SFData.createDataset(); DataPoint[] dps = ds.getData(); double[][] adjMatrix = ds.getAdjacencyMatrix(); // 单链接聚类 SingleLinkAlgorithm sla = new SingleLinkAlgorithm(dps,adjMatrix); Dendrogram dendroSLA = sla.cluster(); dendroSLA.print(4); // 带MST的单链接聚类 MSTSingleLinkAlgorithm sla2 = new MSTSingleLinkAlgorithm(dps,adjMatrix); Dendrogram dendroSLA2 = sla2.cluster(); dendroSLA2.print(4); // 平均链接聚类 AverageLinkAlgorithm ala = new AverageLinkAlgorithm(dps,adjMatrix); Dendrogram dendroALA = ala.cluster(); dendroALA.print(4); } }
这里用到了三种算法,处理的数据都是4.1中的数据。
输出:
From file: C:/iWeb2/data/ch04/clusteringSF.dat Using attribute names: [Age, IncomeRange, Education, Skills, Social, isPaid] Loaded 20 data points. Clusters for: level=4, Distance=3.0 ____________________________________________________________ {Alexandra, Aurora} ____________________________________________________________ ____________________________________________________________ {Bill, Bob, Carl, Catherine, Charlie} ____________________________________________________________ ____________________________________________________________ {Constantine, Elena, Eric} ____________________________________________________________ ____________________________________________________________ {George, Jack, Maria} ____________________________________________________________ ____________________________________________________________ {Albert, Athena, Babis} ____________________________________________________________ ____________________________________________________________ {John, Lukas} ____________________________________________________________ Setting cluster level: 2 Setting cluster level: 3 Setting cluster level: 4 Setting cluster level: 4 Setting cluster level: 5 Setting cluster level: 6 Setting cluster level: 6 Setting cluster level: 6 Setting cluster level: 7 Clusters for: level=4, Distance=1.7320508075688772 ____________________________________________________________ {Bill, Carl, Charlie, Catherine} ____________________________________________________________ ____________________________________________________________ {Constantine, Elena, Eric} ____________________________________________________________ ____________________________________________________________ {John, Lukas} ____________________________________________________________ ____________________________________________________________ {Albert, Athena} ____________________________________________________________ Clusters for: level=4, Distance=2.0 ____________________________________________________________ {Bill, Carl, Catherine} ____________________________________________________________ ____________________________________________________________ {Constantine, Elena, Eric} ____________________________________________________________ ____________________________________________________________ {George, Jack} ____________________________________________________________ ____________________________________________________________ {Albert, Athena} ____________________________________________________________ ____________________________________________________________ {John, Lukas} ____________________________________________________________
4.3.3 单链接算法
单链接算法的目的是找出树状图中的连通分量,并且将这两个顶点合并。具体说来,先构造一个邻接矩阵,代表元素ij之间的距离。然后定义距离阀值,如果距离小于阀值,则两个顶点合并到一个聚类中。最终的结果是层次数递增的多个聚类。
package iweb2.ch4.clustering.hierarchical; import iweb2.ch4.model.Cluster; import iweb2.ch4.model.DataPoint; import java.util.ArrayList; import java.util.List; /** * A hierarchical agglomerative clustering algorithm based on single link * 单链接算法 */ public class SingleLinkAlgorithm { /** * 所有元素 */ private DataPoint[] elements; /** * 邻接矩阵 */ private double[][] a; // Hierarchical Agglomerative Algorithm // 单链接凝聚算法 public SingleLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix) { this.elements = elements; this.a = adjacencyMatrix; } /** * 聚类 * @return 树状图 */ public Dendrogram cluster() { Dendrogram dnd = new Dendrogram("Distance"); double d = 0; // initially load all elements as individual clusters // 初始每个元素独占一个聚类 List<Cluster> initialClusters = new ArrayList<Cluster>(); for (DataPoint e : elements) { Cluster c = new Cluster(e); initialClusters.add(c); } dnd.addLevel(String.valueOf(d), initialClusters); d = 1.0; int k = initialClusters.size(); while (k > 1) { int oldK = k; List<Cluster> clusters = buildClusters(d); k = clusters.size(); if (oldK != k) { dnd.addLevel(String.valueOf(d), clusters); } d = d + 1; } return dnd; } // Implements Single Link Technique /** * 建立聚类 * @param distanceThreshold 距离最大值 * @return 合并后的聚类 */ private List<Cluster> buildClusters(double distanceThreshold) { boolean[] usedElementFlags = new boolean[elements.length]; List<Cluster> clusters = new ArrayList<Cluster>(); for (int i = 0, n = a.length; i < n; i++) { List<DataPoint> clusterPoints = new ArrayList<DataPoint>(); for (int j = i, k = a.length; j < k; j++) { // 如果两点距离小于阀值 if (a[i][j] <= distanceThreshold && usedElementFlags[j] == false) { // 准备加入同一个聚类 clusterPoints.add(elements[j]); usedElementFlags[j] = true; } } if (clusterPoints.size() > 0) { // 加入同一个聚类 Cluster c = new Cluster(clusterPoints); clusters.add(c); } } return clusters; } public static void main(String[] args) { //Define data DataPoint[] elements = new DataPoint[5]; elements[0] = new DataPoint("A", new double[]{}); elements[1] = new DataPoint("B", new double[]{}); elements[2] = new DataPoint("C", new double[]{}); elements[3] = new DataPoint("D", new double[]{}); elements[4] = new DataPoint("E", new double[]{}); double[][] a = new double[][]{ {0, 1, 2, 2, 3}, {1, 0, 2, 4, 3}, {2, 2, 0, 1, 5}, {2, 4, 1, 0, 3}, {3, 3, 5, 3, 0} }; SingleLinkAlgorithm ca = new SingleLinkAlgorithm(elements, a); Dendrogram dnd = ca.cluster(); dnd.printAll(); //dnd.print(3); } }
4.3.4 平均链接算法
跟单链接算法不同的是,合并的条件是聚类的平均距离是否低于阀值:
/** * 合并聚类 * @param distanceThreshold */ private void mergeClusters(double distanceThreshold) { int nClusters = allClusters.size(); ObjectToIndexMapping<Cluster> idxMapping = new ObjectToIndexMapping<Cluster>(); /** * 聚类之间的距离 */ double[][] clusterDistances = new double[nClusters][nClusters]; for (int i = 0, n = a.length; i < n; i++) { for (int j = i + 1, k = a.length; j < k; j++) { double d = a[i][j]; if (d > 0) { DataPoint e1 = elements[i]; DataPoint e2 = elements[j]; Cluster c1 = allClusters.findClusterByElement(e1); Cluster c2 = allClusters.findClusterByElement(e2); if (!c1.equals(c2)) { int ci = idxMapping.getIndex(c1); int cj = idxMapping.getIndex(c2); clusterDistances[ci][cj] += d; clusterDistances[cj][ci] += d; } } } } boolean[] merged = new boolean[clusterDistances.length]; for (int i = 0, n = clusterDistances.length; i < n; i++) { for (int j = i + 1, k = clusterDistances.length; j < k; j++) { Cluster ci = idxMapping.getObject(i); Cluster cj = idxMapping.getObject(j); int ni = ci.size(); int nj = cj.size(); // 聚类间的平均距离 clusterDistances[i][j] = clusterDistances[i][j] / (ni * nj); clusterDistances[j][i] = clusterDistances[i][j]; // merge clusters if distance is below the threshold if (merged[i] == false && merged[j] == false) { if (clusterDistances[i][j] <= distanceThreshold) { allClusters.remove(ci); allClusters.remove(cj); Cluster mergedCluster = new Cluster(ci, cj); allClusters.add(mergedCluster); merged[i] = true; merged[j] = true; } } } } }
4.3.5 最小生成树算法
MST单链接算法是单链接算法基于最小生成树的实现,用Prim算法将邻接矩阵构造成MST,根据MST中权重由小到大的顺序合并聚类(也就是最近的两个元素所属的聚类合并,只要它们分属不同的聚类)。
package iweb2.ch4.clustering.hierarchical; import iweb2.ch4.model.Cluster; import iweb2.ch4.model.DataPoint; public class MSTSingleLinkAlgorithm { private DataPoint[] elements; private double[][] a; private double[][] m; private ClusterSet allClusters; public MSTSingleLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix) { this.elements = elements; this.a = adjacencyMatrix; this.allClusters = new ClusterSet(); } public Dendrogram cluster() { // 建立元素的最小生成树 m = (new MST()).buildMST(a); Dendrogram dnd = new Dendrogram("Distance"); double d = 0.0; // initially load all elements as individual clusters for (DataPoint e : elements) { Cluster c = new Cluster(e); allClusters.add(c); } int lastDndLevel = dnd.addLevel(String.valueOf(d), allClusters.getAllClusters()); double previousD = d; while (allClusters.size() > 1) { d = mergeTwoClosestClusters(); if (previousD == d) { dnd.setLevel(lastDndLevel, String.valueOf(d), allClusters.getAllClusters()); } else { lastDndLevel = dnd.addLevel(String.valueOf(d), allClusters.getAllClusters()); } previousD = d; } return dnd; } /** * 合并最相近的两个聚类(最近的两个点所属的聚类) * @return */ private double mergeTwoClosestClusters() { int minI = -1; int minJ = -1; double minWeight = Double.POSITIVE_INFINITY; for (int i = 0, n = m.length; i < n; i++) { for (int j = 0, k = m.length; j < k; j++) { if (m[i][j] >= 0 && minWeight > m[i][j]) { minI = i; minJ = j; minWeight = m[i][j]; } } } double d = Double.NaN; if (minI > -1) { DataPoint e1 = elements[minI]; Cluster c1 = allClusters.findClusterByElement(e1); DataPoint e2 = elements[minJ]; Cluster c2 = allClusters.findClusterByElement(e2); allClusters.remove(c1); allClusters.remove(c2); d = minWeight; Cluster mergedCluster = new Cluster(c1, c2); allClusters.add(mergedCluster); m[minI][minJ] = -1; // remove link. Using -1 because 0 is a valid distance. m[minJ][minI] = -1; // remove link. Using -1 because 0 is a valid distance. } return d; } public static void main(String[] args) { //Define data DataPoint[] elements = new DataPoint[5]; elements[0] = new DataPoint("A", new double[]{}); elements[1] = new DataPoint("B", new double[]{}); elements[2] = new DataPoint("C", new double[]{}); elements[3] = new DataPoint("D", new double[]{}); elements[4] = new DataPoint("E", new double[]{}); double[][] a = new double[][]{ {0, 1, 2, 2, 3}, {1, 0, 2, 4, 3}, {2, 2, 0, 1, 5}, {2, 4, 1, 0, 3}, {3, 3, 5, 3, 0} }; MSTSingleLinkAlgorithm ca = new MSTSingleLinkAlgorithm(elements, a); Dendrogram dnd = ca.cluster(); dnd.printAll(); } }
单链接算法会导致“连锁效应”,因为每次合并的是最近的两个点所属的聚类,那么无论这两个聚类中其他点多么远,它们总会被合并。