继续使用4.1中的数据,聚类算法使用的数据结构是树。

4.3.1 树状图:基本的聚类数据结构
定义集合[d,k,{…}],d代表每个聚类中元素间相似度的最小值(距离的最大值),k代表聚类的数量,{…}是每个聚类集合的集合。
层次凝聚算法采用自底而上的方向:
1、初始树状图,每个元素独占一个聚类。
2、d++,形成新的聚类
3、树状图更新(长高一层)
4、重复2、3直到所有元素都属于一个大类
这个数据结构的代码:
package iweb2.ch4.clustering.hierarchical;
import iweb2.ch4.model.Cluster;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
public class Dendrogram
{
/**
* 分层聚类
*/
private Map<Integer, ClusterSet> entryMap;
/**
* 每层聚类叫什么名字
*/
private Map<Integer, String> levelLabels;
/**
* 下一层高度
*/
private Integer nextLevel;
/**
* 这个树状图叫什么
*/
private String levelLabelName;
public Dendrogram(String levelLabelName)
{
entryMap = new LinkedHashMap<Integer, ClusterSet>();
levelLabels = new LinkedHashMap<Integer, String>();
nextLevel = 1;
this.levelLabelName = levelLabelName;
}
public int addLevel(String label, Cluster cluster)
{
List<Cluster> values = new ArrayList<Cluster>();
values.add(cluster);
return addLevel(label, values);
}
/**
* * Creates a new dendrogram level using copies of provided clusters.
* 使用提供的聚类创建一个新的树状图
* @param label
* @param clusters
* @return 新的高度
*/
public int addLevel(String label, Collection<Cluster> clusters)
{
ClusterSet clusterSet = new ClusterSet();
for (Cluster c : clusters)
{
// copy cluster before adding - over time cluster elements may change
// but for dendrogram we want to keep current state.
// 在添加之前复制聚类——随着时间推移,聚类中的元素可能发生变化
// 但对于树状图,我们希望能保持当前的状态
clusterSet.add(c.copy());
}
int level = nextLevel;
entryMap.put(level, clusterSet);
levelLabels.put(level, label);
nextLevel++;
return level;
}
/**
* Replaces clusters in the specified level. If level doesn't exist it will
* be created.
* 替换某一层聚类
*
* @param level dendrogram level.
* @param label level description.
* @param clusters clusters for the level.
* @return
*/
public void setLevel(int level, String label, Collection<Cluster> clusters)
{
ClusterSet clusterSet = new ClusterSet();
for (Cluster c : clusters)
{
clusterSet.add(c.copy());
}
System.out.println("Setting cluster level: " + level);
entryMap.put(level, clusterSet);
levelLabels.put(level, label);
if (level >= nextLevel)
{
nextLevel = level + 1;
}
}
public void printAll()
{
for (Map.Entry<Integer, ClusterSet> e : entryMap.entrySet())
{
Integer level = e.getKey();
print(level);
}
}
public void print(int level)
{
String label = levelLabels.get(level);
ClusterSet clusters = entryMap.get(level);
System.out.println("Clusters for: level=" + level + ", " +
levelLabelName + "=" + label);
for (Cluster c : clusters.getAllClusters())
{
if (c.getElements().size() > 1)
{
System.out.println("____________________________________________________________\n");
System.out.println(c.getElementsAsString());
System.out.println("____________________________________________________________\n\n");
}
}
}
public int getTopLevel()
{
return nextLevel - 1;
}
public List<Integer> getAllLevels()
{
return new ArrayList<Integer>(entryMap.keySet());
}
public String getLabelForLevel(int level)
{
return levelLabels.get(level);
}
public List<Cluster> getClustersForLevel(int level)
{
ClusterSet cs = entryMap.get(level);
return cs.getAllClusters();
}
}
4.3.2 基于链接的算法概况
层次凝聚聚类算法的主程序:
package com.hankcs;
import iweb2.ch4.clustering.hierarchical.AverageLinkAlgorithm;
import iweb2.ch4.clustering.hierarchical.Dendrogram;
import iweb2.ch4.clustering.hierarchical.MSTSingleLinkAlgorithm;
import iweb2.ch4.clustering.hierarchical.SingleLinkAlgorithm;
import iweb2.ch4.data.SFData;
import iweb2.ch4.data.SFDataset;
import iweb2.ch4.model.DataPoint;
public class ch4_2_AverageLink
{
public static void main(String[] args) throws Exception
{
// 加载数据
SFDataset ds = SFData.createDataset();
DataPoint[] dps = ds.getData();
double[][] adjMatrix = ds.getAdjacencyMatrix();
// 单链接聚类
SingleLinkAlgorithm sla = new SingleLinkAlgorithm(dps,adjMatrix);
Dendrogram dendroSLA = sla.cluster();
dendroSLA.print(4);
// 带MST的单链接聚类
MSTSingleLinkAlgorithm sla2 = new MSTSingleLinkAlgorithm(dps,adjMatrix);
Dendrogram dendroSLA2 = sla2.cluster();
dendroSLA2.print(4);
// 平均链接聚类
AverageLinkAlgorithm ala = new AverageLinkAlgorithm(dps,adjMatrix);
Dendrogram dendroALA = ala.cluster();
dendroALA.print(4);
}
}
这里用到了三种算法,处理的数据都是4.1中的数据。
输出:
From file: C:/iWeb2/data/ch04/clusteringSF.dat
Using attribute names: [Age, IncomeRange, Education, Skills, Social, isPaid]
Loaded 20 data points.
Clusters for: level=4, Distance=3.0
____________________________________________________________
{Alexandra,
Aurora}
____________________________________________________________
____________________________________________________________
{Bill,
Bob,
Carl,
Catherine,
Charlie}
____________________________________________________________
____________________________________________________________
{Constantine,
Elena,
Eric}
____________________________________________________________
____________________________________________________________
{George,
Jack,
Maria}
____________________________________________________________
____________________________________________________________
{Albert,
Athena,
Babis}
____________________________________________________________
____________________________________________________________
{John,
Lukas}
____________________________________________________________
Setting cluster level: 2
Setting cluster level: 3
Setting cluster level: 4
Setting cluster level: 4
Setting cluster level: 5
Setting cluster level: 6
Setting cluster level: 6
Setting cluster level: 6
Setting cluster level: 7
Clusters for: level=4, Distance=1.7320508075688772
____________________________________________________________
{Bill,
Carl,
Charlie,
Catherine}
____________________________________________________________
____________________________________________________________
{Constantine,
Elena,
Eric}
____________________________________________________________
____________________________________________________________
{John,
Lukas}
____________________________________________________________
____________________________________________________________
{Albert,
Athena}
____________________________________________________________
Clusters for: level=4, Distance=2.0
____________________________________________________________
{Bill,
Carl,
Catherine}
____________________________________________________________
____________________________________________________________
{Constantine,
Elena,
Eric}
____________________________________________________________
____________________________________________________________
{George,
Jack}
____________________________________________________________
____________________________________________________________
{Albert,
Athena}
____________________________________________________________
____________________________________________________________
{John,
Lukas}
____________________________________________________________
4.3.3 单链接算法
单链接算法的目的是找出树状图中的连通分量,并且将这两个顶点合并。具体说来,先构造一个邻接矩阵,代表元素ij之间的距离。然后定义距离阀值,如果距离小于阀值,则两个顶点合并到一个聚类中。最终的结果是层次数递增的多个聚类。
package iweb2.ch4.clustering.hierarchical;
import iweb2.ch4.model.Cluster;
import iweb2.ch4.model.DataPoint;
import java.util.ArrayList;
import java.util.List;
/**
* A hierarchical agglomerative clustering algorithm based on single link
* 单链接算法
*/
public class SingleLinkAlgorithm
{
/**
* 所有元素
*/
private DataPoint[] elements;
/**
* 邻接矩阵
*/
private double[][] a;
// Hierarchical Agglomerative Algorithm
// 单链接凝聚算法
public SingleLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix)
{
this.elements = elements;
this.a = adjacencyMatrix;
}
/**
* 聚类
* @return 树状图
*/
public Dendrogram cluster()
{
Dendrogram dnd = new Dendrogram("Distance");
double d = 0;
// initially load all elements as individual clusters
// 初始每个元素独占一个聚类
List<Cluster> initialClusters = new ArrayList<Cluster>();
for (DataPoint e : elements)
{
Cluster c = new Cluster(e);
initialClusters.add(c);
}
dnd.addLevel(String.valueOf(d), initialClusters);
d = 1.0;
int k = initialClusters.size();
while (k > 1)
{
int oldK = k;
List<Cluster> clusters = buildClusters(d);
k = clusters.size();
if (oldK != k)
{
dnd.addLevel(String.valueOf(d), clusters);
}
d = d + 1;
}
return dnd;
}
// Implements Single Link Technique
/**
* 建立聚类
* @param distanceThreshold 距离最大值
* @return 合并后的聚类
*/
private List<Cluster> buildClusters(double distanceThreshold)
{
boolean[] usedElementFlags = new boolean[elements.length];
List<Cluster> clusters = new ArrayList<Cluster>();
for (int i = 0, n = a.length; i < n; i++)
{
List<DataPoint> clusterPoints = new ArrayList<DataPoint>();
for (int j = i, k = a.length; j < k; j++)
{
// 如果两点距离小于阀值
if (a[i][j] <= distanceThreshold && usedElementFlags[j] == false)
{
// 准备加入同一个聚类
clusterPoints.add(elements[j]);
usedElementFlags[j] = true;
}
}
if (clusterPoints.size() > 0)
{
// 加入同一个聚类
Cluster c = new Cluster(clusterPoints);
clusters.add(c);
}
}
return clusters;
}
public static void main(String[] args)
{
//Define data
DataPoint[] elements = new DataPoint[5];
elements[0] = new DataPoint("A", new double[]{});
elements[1] = new DataPoint("B", new double[]{});
elements[2] = new DataPoint("C", new double[]{});
elements[3] = new DataPoint("D", new double[]{});
elements[4] = new DataPoint("E", new double[]{});
double[][] a = new double[][]{
{0, 1, 2, 2, 3},
{1, 0, 2, 4, 3},
{2, 2, 0, 1, 5},
{2, 4, 1, 0, 3},
{3, 3, 5, 3, 0}
};
SingleLinkAlgorithm ca = new SingleLinkAlgorithm(elements, a);
Dendrogram dnd = ca.cluster();
dnd.printAll();
//dnd.print(3);
}
}
4.3.4 平均链接算法
跟单链接算法不同的是,合并的条件是聚类的平均距离是否低于阀值:
/**
* 合并聚类
* @param distanceThreshold
*/
private void mergeClusters(double distanceThreshold)
{
int nClusters = allClusters.size();
ObjectToIndexMapping<Cluster> idxMapping =
new ObjectToIndexMapping<Cluster>();
/**
* 聚类之间的距离
*/
double[][] clusterDistances = new double[nClusters][nClusters];
for (int i = 0, n = a.length; i < n; i++)
{
for (int j = i + 1, k = a.length; j < k; j++)
{
double d = a[i][j];
if (d > 0)
{
DataPoint e1 = elements[i];
DataPoint e2 = elements[j];
Cluster c1 = allClusters.findClusterByElement(e1);
Cluster c2 = allClusters.findClusterByElement(e2);
if (!c1.equals(c2))
{
int ci = idxMapping.getIndex(c1);
int cj = idxMapping.getIndex(c2);
clusterDistances[ci][cj] += d;
clusterDistances[cj][ci] += d;
}
}
}
}
boolean[] merged = new boolean[clusterDistances.length];
for (int i = 0, n = clusterDistances.length; i < n; i++)
{
for (int j = i + 1, k = clusterDistances.length; j < k; j++)
{
Cluster ci = idxMapping.getObject(i);
Cluster cj = idxMapping.getObject(j);
int ni = ci.size();
int nj = cj.size();
// 聚类间的平均距离
clusterDistances[i][j] = clusterDistances[i][j] / (ni * nj);
clusterDistances[j][i] = clusterDistances[i][j];
// merge clusters if distance is below the threshold
if (merged[i] == false && merged[j] == false)
{
if (clusterDistances[i][j] <= distanceThreshold)
{
allClusters.remove(ci);
allClusters.remove(cj);
Cluster mergedCluster = new Cluster(ci, cj);
allClusters.add(mergedCluster);
merged[i] = true;
merged[j] = true;
}
}
}
}
}
4.3.5 最小生成树算法
MST单链接算法是单链接算法基于最小生成树的实现,用Prim算法将邻接矩阵构造成MST,根据MST中权重由小到大的顺序合并聚类(也就是最近的两个元素所属的聚类合并,只要它们分属不同的聚类)。
package iweb2.ch4.clustering.hierarchical;
import iweb2.ch4.model.Cluster;
import iweb2.ch4.model.DataPoint;
public class MSTSingleLinkAlgorithm
{
private DataPoint[] elements;
private double[][] a;
private double[][] m;
private ClusterSet allClusters;
public MSTSingleLinkAlgorithm(DataPoint[] elements, double[][] adjacencyMatrix)
{
this.elements = elements;
this.a = adjacencyMatrix;
this.allClusters = new ClusterSet();
}
public Dendrogram cluster()
{
// 建立元素的最小生成树
m = (new MST()).buildMST(a);
Dendrogram dnd = new Dendrogram("Distance");
double d = 0.0;
// initially load all elements as individual clusters
for (DataPoint e : elements)
{
Cluster c = new Cluster(e);
allClusters.add(c);
}
int lastDndLevel = dnd.addLevel(String.valueOf(d), allClusters.getAllClusters());
double previousD = d;
while (allClusters.size() > 1)
{
d = mergeTwoClosestClusters();
if (previousD == d)
{
dnd.setLevel(lastDndLevel, String.valueOf(d), allClusters.getAllClusters());
}
else
{
lastDndLevel = dnd.addLevel(String.valueOf(d), allClusters.getAllClusters());
}
previousD = d;
}
return dnd;
}
/**
* 合并最相近的两个聚类(最近的两个点所属的聚类)
* @return
*/
private double mergeTwoClosestClusters()
{
int minI = -1;
int minJ = -1;
double minWeight = Double.POSITIVE_INFINITY;
for (int i = 0, n = m.length; i < n; i++)
{
for (int j = 0, k = m.length; j < k; j++)
{
if (m[i][j] >= 0 && minWeight > m[i][j])
{
minI = i;
minJ = j;
minWeight = m[i][j];
}
}
}
double d = Double.NaN;
if (minI > -1)
{
DataPoint e1 = elements[minI];
Cluster c1 = allClusters.findClusterByElement(e1);
DataPoint e2 = elements[minJ];
Cluster c2 = allClusters.findClusterByElement(e2);
allClusters.remove(c1);
allClusters.remove(c2);
d = minWeight;
Cluster mergedCluster =
new Cluster(c1, c2);
allClusters.add(mergedCluster);
m[minI][minJ] = -1; // remove link. Using -1 because 0 is a valid distance.
m[minJ][minI] = -1; // remove link. Using -1 because 0 is a valid distance.
}
return d;
}
public static void main(String[] args)
{
//Define data
DataPoint[] elements = new DataPoint[5];
elements[0] = new DataPoint("A", new double[]{});
elements[1] = new DataPoint("B", new double[]{});
elements[2] = new DataPoint("C", new double[]{});
elements[3] = new DataPoint("D", new double[]{});
elements[4] = new DataPoint("E", new double[]{});
double[][] a = new double[][]{
{0, 1, 2, 2, 3},
{1, 0, 2, 4, 3},
{2, 2, 0, 1, 5},
{2, 4, 1, 0, 3},
{3, 3, 5, 3, 0}
};
MSTSingleLinkAlgorithm ca = new MSTSingleLinkAlgorithm(elements, a);
Dendrogram dnd = ca.cluster();
dnd.printAll();
}
}
单链接算法会导致“连锁效应”,因为每次合并的是最近的两个点所属的聚类,那么无论这两个聚类中其他点多么远,它们总会被合并。
码农场