use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.
the class NaiveAgglomerativeHierarchicalClustering2 method run.
/**
 * Run the algorithm: a naive agglomerative clustering that repeatedly merges
 * the two closest active clusters (using the minimum of the pairwise
 * distances, i.e. single-linkage) until only {@code numclusters} clusters
 * remain.
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering containing one top-level cluster for every cluster that
 *         is still active when merging stops
 */
public Result run(Database db, Relation<O> relation) {
  DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();
  if (size > 0x10000) {
    throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
  }
  LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
  // Compute the initial (lower triangular) distance matrix.
  double[] scratch = new double[triangleSize(size)];
  DBIDArrayIter ix = ids.iter(), iy = ids.iter();
  // Position counter - rows are written consecutively, so the entry (x, y)
  // with y < x must end up at index triangleSize(x) + y, which is how the
  // matrix is addressed below.
  int pos = 0;
  for (int x = 0; ix.valid(); x++, ix.advance()) {
    iy.seek(0);
    for (int y = 0; y < x; y++, iy.advance()) {
      scratch[pos] = dq.distance(ix, iy);
      pos++;
    }
  }
  // Initialize space for result. An infinite height marks an object that is
  // still the representative of an active cluster; a finite height is the
  // distance at which the object was merged into another cluster.
  double[] height = new double[size];
  Arrays.fill(height, Double.POSITIVE_INFINITY);
  // Parent node, to track merges
  // have every object point to itself initially
  ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
  // Active clusters, when not trivial (singletons are not stored).
  Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
  // Repeat until everything merged, except the desired number of clusters:
  final int stop = size - numclusters;
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
  for (int i = 0; i < stop; i++) {
    // Find the closest pair of active clusters.
    double min = Double.POSITIVE_INFINITY;
    int minx = -1, miny = -1;
    for (int x = 0; x < size; x++) {
      if (height[x] < Double.POSITIVE_INFINITY) {
        continue; // x was already merged away
      }
      final int xbase = triangleSize(x);
      for (int y = 0; y < x; y++) {
        if (height[y] < Double.POSITIVE_INFINITY) {
          continue; // y was already merged away
        }
        final int idx = xbase + y;
        if (scratch[idx] < min) {
          min = scratch[idx];
          minx = x;
          miny = y;
        }
      }
    }
    assert (minx >= 0 && miny >= 0);
    // Avoid allocating memory, by reusing existing iterators:
    ix.seek(minx);
    iy.seek(miny);
    // Perform merge in data structure: x -> y
    // Since y < x, prefer keeping y, dropping x.
    height[minx] = min;
    parent.set(minx, iy);
    // Merge into cluster
    ModifiableDBIDs cx = clusters.get(minx);
    ModifiableDBIDs cy = clusters.get(miny);
    if (cy == null) {
      // y was a singleton so far: materialize its member set.
      cy = DBIDUtil.newHashSet();
      cy.add(iy);
    }
    if (cx == null) {
      // x was a singleton: add just the object itself.
      cy.add(ix);
    } else {
      cy.addDBIDs(cx);
      clusters.remove(minx);
    }
    clusters.put(miny, cy);
    // Update distance matrix. Note: miny < minx
    final int xbase = triangleSize(minx), ybase = triangleSize(miny);
    // Write to (y, j), with j < y
    for (int j = 0; j < miny; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      scratch[ybase + j] = Math.min(scratch[xbase + j], scratch[ybase + j]);
    }
    // Write to (j, y), with y < j < x
    for (int j = miny + 1; j < minx; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(j);
      scratch[jbase + miny] = Math.min(scratch[xbase + j], scratch[jbase + miny]);
    }
    // Write to (j, y), with y < x < j
    for (int j = minx + 1; j < size; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(j);
      scratch[jbase + miny] = Math.min(scratch[jbase + minx], scratch[jbase + miny]);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  // Build the clustering result
  final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
  for (int x = 0; x < size; x++) {
    // Objects with a finite height were merged into another cluster and are
    // already contained in that cluster's member set; only the objects that
    // are still active represent a resulting cluster.
    if (height[x] < Double.POSITIVE_INFINITY) {
      continue;
    }
    DBIDs cids = clusters.get(x);
    if (cids == null) {
      // Never merged with anything: the cluster is the single object.
      ix.seek(x);
      cids = DBIDUtil.deref(ix);
    }
    Cluster<Model> cluster = new Cluster<>("Cluster", cids);
    dendrogram.addToplevelCluster(cluster);
  }
  return dendrogram;
}
use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.
the class XMeans method run.
/**
 * Run X-means on a database and relation: start from a k-means result with
 * {@code k_min} clusters, then alternate between trying to split every
 * cluster and re-running k-means on the refined set, until no cluster is
 * split any more or more than {@code k_max} clusters exist.
 *
 * @param database Database to process
 * @param relation Data relation
 * @return Clustering result.
 */
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
  MutableProgress prog = null;
  if (LOG.isVerbose()) {
    prog = new MutableProgress("X-means number of clusters", k_max, LOG);
  }
  // Initial k-means run, yielding the k_min starting clusters.
  innerKMeans.setK(k_min);
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  splitInitializer.setInitialMeans(initializer.chooseInitialMeans(database, relation, k_min, getDistanceFunction()));
  Clustering<M> clustering = innerKMeans.run(database, relation);
  if (prog != null) {
    prog.setProcessed(k_min, LOG);
  }
  ArrayList<Cluster<M>> current = new ArrayList<>(clustering.getAllClusters());
  while (current.size() <= k_max) {
    // Improve-Structure: attempt to split each cluster in turn.
    ArrayList<Cluster<M>> refined = new ArrayList<>();
    for (Cluster<M> candidate : current) {
      List<Cluster<M>> children = splitCluster(candidate, database, relation);
      refined.addAll(children);
      final int gained = children.size() - 1;
      if (gained < 1) {
        // Cluster was kept as is, nothing to account for.
        continue;
      }
      k += gained;
      if (prog == null) {
        continue;
      }
      if (k >= k_max) {
        // Keep the progress total ahead of the current count.
        prog.setTotal(k + 1);
      }
      prog.setProcessed(k, LOG);
    }
    // Nothing was split in this round: converged.
    if (refined.size() == current.size()) {
      break;
    }
    // Improve-Params: rerun k-means seeded with the refined clusters.
    splitInitializer.setInitialClusters(refined);
    innerKMeans.setK(refined.size());
    clustering = innerKMeans.run(database, relation);
    current.clear();
    current.addAll(clustering.getAllClusters());
  }
  // Make sure the progress bar is shown as finished.
  if (prog != null) {
    prog.setTotal(k);
    prog.setProcessed(k, LOG);
  }
  if (LOG.isDebugging()) {
    LOG.debug("X-means returned k=" + k + " clusters.");
  }
  // Wrap the final clusters into the result object.
  return new Clustering<>("X-Means Result", "X-Means", current);
}
use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.
the class LMCLUS method run.
/**
 * Main loop of LMCLUS (linear manifold clustering).
 *
 * <PRE>
 * Random linear manifolds are sampled and the algorithm tries to find clusters in them.
 * For each sample a separation is computed; the points are partitioned into those
 * closer to the manifold than the separation threshold, and everything else.
 * The partition is refined repeatedly, for increasing manifold dimensionality,
 * and the resulting subset is registered as a cluster.
 * This is repeated until no further cluster can be found.
 * The last cluster contains the remaining points (the outliers, or the whole
 * data set if no clusters have been found).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
  Clustering<Model> ret = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
  FiniteProgress progress = null;
  if (LOG.isVerbose()) {
    progress = new FiniteProgress("Clustered objects", relation.size(), LOG);
  }
  IndefiniteProgress cprogress = null;
  if (LOG.isVerbose()) {
    cprogress = new IndefiniteProgress("Clusters found", LOG);
  }
  ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
  Random r = rnd.getSingleThreadedRandom();
  final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
  int cnum = 0;
  while (unclustered.size() > minsize) {
    // Candidate cluster; starts out as everything not yet clustered.
    DBIDs candidate = unclustered;
    // Dimensionality of the last manifold that narrowed the candidate.
    int foundDim = 1;
    for (int dim = 1; dim <= maxdim; dim++) {
      // Keep narrowing the candidate for as long as a good separation exists.
      for (;;) {
        final Separation sep = findSeparation(relation, candidate, dim, r);
        if (sep.goodness <= sensitivityThreshold) {
          break;
        }
        // Collect the points closer to the manifold than the threshold.
        ModifiableDBIDs near = DBIDUtil.newArray(candidate.size());
        for (DBIDIter it = candidate.iter(); it.valid(); it.advance()) {
          if (deviation(minusEquals(relation.get(it).toArray(), sep.originV), sep.basis) < sep.threshold) {
            near.add(it);
          }
        }
        // Too small to be a cluster: keep the previous candidate.
        if (near.size() < minsize) {
          break;
        }
        candidate = near;
        foundDim = dim;
      }
    }
    // No more clusters found
    if (candidate.size() < minsize || candidate == unclustered) {
      break;
    }
    // New cluster found
    // TODO: annotate cluster with dimensionality
    final Cluster<Model> found = new Cluster<>(candidate);
    found.setName("Cluster_" + foundDim + "d_" + cnum);
    cnum++;
    ret.addToplevelCluster(found);
    // Take the clustered objects out of the working set.
    unclustered.removeDBIDs(candidate);
    if (progress != null) {
      progress.setProcessed(relation.size() - unclustered.size(), LOG);
    }
    if (cprogress != null) {
      cprogress.setProcessed(cnum, LOG);
    }
  }
  // Whatever is left over is reported as noise.
  if (unclustered.size() > 0) {
    ret.addToplevelCluster(new Cluster<>(unclustered, true));
  }
  if (progress != null) {
    progress.setProcessed(relation.size(), LOG);
    progress.ensureCompleted(LOG);
  }
  LOG.setCompleted(cprogress);
  return ret;
}
use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.
the class KMeansBisecting method run.
/**
 * Run bisecting k-means: perform {@code k - 1} rounds, each of which runs the
 * inner k-means on the currently largest cluster (or on the full relation if
 * no cluster exists yet) and replaces that cluster by the inner result.
 *
 * @param database Database to process
 * @param relation Data relation
 * @return Clustering containing all clusters present after the last round
 */
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
  ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
  // A linked list is preferable as scratch storage: there will not be many
  // clusters, and the largest cluster (often near the head) gets removed in
  // every round.
  LinkedList<Cluster<M>> pending = new LinkedList<>();
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Bisecting k-means", k - 1, LOG) : null;
  for (int j = 0; j < this.k - 1; j++) {
    // Select what to split, and project the database onto it.
    if (!pending.isEmpty()) {
      // Pick the largest cluster; on ties, the earliest one in the list.
      Cluster<M> biggest = null;
      for (Cluster<M> candidate : pending) {
        if (biggest != null && candidate.size() <= biggest.size()) {
          continue;
        }
        biggest = candidate;
      }
      pending.remove(biggest);
      proxyDB.setDBIDs(biggest.getIDs());
    } else {
      // Nothing to split yet: operate on the complete relation.
      proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
    }
    // Run the inner k-means algorithm:
    // FIXME: ensure we run on the correct relation in a multirelational
    // setting!
    Clustering<M> split = innerkMeans.run(proxyDB);
    // The split parts take the place of the removed cluster.
    pending.addAll(split.getAllClusters());
    LOG.incrementProcessed(prog);
    if (LOG.isVerbose()) {
      LOG.verbose("Iteration " + j);
    }
  }
  LOG.ensureCompleted(prog);
  // Collect all clusters into the result.
  Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means");
  for (Cluster<M> finished : pending) {
    result.addToplevelCluster(finished);
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.
the class DiSH method sortClusters.
/**
 * Returns a sorted list of the clusters w.r.t. the subspace dimensionality in
 * descending order.
 *
 * Every list of object ids in the map becomes one cluster, annotated with a
 * {@link SubspaceModel} built from the map key and the centroid of the
 * cluster members. Clusters sharing the same key are distinguished by an
 * index suffix in their name.
 *
 * @param relation the database storing the objects
 * @param clustersMap the mapping of bits sets to clusters
 * @return a sorted list of the clusters
 */
private List<Cluster<SubspaceModel>> sortClusters(Relation<V> relation, Object2ObjectMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
  final int db_dim = RelationUtil.dimensionality(relation);
  List<Cluster<SubspaceModel>> clusters = new ArrayList<>();
  for (long[] pv : clustersMap.keySet()) {
    List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
    // Only number the clusters when several of them share this subspace.
    final boolean numbered = parallelClusters.size() > 1;
    for (int i = 0; i < parallelClusters.size(); i++) {
      ArrayModifiableDBIDs c = parallelClusters.get(i);
      Cluster<SubspaceModel> cluster = new Cluster<>(c);
      cluster.setModel(new SubspaceModel(new Subspace(pv), Centroid.make(relation, c).getArrayRef()));
      String subspace = BitsUtil.toStringLow(cluster.getModel().getSubspace().getDimensions(), db_dim);
      if (numbered) {
        cluster.setName("Cluster_" + subspace + "_" + i);
      } else {
        cluster.setName("Cluster_" + subspace);
      }
      clusters.add(cluster);
    }
  }
  // Sort the clusters by subspace dimensionality, highest first.
  Comparator<Cluster<SubspaceModel>> comparator = new Comparator<Cluster<SubspaceModel>>() {
    @Override
    public int compare(Cluster<SubspaceModel> c1, Cluster<SubspaceModel> c2) {
      // Arguments swapped for descending order; Integer.compare avoids the
      // overflow pitfall of comparing by subtraction.
      return Integer.compare(c2.getModel().getSubspace().dimensionality(), c1.getModel().getSubspace().dimensionality());
    }
  };
  Collections.sort(clusters, comparator);
  return clusters;
}
Aggregations