Use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class ERiC, method run().
/**
 * Performs the ERiC algorithm on the given database.
 *
 * @param database Database to run on
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<CorrelationModel> run(Database database, Relation<V> relation) {
  final int dimensionality = RelationUtil.dimensionality(relation);
  StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null;
  // Run Generalized DBSCAN
  LOG.beginStep(stepprog, 1, "Preprocessing local correlation dimensionalities and partitioning data");
  // FIXME: how to ensure we are running on the same relation?
  ERiCNeighborPredicate<V>.Instance npred = new ERiCNeighborPredicate<V>(settings).instantiate(database, relation);
  CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database);
  Clustering<Model> copacResult = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run();
  // Extract correlation clusters
  LOG.beginStep(stepprog, 2, "Extract correlation clusters");
  List<List<Cluster<CorrelationModel>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality, npred);
  if (LOG.isDebugging()) {
    StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters...");
    for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
      List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
      msg.append("\n\ncorrDim ").append(corrDim);
      for (Cluster<CorrelationModel> cluster : correlationClusters) {
        msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
        // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
        // msg.append("\n  basis " + cluster.getPCA().getWeakEigenvectors().toString(" ", NF) + " ids " + cluster.getIDs().size());
      }
    }
    LOG.debugFine(msg.toString());
  }
  if (LOG.isVerbose()) {
    int clusters = 0;
    for (List<Cluster<CorrelationModel>> correlationClusters : clusterMap) {
      clusters += correlationClusters.size();
    }
    LOG.verbose(clusters + " clusters extracted.");
  }
  // Build hierarchy
  LOG.beginStep(stepprog, 3, "Building hierarchy");
  Clustering<CorrelationModel> clustering = new Clustering<>("ERiC clustering", "eric-clustering");
  buildHierarchy(clustering, clusterMap, npred);
  if (LOG.isDebugging()) {
    StringBuilder msg = new StringBuilder("Step 3: Build hierarchy");
    for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
      List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
      for (Cluster<CorrelationModel> cluster : correlationClusters) {
        msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
        // .append(", level: ").append(cluster.getLevel()).append(", index: ").append(cluster.getLevelIndex());
        for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) {
          msg.append("\n   parent ").append(iter.get());
        }
        for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) {
          msg.append("\n   child ").append(iter.get());
        }
      }
    }
    LOG.debugFine(msg.toString());
  }
  LOG.setCompleted(stepprog);
  // The partition of the highest correlation dimensionality holds the root clusters:
  for (Cluster<CorrelationModel> rc : clusterMap.get(clusterMap.size() - 1)) {
    clustering.addToplevelCluster(rc);
  }
  return clustering;
}
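The hierarchy built in step 3 can be walked with the same accessors the debug block uses. A minimal sketch (assuming the ELKI imports of the class above; printHierarchy is a hypothetical helper, not part of the ELKI source):

// Hypothetical helper: print the ERiC result returned by run() above.
static void printHierarchy(Clustering<CorrelationModel> clustering) {
  for (Cluster<CorrelationModel> c : clustering.getToplevelClusters()) {
    System.out.println("top-level cluster " + c + ", size " + c.size());
    for (It<Cluster<CorrelationModel>> it = clustering.getClusterHierarchy().iterChildren(c); it.valid(); it.advance()) {
      System.out.println("  child " + it.get());
    }
  }
}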
Use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class CanopyPreClustering, method run().
/**
 * Run the algorithm
 *
 * @param database Database
 * @param relation Relation to process
 * @return Canopy pre-clustering
 */
public Clustering<PrototypeModel<O>> run(Database database, Relation<O> relation) {
  if (!(t1 >= t2)) {
    throw new AbortException("T1 must be at least as large as T2.");
  }
  DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
  ModifiableDBIDs ids = DBIDUtil.newHashSet(relation.getDBIDs());
  ArrayList<Cluster<PrototypeModel<O>>> clusters = new ArrayList<>();
  final int size = relation.size();
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Canopy clustering", size, LOG) : null;
  DBIDVar first = DBIDUtil.newVar();
  while (!ids.isEmpty()) {
    // Remove first element:
    ids.pop(first);
    // Start a new cluster:
    ModifiableDBIDs cids = DBIDUtil.newArray();
    cids.add(first);
    // Compare to remaining objects:
    for (DBIDMIter iter = ids.iter(); iter.valid(); iter.advance()) {
      double dist = dq.distance(first, iter);
      // Inclusion threshold:
      if (dist > t1) {
        continue;
      }
      cids.add(iter);
      // Removal threshold:
      if (dist <= t2) {
        iter.remove();
      }
    }
    // TODO: remember the central object using a CanopyModel?
    // Construct cluster:
    clusters.add(new Cluster<>(cids, new SimplePrototypeModel<>(relation.get(first))));
    if (prog != null) {
      prog.setProcessed(size - ids.size(), LOG);
    }
  }
  LOG.ensureCompleted(prog);
  return new Clustering<>("Canopy clustering", "canopy-clustering", clusters);
}
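The two thresholds drive the loop above: any point within t1 of the seed joins the canopy, and points within t2 are additionally removed, so they can neither seed nor join a later canopy (hence the check t1 >= t2). A self-contained toy sketch of the same loop on plain doubles, not ELKI code:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class CanopyToy {
  public static void main(String[] args) {
    double t1 = 4.0, t2 = 1.5; // inclusion and removal thresholds; t1 >= t2
    List<Double> points = new ArrayList<>(Arrays.asList(0.0, 1.0, 2.0, 5.0, 9.0, 9.5));
    while (!points.isEmpty()) {
      double seed = points.remove(0);
      List<Double> canopy = new ArrayList<>();
      canopy.add(seed);
      for (Iterator<Double> it = points.iterator(); it.hasNext();) {
        double p = it.next();
        double dist = Math.abs(p - seed);
        if (dist > t1) {
          continue; // outside the inclusion threshold t1
        }
        canopy.add(p);
        if (dist <= t2) {
          it.remove(); // within t2: consumed, cannot join another canopy
        }
      }
      System.out.println("canopy around " + seed + ": " + canopy);
    }
  }
}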
Use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class AffinityPropagationClusteringAlgorithm, method run().
/**
 * Perform affinity propagation clustering.
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<MedoidModel> run(Database db, Relation<O> relation) {
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();
  int[] assignment = new int[size];
  double[][] s = initialization.getSimilarityMatrix(db, relation, ids);
  double[][] r = new double[size][size];
  double[][] a = new double[size][size];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Affinity Propagation Iteration", LOG) : null;
  MutableProgress aprog = LOG.isVerbose() ? new MutableProgress("Stable assignments", size + 1, LOG) : null;
  int inactive = 0;
  for (int iteration = 0; iteration < maxiter && inactive < convergence; iteration++) {
    // Update responsibility matrix:
    for (int i = 0; i < size; i++) {
      double[] ai = a[i], ri = r[i], si = s[i];
      // Find the two largest values (as initially maxk == i)
      double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY;
      int maxk = -1;
      for (int k = 0; k < size; k++) {
        double val = ai[k] + si[k];
        if (val > max1) {
          max2 = max1;
          max1 = val;
          maxk = k;
        } else if (val > max2) {
          max2 = val;
        }
      }
      // With the maximum value known, update r:
      for (int k = 0; k < size; k++) {
        double val = si[k] - ((k != maxk) ? max1 : max2);
        ri[k] = ri[k] * lambda + val * (1. - lambda);
      }
    }
    // Update availability matrix
    for (int k = 0; k < size; k++) {
      // Compute sum of max(0, r_ik) for all i.
      // For r_kk, don't apply the max.
      double colposum = 0.;
      for (int i = 0; i < size; i++) {
        if (i == k || r[i][k] > 0.) {
          colposum += r[i][k];
        }
      }
      for (int i = 0; i < size; i++) {
        double val = colposum;
        // Adjust column sum by the one extra term.
        if (i == k || r[i][k] > 0.) {
          val -= r[i][k];
        }
        if (i != k && val > 0.) {
          // min
          val = 0.;
        }
        a[i][k] = a[i][k] * lambda + val * (1. - lambda);
      }
    }
    int changed = 0;
    for (int i = 0; i < size; i++) {
      double[] ai = a[i], ri = r[i];
      double max = Double.NEGATIVE_INFINITY;
      int maxj = -1;
      for (int j = 0; j < size; j++) {
        double v = ai[j] + ri[j];
        if (v > max || (i == j && v >= max)) {
          max = v;
          maxj = j;
        }
      }
      if (assignment[i] != maxj) {
        changed += 1;
        assignment[i] = maxj;
      }
    }
    inactive = (changed > 0) ? 0 : (inactive + 1);
    LOG.incrementProcessed(prog);
    if (aprog != null) {
      aprog.setProcessed(size - changed, LOG);
    }
  }
  if (aprog != null) {
    aprog.setProcessed(aprog.getTotal(), LOG);
  }
  LOG.setCompleted(prog);
  // Cluster map, by lead object
  Int2ObjectOpenHashMap<ModifiableDBIDs> map = new Int2ObjectOpenHashMap<>();
  DBIDArrayIter i1 = ids.iter();
  for (int i = 0; i1.valid(); i1.advance(), i++) {
    int c = assignment[i];
    // Add to cluster members:
    ModifiableDBIDs cids = map.get(c);
    if (cids == null) {
      cids = DBIDUtil.newArray();
      map.put(c, cids);
    }
    cids.add(i1);
  }
  // If we stopped early, the cluster lead might be in a different cluster.
  for (ObjectIterator<Int2ObjectOpenHashMap.Entry<ModifiableDBIDs>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext();) {
    Int2ObjectOpenHashMap.Entry<ModifiableDBIDs> entry = iter.next();
    final int key = entry.getIntKey();
    int targetkey = key;
    ModifiableDBIDs tids = null;
    // Chase arrows:
    while (tids == null && assignment[targetkey] != targetkey) {
      targetkey = assignment[targetkey];
      tids = map.get(targetkey);
    }
    if (tids != null && targetkey != key) {
      tids.addDBIDs(entry.getValue());
      iter.remove();
    }
  }
  Clustering<MedoidModel> clustering = new Clustering<>("Affinity Propagation Clustering", "ap-clustering");
  ModifiableDBIDs noise = DBIDUtil.newArray();
  for (ObjectIterator<Int2ObjectOpenHashMap.Entry<ModifiableDBIDs>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext();) {
    Int2ObjectOpenHashMap.Entry<ModifiableDBIDs> entry = iter.next();
    i1.seek(entry.getIntKey());
    if (entry.getValue().size() > 1) {
      MedoidModel mod = new MedoidModel(DBIDUtil.deref(i1));
      clustering.addToplevelCluster(new Cluster<>(entry.getValue(), mod));
    } else {
      noise.add(i1);
    }
  }
  if (noise.size() > 0) {
    MedoidModel mod = new MedoidModel(DBIDUtil.deref(noise.iter()));
    clustering.addToplevelCluster(new Cluster<>(noise, true, mod));
  }
  return clustering;
}
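The responsibility update above computes r(i,k) = s(i,k) - max over k' != k of (a(i,k') + s(i,k')), damped by lambda. Tracking the largest and second-largest value of a(i,k') + s(i,k') makes that maximum available for every k in a single pass: the second-largest is used exactly when k itself attains the largest. A standalone sketch of this trick for one row (illustrative, not the ELKI class):

public class ResponsibilityRow {
  // One damped responsibility update for row i, given availabilities ai,
  // similarities si, and the previous responsibilities ri.
  static void update(double[] ai, double[] si, double[] ri, double lambda) {
    double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY;
    int maxk = -1;
    for (int k = 0; k < si.length; k++) {
      double v = ai[k] + si[k];
      if (v > max1) {
        max2 = max1;
        max1 = v;
        maxk = k;
      } else if (v > max2) {
        max2 = v;
      }
    }
    for (int k = 0; k < si.length; k++) {
      // For k == maxk, the max over all *other* k' is the second-largest value.
      double val = si[k] - ((k != maxk) ? max1 : max2);
      ri[k] = ri[k] * lambda + val * (1. - lambda); // damped update
    }
  }

  public static void main(String[] args) {
    double[] a = { 0., 0., 0. }, s = { -1., -2., -3. }, r = new double[3];
    update(a, s, r, 0.5);
    // r[0] = 0.5 * (s[0] - max2) = 0.5 * (-1 - (-2)) = 0.5
    System.out.println(java.util.Arrays.toString(r)); // [0.5, -0.5, -1.0]
  }
}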
Use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class NaiveAgglomerativeHierarchicalClustering3, method run().
/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public Result run(Database db, Relation<O> relation) {
  DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();
  if (size > 0x10000) {
    throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
  }
  if (Linkage.SINGLE.equals(linkage)) {
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
  }
  // Compute the initial (lower triangular) distance matrix.
  double[] scratch = new double[triangleSize(size)];
  DBIDArrayIter ix = ids.iter(), iy = ids.iter();
  // Position counter - must agree with triangleSize!
  int pos = 0;
  boolean square = Linkage.WARD.equals(linkage) && !getDistanceFunction().isSquared();
  for (int x = 0; ix.valid(); x++, ix.advance()) {
    iy.seek(0);
    for (int y = 0; y < x; y++, iy.advance()) {
      scratch[pos] = dq.distance(ix, iy);
      // Ward uses variances -- i.e. squared values
      if (square) {
        scratch[pos] *= scratch[pos];
      }
      pos++;
    }
  }
  // Initialize space for result:
  double[] height = new double[size];
  Arrays.fill(height, Double.POSITIVE_INFINITY);
  // Parent node, to track merges
  // have every object point to itself initially
  ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
  // Active clusters, when not trivial.
  Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
  // Repeat until everything merged, except the desired number of clusters:
  final int stop = size - numclusters;
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
  for (int i = 0; i < stop; i++) {
    double min = Double.POSITIVE_INFINITY;
    int minx = -1, miny = -1;
    for (int x = 0; x < size; x++) {
      if (height[x] < Double.POSITIVE_INFINITY) {
        // x was already merged away; skip it.
        continue;
      }
      final int xbase = triangleSize(x);
      for (int y = 0; y < x; y++) {
        if (height[y] < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int idx = xbase + y;
        if (scratch[idx] < min) {
          min = scratch[idx];
          minx = x;
          miny = y;
        }
      }
    }
    assert (minx >= 0 && miny >= 0);
    // Avoid allocating memory, by reusing existing iterators:
    ix.seek(minx);
    iy.seek(miny);
    // Perform merge in data structure: x -> y
    // Since y < x, prefer keeping y, dropping x.
    height[minx] = min;
    parent.set(minx, iy);
    // Merge into cluster
    ModifiableDBIDs cx = clusters.get(minx);
    ModifiableDBIDs cy = clusters.get(miny);
    // Cluster sizes, for averaging:
    int sizex = 1, sizey = 1;
    if (cy == null) {
      cy = DBIDUtil.newHashSet();
      cy.add(iy);
    } else {
      sizey = cy.size();
    }
    if (cx == null) {
      cy.add(ix);
    } else {
      sizex = cx.size();
      cy.addDBIDs(cx);
      clusters.remove(minx);
    }
    clusters.put(miny, cy);
    // Update distance matrix. Note: miny < minx
    // Implementation note: most will not need sizej, and could save the
    // hashmap lookup.
    final int xbase = triangleSize(minx), ybase = triangleSize(miny);
    // Write to (y, j), with j < y
    for (int j = 0; j < miny; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final DBIDs idsj = clusters.get(j);
      final int sizej = (idsj == null) ? 1 : idsj.size();
      scratch[ybase + j] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[ybase + j], sizej, min);
    }
    // Write to (j, y), with y < j < x
    for (int j = miny + 1; j < minx; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(j);
      final DBIDs idsj = clusters.get(j);
      final int sizej = (idsj == null) ? 1 : idsj.size();
      scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[jbase + miny], sizej, min);
    }
    // Write to (j, y), with y < x < j
    for (int j = minx + 1; j < size; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final DBIDs idsj = clusters.get(j);
      final int sizej = (idsj == null) ? 1 : idsj.size();
      final int jbase = triangleSize(j);
      scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  // Build the clustering result: emit the remaining active clusters, i.e.
  // those never merged away (their height is still infinite).
  final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
  for (int x = 0; x < size; x++) {
    if (height[x] < Double.POSITIVE_INFINITY) {
      continue;
    }
    DBIDs cids = clusters.get(x);
    if (cids == null) {
      ix.seek(x);
      cids = DBIDUtil.deref(ix);
    }
    Cluster<Model> cluster = new Cluster<>("Cluster", cids);
    dendrogram.addToplevelCluster(cluster);
  }
  return dendrogram;
}
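The scratch array above is a packed lower-triangular matrix: the entry for pair (x, y) with y < x lives at scratch[triangleSize(x) + y]. The helper is defined elsewhere in the same tutorial class; a sketch of it, with a small index check:

public class TriangleIndex {
  // Row offset of x in the packed lower-triangular matrix; entry (x, y)
  // with y < x is stored at scratch[triangleSize(x) + y].
  static int triangleSize(int x) {
    return (x * (x - 1)) >>> 1;
  }

  public static void main(String[] args) {
    // For size = 4, the matrix stores (1,0), (2,0), (2,1), (3,0), (3,1), (3,2):
    System.out.println(triangleSize(4)); // 6 entries in total
    System.out.println(triangleSize(3) + 1); // index 4 holds the pair (3, 1)
  }
}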
Use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class UKMeans, method run().
/**
 * Run the clustering.
 *
 * @param database the Database
 * @param relation the Relation
 * @return Clustering result
 */
public Clustering<?> run(final Database database, final Relation<DiscreteUncertainObject> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("Uk-Means Clustering", "ukmeans-clustering");
  }
  // Choose initial means randomly
  DBIDs sampleids = DBIDUtil.randomSample(relation.getDBIDs(), k, rnd);
  List<double[]> means = new ArrayList<>(k);
  for (DBIDIter iter = sampleids.iter(); iter.valid(); iter.advance()) {
    means.add(ArrayLikeUtil.toPrimitiveDoubleArray(relation.get(iter).getCenterOfMass()));
  }
  // Setup cluster assignment store
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("UK-Means iteration", LOG) : null;
  DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
  int iteration = 0;
  for (; maxiter <= 0 || iteration < maxiter; iteration++) {
    LOG.incrementProcessed(prog);
    boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    // Stop if no cluster assignment changed.
    if (!changed) {
      break;
    }
    // Recompute means.
    means = means(clusters, means, relation);
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("Uk-Means Clustering", "ukmeans-clustering");
  for (int i = 0; i < clusters.size(); i++) {
    DBIDs ids = clusters.get(i);
    if (ids.isEmpty()) {
      continue;
    }
    result.addToplevelCluster(new Cluster<>(ids, new KMeansModel(means.get(i), varsum[i])));
  }
  return result;
}
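Seeding uses getCenterOfMass() of each sampled uncertain object. For a discrete uncertain object this is the probability-weighted mean of its possible instances; a self-contained toy sketch with uniform weights, not ELKI code:

public class CenterOfMassToy {
  public static void main(String[] args) {
    // Three equally likely possible positions of one uncertain 2d object:
    double[][] samples = { { 1, 2 }, { 2, 2 }, { 3, 5 } };
    double[] com = new double[samples[0].length];
    for (double[] s : samples) {
      for (int d = 0; d < com.length; d++) {
        com[d] += s[d] / samples.length; // uniform weight 1/n per instance
      }
    }
    System.out.println(java.util.Arrays.toString(com)); // [2.0, 3.0]
  }
}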