Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.
The class LMCLUS, method run.
/**
 * The main LMCLUS (linear manifold clustering) algorithm runs in this
 * method.
 *
 * <PRE>
 * The algorithm samples random linear manifolds and tries to find clusters in them.
 * It calculates a distance histogram, searches for a threshold, and partitions the
 * points into two groups: the ones in the cluster and everything else.
 * Then the best-fitting linear manifold is searched and registered as a cluster.
 * The process starts over until all points are clustered.
 * The last cluster should contain all the outliers (or the whole data set if no clusters have been found).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
  Clustering<Model> ret = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
  IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
  ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
  Random r = rnd.getSingleThreadedRandom();
  final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
  int cnum = 0;
  while (unclustered.size() > minsize) {
    DBIDs current = unclustered;
    int lmDim = 1;
    for (int k = 1; k <= maxdim; k++) {
      // Refine the partition at this dimensionality as long as a good
      // separation is found; note that this loop does not necessarily
      // stop at the appropriate dimensionality.
      while (true) {
        Separation separation = findSeparation(relation, current, k, r);
        // LOG.verbose("goodness: " + separation.goodness + " threshold: " + separation.threshold);
        if (separation.goodness <= sensitivityThreshold) {
          break;
        }
        ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
        for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
          if (deviation(minusEquals(relation.get(iter).toArray(), separation.originV), separation.basis) < separation.threshold) {
            subset.add(iter);
          }
        }
        // LOG.verbose("size: " + subset.size());
        if (subset.size() < minsize) {
          break;
        }
        current = subset;
        lmDim = k;
        // System.out.println("Partition: " + subset.size());
      }
    }
    // No more clusters found
    if (current.size() < minsize || current == unclustered) {
      break;
    }
    // New cluster found
    // TODO: annotate cluster with dimensionality
    final Cluster<Model> cluster = new Cluster<>(current);
    cluster.setName("Cluster_" + lmDim + "d_" + cnum);
    cnum++;
    ret.addToplevelCluster(cluster);
    // Remove from main working set.
    unclustered.removeDBIDs(current);
    if (progress != null) {
      progress.setProcessed(relation.size() - unclustered.size(), LOG);
    }
    if (cprogress != null) {
      cprogress.setProcessed(cnum, LOG);
    }
  }
  // Remaining objects are noise
  if (unclustered.size() > 0) {
    ret.addToplevelCluster(new Cluster<>(unclustered, true));
  }
  if (progress != null) {
    progress.setProcessed(relation.size(), LOG);
    progress.ensureCompleted(LOG);
  }
  LOG.setCompleted(cprogress);
  return ret;
}
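All of the examples on this page share the same progress-logging idiom: the progress object is only allocated when verbose logging is enabled, and the Logging convenience methods accept a null progress, so the hot loop needs no null guard (as the unguarded LOG.incrementProcessed and LOG.setCompleted calls above show). A minimal sketch of that lifecycle, using the same ELKI API as the snippets; the class and method names here are hypothetical:

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress;
import de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress;

public class ProgressDemo {
  private static final Logging LOG = Logging.getLogger(ProgressDemo.class);

  void run(int totalItems) {
    // Allocate progress trackers only when verbose logging is enabled.
    FiniteProgress fprog = LOG.isVerbose() ? new FiniteProgress("Items processed", totalItems, LOG) : null;
    IndefiniteProgress iprog = LOG.isVerbose() ? new IndefiniteProgress("Results found", LOG) : null;
    int found = 0;
    for (int i = 0; i < totalItems; i++) {
      // ... one unit of work, occasionally producing a result ...
      found++;
      LOG.incrementProcessed(fprog); // tolerates a null progress
      if (iprog != null) {
        iprog.setProcessed(found, LOG);
      }
    }
    LOG.ensureCompleted(fprog); // marks the finite progress as complete
    LOG.setCompleted(iprog); // closes the open-ended counter
  }
}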
Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.
The class ChengAndChurch, method biclustering.
@Override
public Clustering<BiclusterWithInversionsModel> biclustering() {
  double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);
  BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());
  Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
  ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
  for (int i = 0; i < n; i++) {
    cand.reset();
    multipleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    singleNodeDeletion(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    nodeAddition(mat, cand);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
    }
    cand.maskMatrix(mat, dist);
    BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
    final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
    noise.removeDBIDs(cids);
    result.addToplevelCluster(new Cluster<>(cids, model));
    if (LOG.isVerbose()) {
      LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
      LOG.verbose("Number of rows: " + cand.rowcard + "\n");
      LOG.verbose("Number of columns: " + cand.colcard + "\n");
      // LOG.verbose("Total number of masked values: " + maskedVals.size() + "\n");
    }
    LOG.incrementProcessed(prog);
  }
  // Add a noise cluster, full-dimensional.
  if (!noise.isEmpty()) {
    long[] allcols = BitsUtil.ones(getColDim());
    BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
    result.addToplevelCluster(new Cluster<>(noise, true, model));
  }
  LOG.ensureCompleted(prog);
  return result;
}
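The node-deletion and node-addition phases above greedily minimize the mean squared residue H(I, J) of the candidate bicluster (Cheng and Church, 2000). For reference, here is a small self-contained sketch of that score for a row/column selection of a plain double matrix; the class and method names are illustrative, not ELKI API:

/** Illustrative sketch (not ELKI API): mean squared residue H(I,J) of a
 *  submatrix, the score minimized by the deletion/addition phases above. */
public class MeanSquaredResidue {
  public static double score(double[][] a, int[] rows, int[] cols) {
    double[] rowMean = new double[rows.length];
    double[] colMean = new double[cols.length];
    double allMean = 0.;
    for (int i = 0; i < rows.length; i++) {
      for (int j = 0; j < cols.length; j++) {
        final double v = a[rows[i]][cols[j]];
        rowMean[i] += v;
        colMean[j] += v;
        allMean += v;
      }
    }
    for (int i = 0; i < rows.length; i++) {
      rowMean[i] /= cols.length; // row mean over the selected columns
    }
    for (int j = 0; j < cols.length; j++) {
      colMean[j] /= rows.length; // column mean over the selected rows
    }
    allMean /= rows.length * (double) cols.length;
    double h = 0.;
    for (int i = 0; i < rows.length; i++) {
      for (int j = 0; j < cols.length; j++) {
        final double r = a[rows[i]][cols[j]] - rowMean[i] - colMean[j] + allMean;
        h += r * r;
      }
    }
    return h / (rows.length * (double) cols.length);
  }
}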
Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.
The class KMeansBisecting, method run.
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
  ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
  // A linked list is preferable for scratch, as we will (a) not need that many
  // clusters and (b) be doing repeated removals of the largest cluster (often
  // found at the head).
  LinkedList<Cluster<M>> currentClusterList = new LinkedList<>();
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Bisecting k-means", k - 1, LOG) : null;
  for (int j = 0; j < this.k - 1; j++) {
    // Choose a cluster to split and project the database to that cluster
    if (currentClusterList.isEmpty()) {
      proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
    } else {
      Cluster<M> largestCluster = null;
      for (Cluster<M> cluster : currentClusterList) {
        if (largestCluster == null || cluster.size() > largestCluster.size()) {
          largestCluster = cluster;
        }
      }
      currentClusterList.remove(largestCluster);
      proxyDB.setDBIDs(largestCluster.getIDs());
    }
    // Run the inner k-means algorithm:
    // FIXME: ensure we run on the correct relation in a multirelational setting!
    Clustering<M> innerResult = innerkMeans.run(proxyDB);
    // Add the resulting clusters to the current result.
    currentClusterList.addAll(innerResult.getAllClusters());
    LOG.incrementProcessed(prog);
    if (LOG.isVerbose()) {
      LOG.verbose("Iteration " + j);
    }
  }
  LOG.ensureCompleted(prog);
  // Add all current clusters to the result
  Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means");
  for (Cluster<M> cluster : currentClusterList) {
    result.addToplevelCluster(cluster);
  }
  return result;
}
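The selection step above is a linear scan for the largest cluster followed by its removal, which is cheap on a LinkedList when the largest cluster sits near the head, as the code comment anticipates. A generic stand-alone version of just that step (hypothetical helper, not ELKI API):

import java.util.LinkedList;
import java.util.List;

public final class LargestFirst {
  /** Find the largest list in the scratch collection, remove it, and return
   *  it, mirroring the cluster-selection step in the method above.
   *  Returns null if the collection is empty. */
  public static <T> List<T> removeLargest(LinkedList<List<T>> clusters) {
    List<T> largest = null;
    for (List<T> c : clusters) {
      if (largest == null || c.size() > largest.size()) {
        largest = c;
      }
    }
    clusters.remove(largest);
    return largest;
  }
}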
Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.
The class PartitionApproximationMaterializeKNNPreprocessor, method preprocess.
@Override
protected void preprocess() {
  DistanceQuery<O> distanceQuery = relation.getDistanceQuery(distanceFunction);
  storage = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, KNNList.class);
  MeanVariance ksize = new MeanVariance();
  if (LOG.isVerbose()) {
    LOG.verbose("Approximating nearest neighbor lists to database objects");
  }
  // Produce a random partitioning of the IDs:
  ArrayDBIDs[] parts = DBIDUtil.randomSplit(relation.getDBIDs(), partitions, rnd);
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Processing partitions", partitions, LOG) : null;
  for (int part = 0; part < partitions; part++) {
    final ArrayDBIDs ids = parts[part];
    final int size = ids.size();
    Object2DoubleOpenHashMap<DBIDPair> cache = new Object2DoubleOpenHashMap<>((size * size * 3) >> 3);
    cache.defaultReturnValue(Double.NaN);
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      KNNHeap kNN = DBIDUtil.newHeap(k);
      for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
        DBIDPair key = DBIDUtil.newPair(iter, iter2);
        double d = cache.removeDouble(key);
        if (d == d) {
          // d is not NaN: consume the previously computed distance.
          kNN.insert(d, iter2);
        } else {
          // Cache miss: compute the distance and store it for the reverse pair.
          d = distanceQuery.distance(iter, iter2);
          kNN.insert(d, iter2);
          // Put it into the cache, but with the keys reversed
          key = DBIDUtil.newPair(iter2, iter);
          cache.put(key, d);
        }
      }
      ksize.put(kNN.size());
      storage.put(iter, kNN.toKNNList());
    }
    if (LOG.isDebugging() && cache.size() > 0) {
      LOG.warning("Cache should be empty after each run, but still has " + cache.size() + " elements.");
    }
    LOG.incrementProcessed(progress);
  }
  LOG.ensureCompleted(progress);
  if (LOG.isVerbose()) {
    LOG.verbose("On average, " + ksize.getMean() + " +- " + ksize.getSampleStddev() + " neighbors returned.");
  }
}
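The cache in this preprocessor exploits distance symmetry: each distance is computed once, stored under the reversed key, and removed the moment the mirrored pair consumes it, so the map stays small and ends empty. A stripped-down sketch of the same trick on a plain array; the class name, packed long keys, and toy distance are illustrative:

import java.util.HashMap;

public class SymmetricCacheDemo {
  /** All pairwise distances, each computed once: a hit on the reversed key
   *  consumes the cached value; a miss computes and caches for the mirror pair. */
  public static double[][] pairwise(double[] data) {
    final int n = data.length;
    double[][] d = new double[n][n];
    HashMap<Long, Double> cache = new HashMap<>();
    for (int i = 0; i < n; i++) {
      for (int j = 0; j < n; j++) {
        if (i == j) {
          continue; // the diagonal is trivially zero
        }
        Double cached = cache.remove(((long) i << 32) | j);
        if (cached != null) {
          d[i][j] = cached; // consume the previously computed distance
        } else {
          d[i][j] = Math.abs(data[i] - data[j]); // stand-in distance function
          cache.put(((long) j << 32) | i, d[i][j]); // store under the reversed key
        }
      }
    }
    // As in the snippet above, the cache is empty again at this point.
    return d;
  }
}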
Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.
The class RandomSampleKNNPreprocessor, method preprocess.
@Override
protected void preprocess() {
  DistanceQuery<O> distanceQuery = relation.getDistanceQuery(distanceFunction);
  storage = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, KNNList.class);
  FiniteProgress progress = getLogger().isVerbose() ? new FiniteProgress("Materializing random-sample k nearest neighbors (k=" + k + ")", relation.size(), getLogger()) : null;
  final ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int samplesize = (int) (ids.size() * share);
  Random random = rnd.getSingleThreadedRandom();
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    KNNHeap kNN = DBIDUtil.newHeap(k);
    DBIDs rsamp = DBIDUtil.randomSample(ids, samplesize, random);
    for (DBIDIter iter2 = rsamp.iter(); iter2.valid(); iter2.advance()) {
      double dist = distanceQuery.distance(iter, iter2);
      kNN.insert(dist, iter2);
    }
    storage.put(iter, kNN.toKNNList());
    getLogger().incrementProcessed(progress);
  }
  getLogger().ensureCompleted(progress);
}
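This preprocessor trades accuracy for speed: each object's neighbors are searched only within a random sample of share * n objects rather than the full relation. The same idea in plain Java, with a bounded max-heap standing in for ELKI's KNNHeap and a toy one-dimensional distance; all names here are illustrative, and unlike ELKI's randomSample (which draws without replacement) this sketch samples indices with replacement for brevity:

import java.util.PriorityQueue;
import java.util.Random;

public class SampledKNNDemo {
  /** Approximate k-nearest-neighbor distances of a query, scanning only a
   *  random share of the data instead of all points. */
  public static double[] approxKNNDistances(double[] data, double query, int k, double share, Random rnd) {
    final int samplesize = (int) (data.length * share);
    // Max-heap on distance: the root is the worst of the current k candidates.
    PriorityQueue<Double> heap = new PriorityQueue<>((a, b) -> Double.compare(b, a));
    for (int s = 0; s < samplesize; s++) {
      double d = Math.abs(query - data[rnd.nextInt(data.length)]);
      if (heap.size() < k) {
        heap.offer(d);
      } else if (d < heap.peek()) {
        heap.poll(); // evict the current worst candidate
        heap.offer(d);
      }
    }
    return heap.stream().mapToDouble(Double::doubleValue).sorted().toArray();
  }
}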