use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class LMCLUS method run.
/**
 * Runs the LMCLUS (linear manifold clustering) procedure on a relation.
 *
 * <PRE>
 * While more than minsize objects are still unclustered, the manifold
 * dimensionalities 1..maxdim are tried in turn. For each dimensionality the
 * method repeatedly asks findSeparation for a candidate manifold and narrows
 * the working set down to the objects whose deviation from that manifold is
 * below the separation threshold. Narrowing at one dimensionality ends as soon
 * as the separation goodness is at most the sensitivity threshold, or as soon
 * as fewer than minsize objects would remain.
 * The surviving working set is registered as a cluster and removed from the
 * unclustered objects; if the working set was never narrowed, the search ends.
 * Objects left over at the end are reported as one noise cluster.
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on (not accessed in this method body)
 * @param relation Relation holding the vectors to cluster
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
  Clustering<Model> result = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
  // Progress objects exist only when verbose logging is enabled.
  FiniteProgress objectProgress = null;
  if (LOG.isVerbose()) {
    objectProgress = new FiniteProgress("Clustered objects", relation.size(), LOG);
  }
  IndefiniteProgress clusterProgress = null;
  if (LOG.isVerbose()) {
    clusterProgress = new IndefiniteProgress("Clusters found", LOG);
  }
  ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
  Random random = rnd.getSingleThreadedRandom();
  // Never search manifolds of higher dimensionality than the data itself.
  final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
  int found = 0;
  while (unclustered.size() > minsize) {
    // Working set; stays identical to 'unclustered' until a first narrowing succeeds.
    DBIDs candidate = unclustered;
    int bestDim = 1;
    for (int dim = 1; dim <= maxdim; dim++) {
      // Keep narrowing at this dimensionality until one of the two exits below triggers.
      for (;;) {
        final Separation sep = findSeparation(relation, candidate, dim, random);
        if (sep.goodness <= sensitivityThreshold) {
          // Separation is not good enough: continue with the next dimensionality.
          break;
        }
        ModifiableDBIDs inliers = DBIDUtil.newArray(candidate.size());
        for (DBIDIter it = candidate.iter(); it.valid(); it.advance()) {
          // Deviation of the origin-shifted vector with respect to the manifold basis.
          final double dev = deviation(minusEquals(relation.get(it).toArray(), sep.originV), sep.basis);
          if (dev < sep.threshold) {
            inliers.add(it);
          }
        }
        if (inliers.size() < minsize) {
          // Too few objects would remain: discard this narrowing step.
          break;
        }
        candidate = inliers;
        bestDim = dim;
      }
    }
    // Stop when nothing was narrowed down (reference comparison is intentional).
    if (candidate.size() < minsize || candidate == unclustered) {
      break;
    }
    // Register the new cluster.
    // TODO: annotate cluster with dimensionality
    final Cluster<Model> cluster = new Cluster<>(candidate);
    cluster.setName("Cluster_" + bestDim + "d_" + found);
    found++;
    result.addToplevelCluster(cluster);
    // The clustered objects leave the main working set.
    unclustered.removeDBIDs(candidate);
    if (objectProgress != null) {
      objectProgress.setProcessed(relation.size() - unclustered.size(), LOG);
    }
    if (clusterProgress != null) {
      clusterProgress.setProcessed(found, LOG);
    }
  }
  // Everything that is still unclustered becomes a noise cluster.
  if (unclustered.size() > 0) {
    result.addToplevelCluster(new Cluster<>(unclustered, true));
  }
  if (objectProgress != null) {
    objectProgress.setProcessed(relation.size(), LOG);
    objectProgress.ensureCompleted(LOG);
  }
  LOG.setCompleted(clusterProgress);
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class EM method run.
/**
 * Runs EM clustering on the given relation.
 *
 * Starting from the models produced by the model factory, the method
 * alternates between recomputing the models and reassigning the per-object
 * probability arrays. It stops when the log-likelihood changes by at most
 * delta, when the last improvement is older than half the iteration count,
 * or when maxiter is reached (a negative maxiter disables that limit).
 *
 * The returned clustering is a hard assignment: every object goes to the
 * cluster selected by argmax over its probability array. When soft output is
 * enabled the probability arrays are attached as a child result, otherwise
 * the storage is destroyed.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 * @throws IllegalArgumentException if the relation contains no objects
 */
public Clustering<M> run(Database database, Relation<V> relation) {
  if (relation.size() == 0) {
    throw new IllegalArgumentException("database empty: must contain elements");
  }
  // Starting models and the per-object probability storage.
  List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
  WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  // The statistic object is only allocated when statistics logging is on.
  DoubleStatistic likestat = null;
  if (LOG.isStatistics()) {
    likestat = new DoubleStatistic(this.getClass().getName() + ".loglikelihood");
  }
  if (LOG.isStatistics()) {
    LOG.statistics(likestat.setDouble(loglikelihood));
  }
  // The initial assignment above counts as the first iteration.
  int it = 1;
  int lastimprovement = 0;
  // Best value seen so far, used to detect instabilities.
  double bestloglikelihood = loglikelihood;
  while (maxiter < 0 || it < maxiter) {
    final double previous = loglikelihood;
    recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
    // Reassign probabilities with the updated models.
    loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    if (LOG.isStatistics()) {
      LOG.statistics(likestat.setDouble(loglikelihood));
    }
    if (loglikelihood - bestloglikelihood > delta) {
      bestloglikelihood = loglikelihood;
      lastimprovement = it;
    }
    // Converged: the change since the previous iteration is within delta.
    final boolean converged = Math.abs(loglikelihood - previous) <= delta;
    // Stalled: no improvement during the more recent half of all iterations.
    final boolean stalled = lastimprovement < (it >> 1);
    if (converged || stalled) {
      break;
    }
    it++;
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", it));
  }
  // One initially empty id collection per cluster.
  List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
  for (int c = 0; c < k; c++) {
    hardClusters.add(DBIDUtil.newArray());
  }
  // Hard assignment: each object goes to its argmax cluster.
  for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
    final int winner = argmax(probClusterIGivenX.get(id));
    hardClusters.get(winner).add(id);
  }
  Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
  // Pair every id collection with its finalized model.
  for (int c = 0; c < k; c++) {
    result.addToplevelCluster(new Cluster<>(hardClusters.get(c), models.get(c).finalizeCluster()));
  }
  if (!isSoft()) {
    // Soft scores were not requested: release the storage.
    probClusterIGivenX.destroy();
    return result;
  }
  result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class KMedoidsEM method run.
/**
 * Runs the k-medoids variant with alternating medoid update and reassignment.
 *
 * After choosing initial medoids and assigning every object to its nearest
 * medoid, each round looks, per cluster, for a member whose summed distance
 * to all members of that cluster is smaller than the value currently stored
 * for the medoid. Such a member replaces the medoid. If any medoid changed,
 * all objects are reassigned and another round follows; otherwise the
 * procedure ends.
 *
 * @param database Database
 * @param relation relation to use
 * @return result; an empty clustering when the relation has no objects
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
  }
  // A precomputed distance matrix is only enforced for PAM initialization, which is slow.
  final DistanceQuery<V> distQ;
  if (!(initializer instanceof PAMInitialMeans)) {
    distQ = database.getDistanceQuery(relation, getDistanceFunction());
  } else {
    distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
  }
  // Pick the initial medoids.
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
  DBIDArrayMIter miter = medoids.iter();
  // One cost value per medoid, filled in by assignToNearestCluster.
  double[] mdists = new double[k];
  // Cluster assignment store: every cluster starts out containing just its medoid.
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int c = 0; c < k; c++) {
    HashSetModifiableDBIDs members = DBIDUtil.newHashSet(relation.size() / k);
    members.add(miter.seek(c));
    clusters.add(members);
  }
  // Initial assignment to the nearest medoids.
  // TODO: reuse this information, from the build phase, when possible?
  double tc = assignToNearestCluster(miter, mdists, clusters, distQ);
  if (LOG.isStatistics()) {
    LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
  }
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
  // Alternate between medoid updates and reassignment until no medoid changes.
  int iteration = 0;
  DBIDVar best = DBIDUtil.newVar();
  for (;;) {
    boolean swapped = false;
    // Per cluster, try to replace the medoid by a better cluster member.
    miter.seek(0);
    for (int c = 0; miter.valid(); miter.advance(), c++) {
      final ModifiableDBIDs members = clusters.get(c);
      best.unset();
      double bestCost = mdists[c];
      for (DBIDIter cand = members.iter(); cand.valid(); cand.advance()) {
        // The current medoid itself is not a replacement candidate.
        if (!DBIDUtil.equal(miter, cand)) {
          // Cost of the candidate: summed distance to every member of this cluster.
          double cost = 0;
          for (DBIDIter other = members.iter(); other.valid(); other.advance()) {
            cost += distQ.distance(cand, other);
          }
          if (cost < bestCost) {
            best.set(cand);
            bestCost = cost;
          }
        }
      }
      if (best.isSet() && !DBIDUtil.equal(miter, best)) {
        swapped = true;
        assert (clusters.get(c).contains(best));
        medoids.set(c, best);
        mdists[c] = bestCost;
      }
    }
    if (!swapped) {
      // No medoid moved, so the assignment is final.
      break;
    }
    // Reassign all objects to the updated medoids.
    double nc = assignToNearestCluster(miter, mdists, clusters, distQ);
    ++iteration;
    if (LOG.isStatistics()) {
      LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
    }
    LOG.incrementProcessed(prog);
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap the result: one cluster per medoid, matched by array offset.
  Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
  for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
    final ModifiableDBIDs members = clusters.get(it.getOffset());
    final MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
    result.addToplevelCluster(new Cluster<>(members, model));
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class ChengAndChurch method biclustering.
/**
 * Extracts n biclusters from the relation, one per round.
 *
 * Each round resets the candidate, applies multiple node deletion, single
 * node deletion and node addition in that order, masks the matrix for the
 * candidate and reports the candidate's rows as a cluster. Rows that never
 * end up in a reported bicluster are returned as a noise cluster spanning
 * all columns.
 *
 * @return Clustering of the extracted biclusters, plus a noise cluster if needed
 */
@Override
public Clustering<BiclusterWithInversionsModel> biclustering() {
  double[][] matrix = RelationUtil.relationAsMatrix(relation, rowIDs);
  BiclusterCandidate candidate = new BiclusterCandidate(getRowDim(), getColDim());
  Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
  // Starts with every object; rows of reported biclusters are removed below.
  ModifiableDBIDs unassigned = DBIDUtil.newHashSet(relation.getDBIDs());
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
  for (int round = 0; round < n; round++) {
    candidate.reset();
    // Step 1 of the round: multiple node deletion.
    multipleNodeDeletion(matrix, candidate);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 2: " + candidate.residue + " " + candidate.rowcard + "x" + candidate.colcard);
    }
    // Step 2 of the round: single node deletion.
    singleNodeDeletion(matrix, candidate);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 1: " + candidate.residue + " " + candidate.rowcard + "x" + candidate.colcard);
    }
    // Step 3 of the round: node addition.
    nodeAddition(matrix, candidate);
    if (LOG.isVeryVerbose()) {
      LOG.veryverbose("Residue after Alg 3: " + candidate.residue + " " + candidate.rowcard + "x" + candidate.colcard);
    }
    candidate.maskMatrix(matrix, dist);
    // The model is built from the candidate's column set and its 'irow' row set.
    BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(candidate.cols), rowsBitsetToIDs(candidate.irow));
    final ArrayDBIDs members = rowsBitsetToIDs(candidate.rows);
    unassigned.removeDBIDs(members);
    result.addToplevelCluster(new Cluster<>(members, model));
    if (LOG.isVerbose()) {
      // Biclusters are numbered from one in the log output.
      final int number = round + 1;
      LOG.verbose("Score of bicluster " + number + ": " + candidate.residue + "\n");
      LOG.verbose("Number of rows: " + candidate.rowcard + "\n");
      LOG.verbose("Number of columns: " + candidate.colcard + "\n");
    }
    LOG.incrementProcessed(progress);
  }
  // Remaining objects form a full-dimensional noise cluster.
  if (!unassigned.isEmpty()) {
    long[] everyColumn = BitsUtil.ones(getColDim());
    BiclusterWithInversionsModel noiseModel = new BiclusterWithInversionsModel(colsBitsetToIDs(everyColumn), DBIDUtil.EMPTYDBIDS);
    result.addToplevelCluster(new Cluster<>(unassigned, true, noiseModel));
  }
  LOG.ensureCompleted(progress);
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class KMeansBisecting method run.
/**
 * Runs bisecting k-means: k - 1 rounds of the inner k-means algorithm.
 *
 * The first round runs on all objects of the relation. Every later round
 * takes the largest cluster found so far (the first one encountered on ties),
 * removes it from the scratch list and replaces it with the clusters the
 * inner algorithm produces on its members.
 *
 * @param database Database
 * @param relation Relation whose object ids are clustered
 * @return Clustering containing all clusters left in the scratch list
 */
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
  ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
  // A linked list serves as scratch space: it holds few clusters, and the
  // largest cluster (often at the head) is removed in every round.
  LinkedList<Cluster<M>> pending = new LinkedList<>();
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Bisecting k-means", k - 1, LOG) : null;
  for (int round = 0; round < this.k - 1; round++) {
    // Select what to split and project the database onto it.
    if (!pending.isEmpty()) {
      Cluster<M> biggest = null;
      for (Cluster<M> candidate : pending) {
        // Strict comparison: an equally large later cluster does not replace the pick.
        if (biggest != null && candidate.size() <= biggest.size()) {
          continue;
        }
        biggest = candidate;
      }
      pending.remove(biggest);
      proxyDB.setDBIDs(biggest.getIDs());
    } else {
      // Nothing found yet: operate on the complete relation.
      proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
    }
    // Run the inner k-means algorithm.
    // FIXME: ensure we run on the correct relation in a multirelational
    // setting!
    Clustering<M> split = innerkMeans.run(proxyDB);
    // The clusters of the split join the scratch list.
    pending.addAll(split.getAllClusters());
    LOG.incrementProcessed(prog);
    if (LOG.isVerbose()) {
      LOG.verbose("Iteration " + round);
    }
  }
  LOG.ensureCompleted(prog);
  // Every cluster still in the scratch list becomes a top-level cluster.
  Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means");
  for (Cluster<M> remaining : pending) {
    result.addToplevelCluster(remaining);
  }
  return result;
}
Aggregations