use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class OPTICSCut method makeOPTICSCut.
/**
 * Compute an OPTICS cut clustering
 *
 * @param co Cluster order result
 * @param epsilon Epsilon value for cut
 * @return New partitioning clustering
 */
public static <E extends ClusterOrder> Clustering<Model> makeOPTICSCut(E co, double epsilon) {
  // Clustering model we are building
  Clustering<Model> clustering = new Clustering<>("OPTICS Cut Clustering", "optics-cut");
  // Collects noise elements
  ModifiableDBIDs noise = DBIDUtil.newHashSet();
  double lastDist = Double.MAX_VALUE;
  double actDist = Double.MAX_VALUE;
  // Current working set
  ModifiableDBIDs current = DBIDUtil.newHashSet();
  // TODO: can we implement this more nicely with a 1-lookahead?
  DBIDVar prev = DBIDUtil.newVar();
  for (DBIDIter it = co.iter(); it.valid(); prev.set(it), it.advance()) {
    lastDist = actDist;
    actDist = co.getReachability(it);
    if (actDist <= epsilon) {
      // The last element before the plot drops belongs to the cluster
      if (lastDist > epsilon && prev.isSet()) {
        // So un-noise it
        noise.remove(prev);
        // Add it to the cluster
        current.add(prev);
      }
      current.add(it);
    } else {
      // 'Finish' the previous cluster
      if (!current.isEmpty()) {
        // TODO: do we want a minpts restriction?
        // But we only have core points guaranteed anyway.
        clustering.addToplevelCluster(new Cluster<Model>(current, ClusterModel.CLUSTER));
        current = DBIDUtil.newHashSet();
      }
      // Add to noise
      noise.add(it);
    }
  }
  // Any unfinished cluster will also be added
  if (!current.isEmpty()) {
    clustering.addToplevelCluster(new Cluster<Model>(current, ClusterModel.CLUSTER));
  }
  // Add noise
  clustering.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return clustering;
}
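For orientation, a minimal usage sketch (not part of the ELKI sources): build a toy database, compute a cluster order with OPTICS, then cut it. The OPTICSHeap constructor and the epsilon/minPts/cut values are assumptions based on the ELKI 0.7.x API and are illustrative only.

import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.ClusterOrder;
import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.OPTICSCut;
import de.lmu.ifi.dbs.elki.algorithm.clustering.optics.OPTICSHeap;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;

public class OPTICSCutSketch {
  public static void main(String[] args) {
    // Toy data: two dense groups plus one far-away point (example values).
    double[][] data = { { 0, 0 }, { .1, .1 }, { .2, 0 }, { 5, 5 }, { 5.1, 5 }, { 5, 5.2 }, { 9, 9 } };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    // Compute the cluster order with OPTICS (epsilon = 1.0, minPts = 2).
    OPTICSHeap<DoubleVector> optics = new OPTICSHeap<>(EuclideanDistanceFunction.STATIC, 1., 2);
    ClusterOrder co = optics.run(db);
    // Cut the reachability plot at 0.5; points above the cut become noise.
    Clustering<Model> cut = OPTICSCut.makeOPTICSCut(co, .5);
    System.out.println("Clusters incl. noise: " + cut.getAllClusters().size());
  }
}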
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class SLINKHDBSCANLinearMemory method run.
/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public PointerDensityHierarchyRepresentationResult run(Database db, Relation<O> relation) {
  final DistanceQuery<O> distQ = db.getDistanceQuery(relation, getDistanceFunction());
  final KNNQuery<O> knnQ = db.getKNNQuery(distQ, minPts);
  // We need array addressing later.
  final ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  // Compute the core distances
  // minPts + 1: ignore query point.
  final WritableDoubleDataStore coredists = computeCoreDists(ids, knnQ, minPts);
  WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
  WritableDoubleDataStore lambda = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Double.POSITIVE_INFINITY);
  // Temporary storage for m.
  WritableDoubleDataStore m = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running HDBSCAN*-SLINK", ids.size(), LOG) : null;
  // Has to be an array for monotonicity reasons!
  ModifiableDBIDs processedIDs = DBIDUtil.newArray(ids.size());
  for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
    // Steps 1,3,4 are exactly as in SLINK
    step1(id, pi, lambda);
    // Step 2 is modified to use a different distance
    step2(id, processedIDs, distQ, coredists, m);
    step3(id, pi, lambda, processedIDs, m);
    step4(id, pi, lambda, processedIDs);
    processedIDs.add(id);
    LOG.incrementProcessed(progress);
  }
  LOG.ensureCompleted(progress);
  return new PointerDensityHierarchyRepresentationResult(ids, pi, lambda, distQ.getDistanceFunction().isSquared(), coredists);
}
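Unlike the flat clusterings in the other snippets, this returns a pointer hierarchy: per-object parent links and merge distances. A hedged sketch of running it directly, assuming the ELKI 0.7.x constructor (distance function, minPts); the data and minPts = 3 are example values.

import de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical.PointerDensityHierarchyRepresentationResult;
import de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical.SLINKHDBSCANLinearMemory;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.database.datastore.DBIDDataStore;
import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.EuclideanDistanceFunction;

public class HDBSCANSketch {
  public static void main(String[] args) {
    double[][] data = { { 0, 0 }, { .1, 0 }, { 0, .2 }, { 4, 4 }, { 4.1, 4 }, { 8, 0 } };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    SLINKHDBSCANLinearMemory<DoubleVector> alg = new SLINKHDBSCANLinearMemory<>(EuclideanDistanceFunction.STATIC, 3);
    PointerDensityHierarchyRepresentationResult result = alg.run(db);
    DBIDDataStore pi = result.getParentStore(); // parent pointer per object
    DoubleDataStore lambda = result.getParentDistanceStore(); // merge distance per object
    System.out.println("Hierarchy over " + result.getDBIDs().size() + " objects built.");
  }
}

A flat clustering would then be obtained by feeding this pointer hierarchy into one of ELKI's hierarchy extraction algorithms.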
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class BIRCHLeafClustering method run.
/**
 * Run the clustering algorithm.
 *
 * @param relation Input data
 * @return Clustering
 */
public Clustering<MeanModel> run(Relation<NumberVector> relation) {
  final int dim = RelationUtil.dimensionality(relation);
  CFTree tree = cffactory.newTree(relation.getDBIDs(), relation);
  // The CFTree does not store points. We have to reassign them (and the
  // quality is better than with the initial assignment, because the centers
  // move, in particular in the beginning, so we would otherwise have many
  // outliers).
  Map<ClusteringFeature, ModifiableDBIDs> idmap = new HashMap<>(tree.leaves);
  for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
    ClusteringFeature cf = tree.findLeaf(relation.get(iter));
    ModifiableDBIDs ids = idmap.get(cf);
    if (ids == null) {
      idmap.put(cf, ids = DBIDUtil.newArray(cf.n));
    }
    ids.add(iter);
  }
  Clustering<MeanModel> result = new Clustering<>("BIRCH-leaves", "BIRCH leaves");
  for (Map.Entry<ClusteringFeature, ModifiableDBIDs> ent : idmap.entrySet()) {
    ClusteringFeature leaf = ent.getKey();
    double[] center = new double[dim];
    for (int i = 0; i < dim; i++) {
      center[i] = leaf.centroid(i);
    }
    result.addToplevelCluster(new Cluster<>(ent.getValue(), new MeanModel(center)));
  }
  return result;
}
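A usage sketch in the style of ELKI's unit tests; ELKIBuilder and the CFTree.Factory.Parameterizer.MAXLEAVES_ID option name are assumptions based on the ELKI 0.7.5 API, and the data and parameter values are illustrative only.

import de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical.birch.BIRCHLeafClustering;
import de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical.birch.CFTree;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.MeanModel;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.utilities.ELKIBuilder;

public class BIRCHLeafSketch {
  public static void main(String[] args) {
    double[][] data = { { 0, 0 }, { .1, .1 }, { 4, 4 }, { 4.2, 4 }, { 8, 8 } };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    // Cap the number of leaf entries (assumed option name), hence of clusters.
    Clustering<MeanModel> result = new ELKIBuilder<>(BIRCHLeafClustering.class) //
        .with(CFTree.Factory.Parameterizer.MAXLEAVES_ID, 2) //
        .build().run(db);
    for (Cluster<MeanModel> c : result.getAllClusters()) {
      System.out.println(c.size() + " points, mean[0] = " + c.getModel().getMean()[0]);
    }
  }
}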
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class LMCLUS method run.
/**
 * The main LMCLUS (linear manifold clustering) algorithm is processed in this
 * method.
 *
 * <PRE>
 * The algorithm samples random linear manifolds and tries to find clusters in them.
 * It calculates a distance histogram, searches for a threshold, and partitions the
 * points into two groups: the ones in the cluster and everything else.
 * Then the best fitting linear manifold is searched and registered as a cluster.
 * The process is repeated until all points are clustered.
 * The last cluster should contain all the outliers (or the whole data if no clusters have been found).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
  Clustering<Model> ret = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
  IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
  ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
  Random r = rnd.getSingleThreadedRandom();
  final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
  int cnum = 0;
  while (unclustered.size() > minsize) {
    DBIDs current = unclustered;
    int lmDim = 1;
    for (int k = 1; k <= maxdim; k++) {
      // Refine the partition at dimensionality k until no good separation remains.
      while (true) {
        Separation separation = findSeparation(relation, current, k, r);
        if (separation.goodness <= sensitivityThreshold) {
          break;
        }
        ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
        for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
          if (deviation(minusEquals(relation.get(iter).toArray(), separation.originV), separation.basis) < separation.threshold) {
            subset.add(iter);
          }
        }
        // logger.verbose("size:"+subset.size());
        if (subset.size() < minsize) {
          break;
        }
        current = subset;
        lmDim = k;
        // System.out.println("Partition: " + subset.size());
      }
    }
    // No more clusters found
    if (current.size() < minsize || current == unclustered) {
      break;
    }
    // New cluster found
    // TODO: annotate cluster with dimensionality
    final Cluster<Model> cluster = new Cluster<>(current);
    cluster.setName("Cluster_" + lmDim + "d_" + cnum);
    cnum++;
    ret.addToplevelCluster(cluster);
    // Remove from main working set.
    unclustered.removeDBIDs(current);
    if (progress != null) {
      progress.setProcessed(relation.size() - unclustered.size(), LOG);
    }
    if (cprogress != null) {
      cprogress.setProcessed(cnum, LOG);
    }
  }
  // Remaining objects are noise
  if (unclustered.size() > 0) {
    ret.addToplevelCluster(new Cluster<>(unclustered, true));
  }
  if (progress != null) {
    progress.setProcessed(relation.size(), LOG);
    progress.ensureCompleted(LOG);
  }
  LOG.setCompleted(cprogress);
  return ret;
}
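A hedged sketch of running LMCLUS on synthetic data lying near a 1-dimensional manifold. The constructor argument order (maxdim, minsize, samplingLevel, sensitivityThreshold, random) is an assumption inferred from the fields used above, and all parameter values are examples.

import de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.LMCLUS;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.Model;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.utilities.random.RandomFactory;

public class LMCLUSSketch {
  public static void main(String[] args) {
    // 100 points scattered along the x-axis, i.e. near a 1d linear manifold.
    double[][] data = new double[100][2];
    java.util.Random rnd = new java.util.Random(0);
    for (int i = 0; i < data.length; i++) {
      data[i][0] = rnd.nextDouble() * 10;
      data[i][1] = rnd.nextGaussian() * .01;
    }
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    // Assumed constructor: maxdim = 1, minsize = 20, samplingLevel = 100,
    // sensitivityThreshold = 0.5, default random source.
    LMCLUS lmclus = new LMCLUS(1, 20, 100, .5, RandomFactory.DEFAULT);
    Clustering<Model> result = lmclus.run(db, db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD));
    for (Cluster<Model> c : result.getAllClusters()) {
      System.out.println(c.getNameAutomatic() + ": " + c.size() + (c.isNoise() ? " (noise)" : ""));
    }
  }
}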
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class EM method run.
/**
 * Performs the EM clustering algorithm on the given database.
 *
 * Finally, a hard clustering is provided, where each cluster is assigned the
 * points that exhibit the highest probability of belonging to it. But the
 * database objects still hold the complete probability vector for all models.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 */
public Clustering<M> run(Database database, Relation<V> relation) {
  if (relation.size() == 0) {
    throw new IllegalArgumentException("database empty: must contain elements");
  }
  // initial models
  List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
  WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
  if (LOG.isStatistics()) {
    LOG.statistics(likestat.setDouble(loglikelihood));
  }
  // iterate until convergence
  int it = 0, lastimprovement = 0;
  // For detecting instabilities.
  double bestloglikelihood = loglikelihood;
  for (++it; it < maxiter || maxiter < 0; it++) {
    final double oldloglikelihood = loglikelihood;
    recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
    // reassign probabilities
    loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    if (LOG.isStatistics()) {
      LOG.statistics(likestat.setDouble(loglikelihood));
    }
    if (loglikelihood - bestloglikelihood > delta) {
      lastimprovement = it;
      bestloglikelihood = loglikelihood;
    }
    // Stop when converged, or when there was no improvement in the second half.
    if (Math.abs(loglikelihood - oldloglikelihood) <= delta || lastimprovement < (it >> 1)) {
      break;
    }
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", it));
  }
  // fill result with clusters and models
  List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
  for (int i = 0; i < k; i++) {
    hardClusters.add(DBIDUtil.newArray());
  }
  // provide a hard clustering
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    hardClusters.get(argmax(probClusterIGivenX.get(iditer))).add(iditer);
  }
  Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
  // provide models within the result
  for (int i = 0; i < k; i++) {
    result.addToplevelCluster(new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster()));
  }
  if (isSoft()) {
    result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
  } else {
    probClusterIGivenX.destroy();
  }
  return result;
}
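Finally, a usage sketch in the style of ELKI's unit tests; ELKIBuilder and the EM.Parameterizer.K_ID option name are assumptions based on the ELKI 0.7.x API, with k = 2 and the data as example values (unset parameters fall back to their defaults).

import de.lmu.ifi.dbs.elki.algorithm.clustering.em.EM;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.utilities.ELKIBuilder;

public class EMSketch {
  public static void main(String[] args) {
    double[][] data = { { 0, 0 }, { .2, .1 }, { .1, .3 }, { 5, 5 }, { 5.2, 4.9 }, { 4.9, 5.1 } };
    Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
    db.initialize();
    // Fit a mixture of k = 2 Gaussians with the default initialization.
    Clustering<?> result = new ELKIBuilder<>(EM.class) //
        .with(EM.Parameterizer.K_ID, 2) //
        .build().run(db);
    for (Cluster<?> c : result.getAllClusters()) {
      System.out.println("Cluster with " + c.size() + " objects");
    }
  }
}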