use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.
the class LMCLUS method run.
/**
 * Runs the main loop of LMCLUS (linear manifold clustering).
 *
 * <PRE>
 * Random linear manifolds are sampled and searched for clusters: a distance
 * histogram is computed, a threshold is searched in it, and the points are
 * split into those that belong to the cluster and everything else.
 * The best fitting linear manifold found this way is registered as a cluster,
 * and the procedure starts over on the points that are still unclustered.
 * Whatever is left at the end is reported as one final noise cluster (this is
 * the whole data set if no cluster was found at all).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
  final Clustering<Model> result = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
  final FiniteProgress objectProgress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
  final IndefiniteProgress clusterProgress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
  final ModifiableDBIDs remaining = DBIDUtil.newHashSet(relation.getDBIDs());
  final Random random = rnd.getSingleThreadedRandom();
  final int dimLimit = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
  int found = 0;
  while (remaining.size() > minsize) {
    // Start from everything that is still unclustered and narrow it down.
    DBIDs candidate = remaining;
    int manifoldDim = 1;
    for (int dim = 1; dim <= dimLimit; dim++) {
      // At this dimensionality, keep splitting until the candidate set is
      // either no longer separable or the split would become too small.
      for (;;) {
        final Separation sep = findSeparation(relation, candidate, dim, random);
        if (sep.goodness <= sensitivityThreshold) {
          // Separation is not good enough; try the next dimensionality.
          break;
        }
        // Keep only the points whose deviation is below the threshold.
        final ModifiableDBIDs inside = DBIDUtil.newArray(candidate.size());
        for (DBIDIter it = candidate.iter(); it.valid(); it.advance()) {
          final double dev = deviation(minusEquals(relation.get(it).toArray(), sep.originV), sep.basis);
          if (dev < sep.threshold) {
            inside.add(it);
          }
        }
        if (inside.size() < minsize) {
          // Too few points survived; discard this split.
          break;
        }
        candidate = inside;
        manifoldDim = dim;
      }
    }
    // Nothing was split off (or the set is too small): no more clusters.
    if (candidate.size() < minsize || candidate == remaining) {
      break;
    }
    // Register the new cluster.
    // TODO: annotate cluster with dimensionality
    final Cluster<Model> cluster = new Cluster<>(candidate);
    cluster.setName("Cluster_" + manifoldDim + "d_" + found);
    found++;
    result.addToplevelCluster(cluster);
    // The clustered points leave the working set.
    remaining.removeDBIDs(candidate);
    if (objectProgress != null) {
      objectProgress.setProcessed(relation.size() - remaining.size(), LOG);
    }
    if (clusterProgress != null) {
      clusterProgress.setProcessed(found, LOG);
    }
  }
  // Everything that was never clustered becomes the noise cluster.
  if (remaining.size() > 0) {
    result.addToplevelCluster(new Cluster<>(remaining, true));
  }
  if (objectProgress != null) {
    objectProgress.setProcessed(relation.size(), LOG);
    objectProgress.ensureCompleted(LOG);
  }
  LOG.setCompleted(clusterProgress);
  return result;
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.
the class EM method recomputeCovarianceMatrices.
/**
 * Update all cluster models from the current object-to-cluster
 * probabilities.
 *
 * Every model is first told to begin its update. If at least one model
 * reports that it needs two passes, a first pass over the data feeds all
 * models, followed by a finalization of that pass. The main pass then feeds
 * every model and accumulates the total probability mass per cluster, from
 * which the new cluster weight (MLE or MAP) is derived.
 *
 * @param relation Vector data
 * @param probClusterIGivenX Object probabilities
 * @param models Cluster models to update
 * @param prior MAP prior (use 0 for MLE)
 */
public static void recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models, double prior) {
  final int k = models.size();
  boolean twoPass = false;
  for (EMClusterModel<?> model : models) {
    model.beginEStep();
    twoPass |= model.needsTwoPass();
  }
  // Optional first pass, only run when some model asked for it.
  if (twoPass) {
    for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
      final double[] probs = probClusterIGivenX.get(id);
      final NumberVector vec = relation.get(id);
      for (int c = 0; c < probs.length; c++) {
        final double p = probs[c];
        // Skip negligible contributions.
        if (p > 1e-10) {
          models.get(c).firstPassE(vec, p);
        }
      }
    }
    for (EMClusterModel<?> model : models) {
      model.finalizeFirstPassE();
    }
  }
  // Main pass: update the models and sum up the probability mass per cluster.
  final double[] mass = new double[k];
  for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
    final double[] probs = probClusterIGivenX.get(id);
    final NumberVector vec = relation.get(id);
    for (int c = 0; c < probs.length; c++) {
      final double p = probs[c];
      if (p > 1e-10) {
        models.get(c).updateE(vec, p);
      }
      // The mass is accumulated even for negligible probabilities.
      mass[c] += p;
    }
  }
  final int n = relation.size();
  for (int c = 0; c < k; c++) {
    final double weight;
    if (prior <= 0.) {
      // MLE
      weight = mass[c] / n;
    }
    else {
      // MAP
      weight = (mass[c] + prior - 1) / (n + prior * k - k);
    }
    models.get(c).finalizeEStep(weight, prior);
  }
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.
the class EM method run.
/**
 * Performs the EM clustering algorithm on the given relation.
 *
 * The returned result is a hard clustering: every object is put into the
 * cluster for which its probability vector has the largest entry. When soft
 * output is enabled, the complete probability vectors are additionally
 * attached to the result; otherwise their storage is released.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 * @throws IllegalArgumentException if the relation contains no objects
 */
public Clustering<M> run(Database database, Relation<V> relation) {
  if (relation.size() == 0) {
    throw new IllegalArgumentException("database empty: must contain elements");
  }
  // Initial models and the initial soft assignment.
  List<? extends EMClusterModel<M>> emModels = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
  WritableDataStore<double[]> memberships = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  double currentLL = assignProbabilitiesToInstances(relation, emModels, memberships);
  DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
  if (LOG.isStatistics()) {
    LOG.statistics(likestat.setDouble(currentLL));
  }
  // Iterate until the log likelihood stops changing.
  int iteration = 1, lastImprovedAt = 0;
  // Best value seen so far, used for detecting instabilities.
  double bestLL = currentLL;
  for (; iteration < maxiter || maxiter < 0; iteration++) {
    final double previousLL = currentLL;
    recomputeCovarianceMatrices(relation, memberships, emModels, prior);
    // Reassign probabilities with the updated models.
    currentLL = assignProbabilitiesToInstances(relation, emModels, memberships);
    if (LOG.isStatistics()) {
      LOG.statistics(likestat.setDouble(currentLL));
    }
    if (currentLL - bestLL > delta) {
      lastImprovedAt = iteration;
      bestLL = currentLL;
    }
    // Stop on convergence, or when the last real improvement happened before
    // the first half of the iterations performed so far.
    final boolean converged = Math.abs(currentLL - previousLL) <= delta;
    final boolean stalled = lastImprovedAt < (iteration >> 1);
    if (converged || stalled) {
      break;
    }
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // One (initially empty) member list per cluster.
  List<ModifiableDBIDs> members = new ArrayList<>(k);
  for (int c = 0; c < k; c++) {
    members.add(DBIDUtil.newArray());
  }
  // Hard assignment: each object goes to its most probable cluster.
  for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
    final int winner = argmax(memberships.get(id));
    members.get(winner).add(id);
  }
  Clustering<M> clustering = new Clustering<>("EM Clustering", "em-clustering");
  // Attach the final model to each cluster.
  for (int c = 0; c < k; c++) {
    clustering.addToplevelCluster(new Cluster<>(members.get(c), emModels.get(c).finalizeCluster()));
  }
  if (!isSoft()) {
    // The probability vectors are not part of the result; free them.
    memberships.destroy();
  }
  else {
    clustering.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, memberships, relation.getDBIDs()));
  }
  return clustering;
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.
the class KMeansElkan method initialAssignToNearestCluster.
/**
 * Perform the initial assignment: every object is compared against all
 * means, assigned to the nearest one, and its bounds are initialized.
 *
 * For each object, the distance to every mean is stored in its lower bound
 * array, the distance to the nearest mean becomes its upper bound, and the
 * object's coordinates are added to the sum of the chosen cluster.
 *
 * @param relation Data
 * @param means Current means
 * @param sums Per-cluster coordinate sums, updated in place
 * @param clusters Current clusters
 * @param assignment Cluster assignment
 * @param upper Upper bounds
 * @param lower Lower bounds
 * @return Number of changes (i.e. relation size)
 */
private int initialAssignToNearestCluster(Relation<V> relation, double[][] means, double[][] sums, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, WritableDoubleDataStore upper, WritableDataStore<double[]> lower) {
  assert (k == means.length);
  final boolean squared = distanceFunction.isSquared();
  for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
    final V vec = relation.get(id);
    final double[] bounds = lower.get(id);
    // Exhaustive search over all means.
    int nearest = -1;
    double nearestDist = Double.POSITIVE_INFINITY;
    for (int c = 0; c < k; c++) {
      double toMean = distanceFunction.distance(vec, DoubleVector.wrap(means[c]));
      if (squared) {
        // Bounds are kept as non-squared distances.
        toMean = FastMath.sqrt(toMean);
      }
      bounds[c] = toMean;
      if (toMean < nearestDist) {
        nearest = c;
        nearestDist = toMean;
      }
    }
    // Record the assignment and the upper bound.
    clusters.get(nearest).add(id);
    assignment.putInt(id, nearest);
    upper.putDouble(id, nearestDist);
    // Add the object to the running sum of its cluster.
    final double[] sum = sums[nearest];
    for (int d = 0, dims = vec.getDimensionality(); d < dims; d++) {
      sum[d] += vec.doubleValue(d);
    }
  }
  return relation.size();
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.
the class KMeansElkan method assignToNearestCluster.
/**
* Reassign objects, but only if their bounds indicate it is necessary to do
* so.
*
* Objects whose upper bound does not exceed the separation value of their
* current cluster are skipped entirely. For the others, a candidate mean is
* only evaluated when neither the object's lower bound for it nor the
* center-to-center distance rules it out. When an object moves, it is moved
* between the cluster member lists and its coordinates are added to the new
* cluster's sum and subtracted from the old one.
*
* @param relation Data
* @param means Current means
* @param sums New means (per-cluster coordinate sums, updated in place)
* @param clusters Current clusters
* @param assignment Cluster assignment
* @param sep Separation of means
* @param cdist Center-to-center distances
* @param upper Upper bounds
* @param lower Lower bounds
* @return Number of objects that were reassigned
*/
private int assignToNearestCluster(Relation<V> relation, double[][] means, double[][] sums, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] sep, double[][] cdist, WritableDoubleDataStore upper, WritableDataStore<double[]> lower) {
assert (k == means.length);
final boolean issquared = distanceFunction.isSquared();
int changed = 0;
for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
final int orig = assignment.intValue(it);
double u = upper.doubleValue(it);
// Upper bound check (#2):
// the object cannot change cluster, skip it without any distance computation.
if (u <= sep[orig]) {
continue;
}
// Elkan's r(x)
// The stored upper bound is recomputed at most once per object (see below).
boolean recompute_u = true;
V fv = relation.get(it);
double[] l = lower.get(it);
// Check all (other) means:
// "cur" tracks the best cluster found so far and may change inside the loop.
int cur = orig;
for (int j = 0; j < k; j++) {
if (orig == j || u <= l[j] || u <= cdist[cur][j]) {
// Condition #3 i-iii not satisfied
continue;
}
if (recompute_u) {
// Need to update bound? #3a
// Replace the bound by the actual (non-squared) distance to the current mean.
u = distanceFunction.distance(fv, DoubleVector.wrap(means[cur]));
u = issquared ? FastMath.sqrt(u) : u;
upper.putDouble(it, u);
// Once only
recompute_u = false;
// Re-test the pruning conditions with the tightened bound.
if (u <= l[j] || u <= cdist[cur][j]) {
// #3b
continue;
}
}
// Candidate could not be pruned: compute its distance and refresh the lower bound.
double dist = distanceFunction.distance(fv, DoubleVector.wrap(means[j]));
dist = issquared ? FastMath.sqrt(dist) : dist;
l[j] = dist;
if (dist < u) {
cur = j;
u = dist;
}
}
// Object is to be reassigned.
if (cur != orig) {
// Remember bound.
upper.putDouble(it, u);
ModifiableDBIDs newc = clusters.get(cur);
newc.add(it);
assignment.putInt(it, cur);
double[] newmean = sums[cur];
ModifiableDBIDs oldc = clusters.get(orig);
oldc.remove(it);
double[] oldmean = sums[orig];
// Move the object's coordinates from the old cluster sum to the new one.
for (int d = 0; d < fv.getDimensionality(); d++) {
final double v = fv.doubleValue(d);
newmean[d] += v;
oldmean[d] -= v;
}
++changed;
}
}
return changed;
}
Aggregations