Search in sources :

Example 26 with DBIDIter

use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.

the class LMCLUS method run.

/**
 * Run LMCLUS (linear manifold clustering) on the given relation.
 *
 * <p>
 * As long as more than {@code minsize} objects are unclustered, the search
 * starts from the whole unclustered set. For every manifold dimensionality from
 * 1 up to the smaller of {@code maxLMDim} and the data dimensionality, a
 * separation is requested from {@code findSeparation}; while its goodness
 * exceeds {@code sensitivityThreshold}, the candidate set is narrowed to the
 * objects whose deviation from the manifold is below the separation threshold.
 * Narrowing at one dimensionality ends when the separation is not good enough
 * or the narrowed set would hold fewer than {@code minsize} objects.
 * </p>
 *
 * <p>
 * If narrowing produced a set of at least {@code minsize} objects, it is
 * registered as a top-level cluster named after the last dimensionality that
 * narrowed it and a running counter, and removed from the unclustered set.
 * Otherwise the search ends. All objects still unclustered at the end are
 * reported as one noise cluster. For details see {@link LMCLUS}.
 * </p>
 *
 * @param database Database (not referenced by this method)
 * @param relation Relation holding the vectors to cluster
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
    Clustering<Model> result = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
    FiniteProgress objectProgress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
    IndefiniteProgress clusterProgress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
    ModifiableDBIDs remaining = DBIDUtil.newHashSet(relation.getDBIDs());
    Random random = rnd.getSingleThreadedRandom();
    final int dimLimit = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
    int found = 0;
    while (remaining.size() > minsize) {
        // Start every search from all objects that are still unclustered.
        DBIDs candidates = remaining;
        int bestDim = 1;
        for (int dim = 1; dim <= dimLimit; dim++) {
            // Keep narrowing at this dimensionality until the separation is
            // too weak or the narrowed set becomes too small.
            for (;;) {
                final Separation sep = findSeparation(relation, candidates, dim, random);
                if (sep.goodness <= sensitivityThreshold) {
                    break;
                }
                ModifiableDBIDs narrowed = DBIDUtil.newArray(candidates.size());
                for (DBIDIter it = candidates.iter(); it.valid(); it.advance()) {
                    final double[] centered = minusEquals(relation.get(it).toArray(), sep.originV);
                    if (deviation(centered, sep.basis) < sep.threshold) {
                        narrowed.add(it);
                    }
                }
                if (narrowed.size() < minsize) {
                    break;
                }
                candidates = narrowed;
                bestDim = dim;
            }
        }
        // Nothing was split off (or too little is left): no more clusters.
        if (candidates == remaining || candidates.size() < minsize) {
            break;
        }
        // Register the new cluster.
        // TODO: annotate cluster with dimensionality
        final Cluster<Model> cluster = new Cluster<>(candidates);
        cluster.setName("Cluster_" + bestDim + "d_" + found);
        found++;
        result.addToplevelCluster(cluster);
        // Take the clustered objects out of the working set.
        remaining.removeDBIDs(candidates);
        if (objectProgress != null) {
            objectProgress.setProcessed(relation.size() - remaining.size(), LOG);
        }
        if (clusterProgress != null) {
            clusterProgress.setProcessed(found, LOG);
        }
    }
    // Whatever could not be clustered is reported as noise.
    if (remaining.size() > 0) {
        result.addToplevelCluster(new Cluster<>(remaining, true));
    }
    if (objectProgress != null) {
        objectProgress.setProcessed(relation.size(), LOG);
        objectProgress.ensureCompleted(LOG);
    }
    LOG.setCompleted(clusterProgress);
    return result;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Random(java.util.Random) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) Model(de.lmu.ifi.dbs.elki.data.model.Model) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 27 with DBIDIter

use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.

the class EM method recomputeCovarianceMatrices.

/**
 * Update all cluster models from the current object-to-cluster probabilities.
 *
 * Every model is first told that a new step begins. If at least one model
 * reports that it needs two passes, a first pass over the data feeds every
 * object to each model for which its probability exceeds 1e-10, and that first
 * pass is then finalized on all models. The main pass feeds the objects again
 * (same probability cut-off) and sums the probabilities per cluster; these sums
 * are turned into the cluster weights passed to the models when finalizing.
 *
 * @param relation Vector data
 * @param probClusterIGivenX Object probabilities, one array entry per cluster
 * @param models Cluster models to update
 * @param prior MAP prior (use 0 for MLE)
 */
public static void recomputeCovarianceMatrices(Relation<? extends NumberVector> relation, WritableDataStore<double[]> probClusterIGivenX, List<? extends EMClusterModel<?>> models, double prior) {
    final int k = models.size();
    boolean twoPass = false;
    for (EMClusterModel<?> model : models) {
        model.beginEStep();
        if (model.needsTwoPass()) {
            twoPass = true;
        }
    }
    // Optional first pass; only executed if some model asked for it.
    if (twoPass) {
        for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
            final double[] probs = probClusterIGivenX.get(it);
            final NumberVector vec = relation.get(it);
            for (int c = 0; c < probs.length; c++) {
                final double p = probs[c];
                if (p > 1e-10) {
                    models.get(c).firstPassE(vec, p);
                }
            }
        }
        for (EMClusterModel<?> model : models) {
            model.finalizeFirstPassE();
        }
    }
    // Main pass: update the models and accumulate the per-cluster weight sums.
    final double[] weightSums = new double[k];
    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        final double[] probs = probClusterIGivenX.get(it);
        final NumberVector vec = relation.get(it);
        for (int c = 0; c < probs.length; c++) {
            final double p = probs[c];
            if (p > 1e-10) {
                models.get(c).updateE(vec, p);
            }
            // Negligible probabilities still count towards the weight sum.
            weightSums[c] += p;
        }
    }
    final int n = relation.size();
    for (int c = 0; c < models.size(); c++) {
        final double weight;
        if (prior <= 0.) {
            // MLE
            weight = weightSums[c] / n;
        } else {
            // MAP
            weight = (weightSums[c] + prior - 1) / (n + prior * k - k);
        }
        models.get(c).finalizeEStep(weight, prior);
    }
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 28 with DBIDIter

use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.

the class EM method run.

/**
 * Run EM clustering on the given relation.
 *
 * Initial models are built by the model factory, then the method alternates
 * between updating the models and reassigning the object probabilities. It
 * stops when the log likelihood changes by at most {@code delta}, when the
 * last improvement of the best log likelihood lies further back than half of
 * the current iteration number, or when {@code maxiter} is reached (a negative
 * {@code maxiter} disables this limit).
 *
 * The result is a hard clustering: every object is put into the cluster with
 * the highest probability. When soft assignments are enabled, the full
 * probability arrays are attached to the result as a child relation; otherwise
 * the probability storage is destroyed.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 * @throws IllegalArgumentException if the relation contains no objects
 */
public Clustering<M> run(Database database, Relation<V> relation) {
    if (relation.size() == 0) {
        throw new IllegalArgumentException("database empty: must contain elements");
    }
    // Set up the initial models and the probability storage.
    List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
    if (LOG.isStatistics()) {
        LOG.statistics(likestat.setDouble(loglikelihood));
    }
    // Best value seen so far, used to detect instabilities.
    double bestloglikelihood = loglikelihood;
    int lastimprovement = 0;
    // The initial assignment above counts as iteration 0.
    int it = 1;
    while (it < maxiter || maxiter < 0) {
        final double previous = loglikelihood;
        recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
        // Reassign probabilities using the updated models.
        loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isStatistics()) {
            LOG.statistics(likestat.setDouble(loglikelihood));
        }
        if (loglikelihood - bestloglikelihood > delta) {
            lastimprovement = it;
            bestloglikelihood = loglikelihood;
        }
        final boolean converged = Math.abs(loglikelihood - previous) <= delta;
        final boolean stalled = lastimprovement < (it >> 1);
        if (converged || stalled) {
            break;
        }
        it++;
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", it));
    }
    // One id collection per cluster for the hard assignment.
    List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
    for (int c = 0; c < k; c++) {
        hardClusters.add(DBIDUtil.newArray());
    }
    // Each object goes to its most probable cluster.
    for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
        final int winner = argmax(probClusterIGivenX.get(iter));
        hardClusters.get(winner).add(iter);
    }
    Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
    // Attach the finalized model to every cluster.
    for (int c = 0; c < k; c++) {
        result.addToplevelCluster(new Cluster<>(hardClusters.get(c), models.get(c).finalizeCluster()));
    }
    if (!isSoft()) {
        probClusterIGivenX.destroy();
    } else {
        result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 29 with DBIDIter

use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.

the class KMeansElkan method initialAssignToNearestCluster.

/**
 * Perform the initial assignment: compute the distance of every object to
 * every mean, assign the object to the nearest mean and initialize its bounds.
 *
 * All computed distances are stored as the object's lower bounds, and the
 * distance to the chosen mean becomes its upper bound. Distances from a squared
 * distance function are square-rooted before being stored. The object's
 * coordinates are added to the sum of its cluster.
 *
 * @param relation Data
 * @param means Current means
 * @param sums Per-cluster coordinate sums, updated in place
 * @param clusters Current clusters, the objects are added to these
 * @param assignment Cluster assignment, written for every object
 * @param upper Upper bounds, written for every object
 * @param lower Lower bounds, written for every object and mean
 * @return Number of changes (i.e. relation size)
 */
private int initialAssignToNearestCluster(Relation<V> relation, double[][] means, double[][] sums, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, WritableDoubleDataStore upper, WritableDataStore<double[]> lower) {
    assert (k == means.length);
    final boolean squared = distanceFunction.isSquared();
    for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
        final V vec = relation.get(iter);
        final double[] bounds = lower.get(iter);
        // Scan all means, remembering every distance and the closest mean.
        int nearest = -1;
        double nearestDist = Double.POSITIVE_INFINITY;
        for (int c = 0; c < k; c++) {
            final double raw = distanceFunction.distance(vec, DoubleVector.wrap(means[c]));
            final double dist = squared ? FastMath.sqrt(raw) : raw;
            bounds[c] = dist;
            if (dist < nearestDist) {
                nearest = c;
                nearestDist = dist;
            }
        }
        // Record the assignment and the exact upper bound.
        clusters.get(nearest).add(iter);
        assignment.putInt(iter, nearest);
        upper.putDouble(iter, nearestDist);
        // Add the vector to the coordinate sum of its cluster.
        final double[] sum = sums[nearest];
        final int dims = vec.getDimensionality();
        for (int d = 0; d < dims; d++) {
            sum[d] += vec.doubleValue(d);
        }
    }
    return relation.size();
}
Also used : ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 30 with DBIDIter

use of de.lmu.ifi.dbs.elki.database.ids.DBIDIter in project elki by elki-project.

the class KMeansElkan method assignToNearestCluster.

/**
 * Reassign objects, but only if their bounds indicate it is necessary to do
 * so.
 *
 * An object is skipped entirely when its stored upper bound is at most
 * {@code sep[orig]} for its current cluster. Otherwise each other mean is
 * tested against the object's lower bound and against {@code cdist}; a distance
 * is only computed when neither test rules the mean out. The upper bound is
 * made exact (recomputed against the current mean) at most once per object,
 * right before the first distance to another mean would be computed. Distances
 * from a squared distance function are square-rooted before being used as
 * bounds.
 *
 * When an object moves, both cluster id collections, the assignment store and
 * the coordinate sums of the old and new cluster are updated.
 *
 * @param relation Data
 * @param means Current means
 * @param sums New means (per-cluster coordinate sums, adjusted in place)
 * @param clusters Current clusters
 * @param assignment Cluster assignment
 * @param sep Separation of means
 * @param cdist Center-to-center distances
 * @param upper Upper bounds
 * @param lower Lower bounds
 * @return Number of objects that were reassigned to a different cluster
 */
private int assignToNearestCluster(Relation<V> relation, double[][] means, double[][] sums, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] sep, double[][] cdist, WritableDoubleDataStore upper, WritableDataStore<double[]> lower) {
    assert (k == means.length);
    final boolean issquared = distanceFunction.isSquared();
    // Counts the objects that change cluster in this pass.
    int changed = 0;
    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        final int orig = assignment.intValue(it);
        double u = upper.doubleValue(it);
        // Upper bound check (#2):
        if (u <= sep[orig]) {
            continue;
        }
        // Elkan's r(x)
        boolean recompute_u = true;
        V fv = relation.get(it);
        double[] l = lower.get(it);
        // Check all (other) means:
        int cur = orig;
        for (int j = 0; j < k; j++) {
            // NOTE(review): cdist is compared directly against the upper bound,
            // so it is presumably already scaled by the caller - confirm.
            if (orig == j || u <= l[j] || u <= cdist[cur][j]) {
                // Condition #3 i-iii not satisfied
                continue;
            }
            if (recompute_u) {
                // Need to update bound? #3a
                u = distanceFunction.distance(fv, DoubleVector.wrap(means[cur]));
                u = issquared ? FastMath.sqrt(u) : u;
                // Store the now exact bound, it is kept even without a reassignment.
                upper.putDouble(it, u);
                // Once only
                recompute_u = false;
                // Repeat the pruning tests with the exact bound.
                if (u <= l[j] || u <= cdist[cur][j]) {
                    // #3b
                    continue;
                }
            }
            double dist = distanceFunction.distance(fv, DoubleVector.wrap(means[j]));
            dist = issquared ? FastMath.sqrt(dist) : dist;
            // The computed distance is exact, so it replaces the lower bound.
            l[j] = dist;
            if (dist < u) {
                // Mean j is closer than the best candidate so far.
                cur = j;
                u = dist;
            }
        }
        // Object is to be reassigned.
        if (cur != orig) {
            // Remember bound.
            upper.putDouble(it, u);
            ModifiableDBIDs newc = clusters.get(cur);
            newc.add(it);
            assignment.putInt(it, cur);
            double[] newmean = sums[cur];
            ModifiableDBIDs oldc = clusters.get(orig);
            oldc.remove(it);
            double[] oldmean = sums[orig];
            // Move the vector's contribution from the old sum to the new one.
            for (int d = 0; d < fv.getDimensionality(); d++) {
                final double v = fv.doubleValue(d);
                newmean[d] += v;
                oldmean[d] -= v;
            }
            ++changed;
        }
    }
    return changed;
}
Also used : ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Aggregations

DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)329 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)78 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)76 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)72 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)70 ArrayList (java.util.ArrayList)61 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)56 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)56 MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)55 OutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta)55 DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax)54 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)53 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)42 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)40 DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter)34 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)31 BasicOutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta)30 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)25 ModifiableDoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList)24 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)21