Example 1 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class EvaluateVarianceRatioCriteria, method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return Variance Ratio Criteria
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    // FIXME: allow using a precomputed distance matrix!
    final SquaredEuclideanDistanceFunction df = SquaredEuclideanDistanceFunction.STATIC;
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    double vrc = 0.;
    int ignorednoise = 0;
    if (clusters.size() > 1) {
        NumberVector[] centroids = new NumberVector[clusters.size()];
        ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseOption);
        // Build global centroid and cluster count:
        final int dim = RelationUtil.dimensionality(rel);
        Centroid overallCentroid = new Centroid(dim);
        int clustercount = globalCentroid(overallCentroid, rel, clusters, centroids, noiseOption);
        // a: Distance to own centroid
        // b: Distance to overall centroid
        double a = 0, b = 0;
        Iterator<? extends Cluster<?>> ci = clusters.iterator();
        for (int i = 0; ci.hasNext(); i++) {
            Cluster<?> cluster = ci.next();
            if (cluster.size() <= 1 || cluster.isNoise()) {
                switch(noiseOption) {
                    case IGNORE_NOISE:
                        // Ignored
                        continue;
                    case TREAT_NOISE_AS_SINGLETONS:
                        // Singletons: a = 0 by definition.
                        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
                            b += df.distance(overallCentroid, rel.get(it));
                        }
                        // with NEXT cluster.
                        continue;
                    case MERGE_NOISE:
                        // Treat like a cluster below:
                        break;
                }
            }
            for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
                NumberVector vec = rel.get(it);
                a += df.distance(centroids[i], vec);
                b += df.distance(overallCentroid, vec);
            }
        }
        vrc = ((b - a) / a) * ((rel.size() - clustercount) / (clustercount - 1.));
        // Only if {@link NoiseHandling#IGNORE_NOISE}:
        if (penalize && ignorednoise > 0) {
            vrc *= (rel.size() - ignorednoise) / (double) rel.size();
        }
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".vrc.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".vrc.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".vrc", vrc));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Variance Ratio Criteria", vrc, 0., 1., 0., false);
    return vrc;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) SquaredEuclideanDistanceFunction(de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
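
The ratio computed in evaluateClustering corresponds to the Calinski-Harabasz Variance Ratio Criterion: in the loop, a accumulates the squared distances of points to their own cluster centroid and b the squared distances to the overall centroid, so b - a is the between-cluster dispersion. Below is a minimal sketch of the same formula on plain numbers; the class and helper names are made up and not part of ELKI.

public class VrcSketch {

    /**
     * @param a within-cluster sum of squared distances (to the own centroid)
     * @param b total sum of squared distances (to the overall centroid)
     * @param n number of points considered
     * @param k number of clusters
     * @return Variance Ratio Criterion, ((b - a) / (k - 1)) / (a / (n - k))
     */
    static double vrc(double a, double b, int n, int k) {
        return ((b - a) / a) * ((n - k) / (k - 1.));
    }

    public static void main(String[] args) {
        // Toy numbers: within-cluster scatter 10, total scatter 50, 100 points, 3 clusters.
        // Expected value: (40 / 10) * (97 / 2) = 194
        System.out.println(vrc(10., 50., 100, 3));
    }
}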

Example 2 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class APRIORI, method buildFrequentOneItemsets.

/**
 * Build the 1-itemsets.
 *
 * @param relation Data relation
 * @param dim Maximum dimensionality
 * @param needed Minimum support needed
 * @return 1-itemsets
 */
protected List<OneItemset> buildFrequentOneItemsets(final Relation<? extends SparseFeatureVector<?>> relation, final int dim, final int needed) {
    // TODO: use TIntList and prefill appropriately to avoid knowing "dim"
    // beforehand?
    int[] counts = new int[dim];
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        SparseFeatureVector<?> bv = relation.get(iditer);
        for (int it = bv.iter(); bv.iterValid(it); it = bv.iterAdvance(it)) {
            counts[bv.iterDim(it)]++;
        }
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(STAT + "1-items.candidates", dim));
    }
    // Generate initial candidates of length 1.
    List<OneItemset> frequent = new ArrayList<>(dim);
    for (int i = 0; i < dim; i++) {
        if (counts[i] >= needed) {
            frequent.add(new OneItemset(i, counts[i]));
        }
    }
    return frequent;
}
Also used : LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayList(java.util.ArrayList) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)
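
The counting step in buildFrequentOneItemsets is a per-dimension frequency count followed by a minimum-support filter. The following standalone sketch shows the same idea on a plain boolean transaction matrix; the names are hypothetical and ELKI's SparseFeatureVector iterator API is not used.

import java.util.ArrayList;
import java.util.List;

public class OneItemsetSketch {

    // Count how often each item (column) occurs and keep items reaching the support threshold.
    static List<int[]> frequentOneItemsets(boolean[][] transactions, int dim, int needed) {
        int[] counts = new int[dim];
        for (boolean[] row : transactions) {
            for (int d = 0; d < dim; d++) {
                if (row[d]) {
                    counts[d]++;
                }
            }
        }
        List<int[]> frequent = new ArrayList<>();
        for (int d = 0; d < dim; d++) {
            if (counts[d] >= needed) {
                // Each entry holds the item index and its support count.
                frequent.add(new int[] { d, counts[d] });
            }
        }
        return frequent;
    }

    public static void main(String[] args) {
        boolean[][] transactions = {
            { true, true, false },
            { true, false, true },
            { true, true, true },
        };
        // All three items reach a minimum support of 2 in this toy data.
        for (int[] f : frequentOneItemsets(transactions, 3, 2)) {
            System.out.println("item " + f[0] + " support " + f[1]);
        }
    }
}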

Example 3 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class EM, method run.

/**
 * Performs the EM clustering algorithm on the given database.
 *
 * Finally, a hard clustering is provided where each cluster gets assigned the
 * points exhibiting the highest probability of belonging to that cluster.
 * Nevertheless, the database objects still hold the complete probability
 * vector for all models.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 */
public Clustering<M> run(Database database, Relation<V> relation) {
    if (relation.size() == 0) {
        throw new IllegalArgumentException("database empty: must contain elements");
    }
    // initial models
    List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
    if (LOG.isStatistics()) {
        LOG.statistics(likestat.setDouble(loglikelihood));
    }
    // iteration unless no change
    int it = 0, lastimprovement = 0;
    // For detecting instabilities.
    double bestloglikelihood = loglikelihood;
    for (++it; it < maxiter || maxiter < 0; it++) {
        final double oldloglikelihood = loglikelihood;
        recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
        // reassign probabilities
        loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isStatistics()) {
            LOG.statistics(likestat.setDouble(loglikelihood));
        }
        if (loglikelihood - bestloglikelihood > delta) {
            lastimprovement = it;
            bestloglikelihood = loglikelihood;
        }
        if (Math.abs(loglikelihood - oldloglikelihood) <= delta || lastimprovement < it >> 1) {
            break;
        }
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", it));
    }
    // fill result with clusters and models
    List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
    for (int i = 0; i < k; i++) {
        hardClusters.add(DBIDUtil.newArray());
    }
    // provide a hard clustering
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        hardClusters.get(argmax(probClusterIGivenX.get(iditer))).add(iditer);
    }
    Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
    // provide models within the result
    for (int i = 0; i < k; i++) {
        result.addToplevelCluster(new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster()));
    }
    if (isSoft()) {
        result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
    } else {
        probClusterIGivenX.destroy();
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)
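
The loop in run keeps iterating until the absolute change in log-likelihood drops to delta or below, or until the last improvement happened before the first half of the iterations run so far (a guard against oscillation). Here is a small sketch of just that stopping rule on a made-up log-likelihood sequence; class name and values are illustrative only.

public class EmStoppingSketch {

    public static void main(String[] args) {
        double[] loglikelihoods = { -1000, -900, -870, -869.5, -869.4, -869.4 };
        double delta = 1e-2;
        double best = loglikelihoods[0];
        int lastimprovement = 0;
        for (int it = 1; it < loglikelihoods.length; it++) {
            double old = loglikelihoods[it - 1], cur = loglikelihoods[it];
            if (cur - best > delta) {
                lastimprovement = it;
                best = cur;
            }
            // Stop on a small absolute change, or when no improvement occurred in the second half of the run.
            if (Math.abs(cur - old) <= delta || lastimprovement < (it >> 1)) {
                System.out.println("stop after iteration " + it);
                break;
            }
        }
    }
}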

Example 4 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class KMedoidsEM, method run.

/**
 * Run k-medoids
 *
 * @param database Database
 * @param relation relation to use
 * @return result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    }
    DistanceQuery<V> distQ = null;
    // Only enforce a distance matrix for PAM initialization, which is slow.
    if (initializer instanceof PAMInitialMeans) {
        distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
    } else {
        distQ = database.getDistanceQuery(relation, getDistanceFunction());
    }
    // Choose initial medoids
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
    DBIDArrayMIter miter = medoids.iter();
    double[] mdists = new double[k];
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        HashSetModifiableDBIDs set = DBIDUtil.newHashSet(relation.size() / k);
        // Add medoids.
        set.add(miter.seek(i));
        clusters.add(set);
    }
    // Initial assignment to nearest medoids
    // TODO: reuse this information, from the build phase, when possible?
    double tc = assignToNearestCluster(miter, mdists, clusters, distQ);
    if (LOG.isStatistics()) {
        LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
    }
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
    // Swap phase
    int iteration = 0;
    DBIDVar best = DBIDUtil.newVar();
    while (true) {
        boolean changed = false;
        // Try to swap the medoid with a better cluster member:
        int i = 0;
        for (miter.seek(0); miter.valid(); miter.advance(), i++) {
            best.unset();
            double bestm = mdists[i];
            for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
                if (DBIDUtil.equal(miter, iter)) {
                    continue;
                }
                double sum = 0;
                for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
                    sum += distQ.distance(iter, iter2);
                }
                if (sum < bestm) {
                    best.set(iter);
                    bestm = sum;
                }
            }
            if (best.isSet() && !DBIDUtil.equal(miter, best)) {
                changed = true;
                assert (clusters.get(i).contains(best));
                medoids.set(i, best);
                mdists[i] = bestm;
            }
        }
        // Reassign
        if (!changed) {
            break;
        }
        double nc = assignToNearestCluster(miter, mdists, clusters, distQ);
        ++iteration;
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
        }
        LOG.incrementProcessed(prog);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
        result.addToplevelCluster(new Cluster<>(clusters.get(it.getOffset()), new MedoidModel(DBIDUtil.deref(it))));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) PAMInitialMeans(de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
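
The swap phase in run tries, for every cluster, each member as a replacement medoid and keeps the member with the smallest sum of distances to the other members. Below is a standalone sketch of that inner selection on a plain, symmetric distance matrix; the class and method names are hypothetical.

public class MedoidSwapSketch {

    // Return the member of the cluster that minimizes the sum of distances
    // to all other members, i.e. the best medoid candidate for this cluster.
    static int bestMedoid(double[][] dist, int[] members) {
        int best = members[0];
        double bestSum = Double.POSITIVE_INFINITY;
        for (int candidate : members) {
            double sum = 0;
            for (int other : members) {
                sum += dist[candidate][other];
            }
            if (sum < bestSum) {
                bestSum = sum;
                best = candidate;
            }
        }
        return best;
    }

    public static void main(String[] args) {
        double[][] dist = {
            { 0, 1, 4 },
            { 1, 0, 2 },
            { 4, 2, 0 },
        };
        // Point 1 has the smallest distance sum (1 + 2 = 3) and is picked as medoid.
        System.out.println(bestMedoid(dist, new int[] { 0, 1, 2 }));
    }
}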

Example 5 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class AbstractMTree, method logStatistics.

@Override
public void logStatistics() {
    super.logStatistics();
    Logging log = getLogger();
    if (log.isStatistics()) {
        log.statistics(new LongStatistic(this.getClass().getName() + ".height", getHeight()));
        statistics.logStatistics();
    }
}
Also used : Logging(de.lmu.ifi.dbs.elki.logging.Logging) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
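
All five examples follow the same pattern: guard with isStatistics(), then log a keyed counter through LongStatistic. The following minimal usage sketch mirrors that pattern; the class MyAlgorithm and the ".iterations" key are made up, but the calls follow the examples shown above.

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;

public class MyAlgorithm {

    private static final Logging LOG = Logging.getLogger(MyAlgorithm.class);

    // Log an iteration counter under a class-prefixed key, only if statistics logging is enabled.
    void reportIterations(long iterations) {
        if (LOG.isStatistics()) {
            LOG.statistics(new LongStatistic(MyAlgorithm.class.getName() + ".iterations", iterations));
        }
    }
}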

Aggregations

LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic): 44
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 27
ArrayList (java.util.ArrayList): 20
StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic): 19
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 17
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 14
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 14
IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress): 14
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 12
WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore): 11
KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel): 10
NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 9
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 8
EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult): 7
MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup): 7
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 5
MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance): 5
WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore): 4
Logging (de.lmu.ifi.dbs.elki.logging.Logging): 4
Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration): 4