Search in sources :

Example 21 with StringStatistic

use of de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic in project elki by elki-project.

the class EvaluateSilhouette method evaluateClustering.

/**
 * Evaluate a single clustering using the Silhouette coefficient.
 * <p>
 * For every object, {@code a} is the average distance to the other members
 * of its own cluster and {@code b} is the smallest average distance to the
 * members of any other cluster; the object then scores
 * {@code (b - a) / max(a, b)}. Clusters with at most one element and clusters
 * flagged as noise are handled according to the {@code noiseOption} field.
 * <p>
 * Degenerate cases score 0 instead of producing NaN: a one-element cluster
 * that reaches the main computation (only possible with
 * {@code MERGE_NOISE}), and objects for which both {@code a} and {@code b}
 * are 0.
 * <p>
 * Besides returning the mean, this method logs statistics (if enabled) and
 * registers the measure in the result hierarchy of {@code db}.
 *
 * @param db Database; its result hierarchy receives the evaluation result
 * @param rel Data relation; only its size is used, for the noise penalty
 * @param dq Distance query
 * @param c Clustering
 * @return Average silhouette, multiplied by the noise penalty if enabled
 */
public double evaluateClustering(Database db, Relation<O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    MeanVariance msil = new MeanVariance();
    int ignorednoise = 0;
    for (Cluster<?> cluster : clusters) {
        // Note: we treat 1-element clusters the same as noise.
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    // Ignore noise elements, but count them for the penalty.
                    ignorednoise += cluster.size();
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // As suggested in Rousseeuw, we use 0 for singletons.
                    msil.put(0., cluster.size());
                    continue;
                case MERGE_NOISE:
                    // Treat as cluster below
                    break;
            }
        }
        ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs());
        final int size = ids.size();
        if (size <= 1) {
            // Only reachable with MERGE_NOISE. The in-cluster average would be
            // 0 / 0 = NaN and poison the overall mean, so use Rousseeuw's
            // convention of 0 for a singleton instead.
            if (size == 1) {
                msil.put(0.);
            }
            continue;
        }
        // temporary storage: as[j] accumulates the distances from earlier
        // objects to object j, so each in-cluster pair is computed only once.
        double[] as = new double[size];
        DBIDArrayIter it1 = ids.iter(), it2 = ids.iter();
        for (it1.seek(0); it1.valid(); it1.advance()) {
            // a: In-cluster distances
            // Start with the already computed distances to earlier objects
            double a = as[it1.getOffset()];
            for (it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) {
                final double dist = dq.distance(it1, it2);
                a += dist;
                as[it2.getOffset()] += dist;
            }
            // Average over the other members; size > 1 is guaranteed here.
            a /= (size - 1);
            // b: minimum average distance to other clusters:
            double b = Double.POSITIVE_INFINITY;
            for (Cluster<?> ocluster : clusters) {
                // Yes, reference identity.
                if (ocluster == cluster) {
                    // Same cluster
                    continue;
                }
                if (ocluster.size() <= 1 || ocluster.isNoise()) {
                    switch(noiseOption) {
                        case IGNORE_NOISE:
                            // Ignore noise elements
                            continue;
                        case TREAT_NOISE_AS_SINGLETONS:
                            // Treat noise cluster as singletons:
                            for (DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) {
                                final double dist = dq.distance(it1, it3);
                                // Minimum average
                                b = dist < b ? dist : b;
                            }
                            continue;
                        case MERGE_NOISE:
                            // Treat as cluster below
                            break;
                    }
                }
                final DBIDs oids = ocluster.getIDs();
                double btmp = 0.;
                for (DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) {
                    btmp += dq.distance(it1, it3);
                }
                // Average; for an empty cluster this is NaN, which the
                // comparison below rejects.
                btmp /= oids.size();
                // Minimum average
                b = btmp < b ? btmp : b;
            }
            // One cluster only?
            b = b < Double.POSITIVE_INFINITY ? b : a;
            final double max = b > a ? b : a;
            // Avoid 0 / 0 = NaN when a and b are both 0; a NaN from the
            // distance function itself is still propagated.
            msil.put(max != 0. ? (b - a) / max : 0.);
        }
    }
    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
        // Fraction of objects that were not ignored.
        penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meansil = penalty * msil.getMean();
    final double stdsil = penalty * msil.getSampleStddev();
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".silhouette.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".silhouette.noise", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".silhouette.mean", meansil));
        LOG.statistics(new DoubleStatistic(key + ".silhouette.stddev", stdsil));
    }
    // Attach the measure to the clustering in the result hierarchy.
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Silhouette +-" + FormatUtil.NF2.format(stdsil), meansil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meansil;
}
Also used : ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)

Example 22 with StringStatistic

use of de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic in project elki by elki-project.

the class EvaluateSimplifiedSilhouette method evaluateClustering.

/**
 * Evaluate a single clustering using the simplified Silhouette coefficient.
 * <p>
 * For every object, {@code a} is the distance to the centroid of its own
 * cluster and {@code min} is the smallest distance to the centroid of any
 * other cluster; the object then scores {@code (min - a) / max(a, min)}.
 * Clusters with at most one element always score 0; noise clusters are
 * handled according to the {@code noiseOption} field.
 * <p>
 * Objects for which both {@code a} and {@code min} are 0 score 0 instead of
 * producing NaN. This happens, for example, when there is only one real
 * cluster and the object lies exactly on its centroid.
 * <p>
 * Besides returning the mean, this method logs statistics (if enabled) and
 * registers the measure in the result hierarchy of {@code db}.
 *
 * @param db Database; its result hierarchy receives the evaluation result
 * @param rel Data relation
 * @param c Clustering
 * @return Mean simplified silhouette, multiplied by the noise penalty if
 *         enabled
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    // Fills centroids (indexed like clusters) and returns the number of
    // ignored noise objects. NOTE(review): a null entry is treated as
    // "noise" below; confirm against the centroids(...) helper.
    int ignorednoise = centroids(rel, clusters, centroids, noiseOption);
    MeanVariance mssil = new MeanVariance();
    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
        Cluster<?> cluster = ci.next();
        if (cluster.size() <= 1) {
            // As suggested in Rousseeuw, we use 0 for singletons.
            mssil.put(0., cluster.size());
            continue;
        }
        if (cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    // Ignore elements
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // As suggested in Rousseeuw, we use 0 for singletons.
                    mssil.put(0., cluster.size());
                    continue;
                case MERGE_NOISE:
                    // Treat as cluster below
                    break;
            }
        }
        // Cluster center:
        final NumberVector center = centroids[i];
        assert (center != null);
        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
            NumberVector obj = rel.get(it);
            // a: Distance to own centroid
            double a = distance.distance(center, obj);
            // b: Distance to other clusters centroids:
            double min = Double.POSITIVE_INFINITY;
            Iterator<? extends Cluster<?>> cj = clusters.iterator();
            for (int j = 0; cj.hasNext(); j++) {
                Cluster<?> ocluster = cj.next();
                if (i == j) {
                    // Same cluster
                    continue;
                }
                NumberVector other = centroids[j];
                if (other == null) {
                    // Noise!
                    switch(noiseOption) {
                        case IGNORE_NOISE:
                            continue;
                        case TREAT_NOISE_AS_SINGLETONS:
                            // Treat each object like a centroid!
                            for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
                                double dist = distance.distance(rel.get(it2), obj);
                                min = dist < min ? dist : min;
                            }
                            continue;
                        case MERGE_NOISE:
                            // Treat as cluster below, but should not be reachable.
                            break;
                    }
                }
                // Clusters: use centroid.
                double dist = distance.distance(other, obj);
                min = dist < min ? dist : min;
            }
            // One 'real' cluster only?
            min = min < Double.POSITIVE_INFINITY ? min : a;
            final double max = min > a ? min : a;
            // Avoid 0 / 0 = NaN when a and min are both 0; a NaN from the
            // distance function itself is still propagated.
            mssil.put(max != 0. ? (min - a) / max : 0.);
        }
    }
    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
        // Fraction of objects that were not ignored.
        penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meanssil = penalty * mssil.getMean();
    final double stdssil = penalty * mssil.getSampleStddev();
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
        LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
    }
    // Attach the measure to the clustering in the result hierarchy.
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meanssil;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Aggregations

StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)22 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)19 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)17 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)14 ArrayList (java.util.ArrayList)13 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)12 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)11 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)10 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)10 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)10 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)8 EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult)7 MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup)7 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)5 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)2 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)2 MedoidModel (de.lmu.ifi.dbs.elki.data.model.MedoidModel)2 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)2 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)2 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)2