
Example 61 with NumberVector

Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

The class EvaluateConcordantPairs, method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return Gamma index
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    int ignorednoise = 0, withinPairs = 0;
    for (Cluster<?> cluster : clusters) {
        if ((cluster.size() <= 1 || cluster.isNoise())) {
            switch(noiseHandling) {
                case IGNORE_NOISE:
                    ignorednoise += cluster.size();
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // No concordant distances.
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster below.
                    break;
            }
        }
        withinPairs += (cluster.size() * (cluster.size() - 1)) >>> 1;
        if (withinPairs < 0) {
            throw new AbortException("Integer overflow - clusters too large to compute pairwise distances.");
        }
    }
    // Materialize within-cluster distances (sorted):
    double[] withinDistances = computeWithinDistances(rel, clusters, withinPairs);
    int[] withinTies = new int[withinDistances.length];
    // Count ties within
    countTies(withinDistances, withinTies);
    long concordantPairs = 0, discordantPairs = 0, betweenPairs = 0;
    // Step two, compute discordant distances:
    for (int i = 0; i < clusters.size(); i++) {
        Cluster<?> ocluster1 = clusters.get(i);
        if ((ocluster1.size() <= 1 || ocluster1.isNoise()) && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
            continue;
        }
        for (int j = i + 1; j < clusters.size(); j++) {
            Cluster<?> ocluster2 = clusters.get(j);
            if ((ocluster2.size() <= 1 || ocluster2.isNoise()) && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
                continue;
            }
            betweenPairs += ocluster1.size() * (long) ocluster2.size();
            for (DBIDIter oit1 = ocluster1.getIDs().iter(); oit1.valid(); oit1.advance()) {
                NumberVector obj = rel.get(oit1);
                for (DBIDIter oit2 = ocluster2.getIDs().iter(); oit2.valid(); oit2.advance()) {
                    double dist = distanceFunction.distance(obj, rel.get(oit2));
                    int p = Arrays.binarySearch(withinDistances, dist);
                    if (p >= 0) {
                        // Tied distances:
                        while (p > 0 && withinDistances[p - 1] >= dist) {
                            --p;
                        }
                        concordantPairs += p;
                        discordantPairs += withinDistances.length - p - withinTies[p];
                        continue;
                    }
                    p = -p - 1;
                    concordantPairs += p;
                    discordantPairs += withinDistances.length - p;
                }
            }
        }
    }
    // Total number of pairs possible:
    final long t = ((rel.size() - ignorednoise) * (long) (rel.size() - ignorednoise - 1)) >>> 1;
    final long tt = (t * (t - 1)) >>> 1;
    double gamma = (concordantPairs - discordantPairs) / (double) (concordantPairs + discordantPairs);
    double tau = computeTau(concordantPairs, discordantPairs, tt, withinDistances.length, betweenPairs);
    // Avoid NaN when everything is in a single cluster:
    gamma = gamma > 0. ? gamma : 0.;
    tau = tau > 0. ? tau : 0.;
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".gamma", gamma));
        LOG.statistics(new DoubleStatistic(key + ".tau", tau));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Concordance-based Evaluation");
    g.addMeasure("Gamma", gamma, -1., 1., 0., false);
    g.addMeasure("Tau", tau, -1., +1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return gamma;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)
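
The counting step above is easiest to follow in isolation: once the within-cluster distances are sorted, every between-cluster distance can be ranked with a single binary search, where everything to the left is concordant and everything to the right (minus ties) is discordant. Below is a minimal, self-contained sketch of that idea in plain Java with made-up distance values and no ELKI types; ties are counted inline instead of using a precomputed tie array, so this illustrates the technique rather than the project's code.

import java.util.Arrays;

public class ConcordanceSketch {
    public static void main(String[] args) {
        // Hypothetical sorted within-cluster distances:
        double[] within = { 0.5, 1.0, 1.0, 2.0 };
        // Hypothetical between-cluster distances:
        double[] between = { 0.7, 1.0, 3.0 };
        long concordant = 0, discordant = 0;
        for (double dist : between) {
            int p = Arrays.binarySearch(within, dist);
            if (p >= 0) {
                // Exact tie: step back to the first equal entry, then count the ties.
                while (p > 0 && within[p - 1] == dist) {
                    --p;
                }
                int ties = 0;
                while (p + ties < within.length && within[p + ties] == dist) {
                    ++ties;
                }
                concordant += p; // strictly smaller within-distances
                discordant += within.length - p - ties; // strictly larger within-distances
                continue;
            }
            p = -p - 1; // insertion point = number of smaller within-distances
            concordant += p;
            discordant += within.length - p;
        }
        // Expected output: concordant=6 discordant=4
        System.out.println("concordant=" + concordant + " discordant=" + discordant);
    }
}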

Example 62 with NumberVector

Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

The class EvaluateDaviesBouldin, method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return DB-index
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    int noisecount = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseOption);
    double[] withinGroupDistance = withinGroupDistances(rel, clusters, centroids);
    Mean daviesBouldin = new Mean();
    for (int i = 0; i < clusters.size(); i++) {
        final NumberVector centroid = centroids[i];
        final double withinGroupDistancei = withinGroupDistance[i];
        // maximum within-to-between cluster spread
        double max = 0;
        for (int j = 0; j < clusters.size(); j++) {
            NumberVector ocentroid = centroids[j];
            if (ocentroid == centroid) {
                continue;
            }
            // Both are real clusters:
            if (centroid != null && ocentroid != null) {
                // bD = between group distance
                double bD = distanceFunction.distance(centroid, ocentroid);
                // d = within-to-between cluster spread
                double d = (withinGroupDistancei + withinGroupDistance[j]) / bD;
                max = d > max ? d : max;
            } else if (noiseOption != NoiseHandling.IGNORE_NOISE) {
                if (centroid != null) {
                    double d = Double.POSITIVE_INFINITY;
                    // Find the closest element
                    for (DBIDIter it = clusters.get(j).getIDs().iter(); it.valid(); it.advance()) {
                        double d2 = distanceFunction.distance(centroid, rel.get(it));
                        d = d2 < d ? d2 : d;
                    }
                    d = withinGroupDistancei / d;
                    max = d > max ? d : max;
                } else if (ocentroid != null) {
                    double d = Double.POSITIVE_INFINITY;
                    // Find the closest element
                    for (DBIDIter it = clusters.get(i).getIDs().iter(); it.valid(); it.advance()) {
                        double d2 = distanceFunction.distance(rel.get(it), ocentroid);
                        d = d2 < d ? d2 : d;
                    }
                    d = withinGroupDistance[j] / d;
                    max = d > max ? d : max;
                }
            // else: (0+0) / d = 0.
            }
        }
        daviesBouldin.put(max);
    }
    // For a single cluster, we return 2 (result for equidistant points)
    final double daviesBouldinMean = daviesBouldin.getCount() > 1 ? daviesBouldin.getMean() : 2.;
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".db-index.noise-handling", noiseOption.toString()));
        if (noisecount > 0) {
            LOG.statistics(new LongStatistic(key + ".db-index.ignored", noisecount));
        }
        LOG.statistics(new DoubleStatistic(key + ".db-index", daviesBouldinMean));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Davies Bouldin Index", daviesBouldinMean, 0., Double.POSITIVE_INFINITY, 0., true);
    db.getHierarchy().resultChanged(ev);
    return daviesBouldinMean;
}
Also used : Mean(de.lmu.ifi.dbs.elki.math.Mean) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
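
For reference, the quantity being averaged above is the Davies-Bouldin ratio: for each cluster i, the maximum over all other clusters j of (S_i + S_j) / d(c_i, c_j), where S is the mean within-cluster distance and d the distance between centroids; lower values are better. A minimal sketch with hypothetical centroids and scatter values (plain Euclidean distance, no noise handling) illustrates just the formula, not the evaluator's API.

public class DaviesBouldinSketch {
    // Euclidean distance between two centroids.
    static double dist(double[] a, double[] b) {
        double sum = 0;
        for (int k = 0; k < a.length; k++) {
            double d = a[k] - b[k];
            sum += d * d;
        }
        return Math.sqrt(sum);
    }

    public static void main(String[] args) {
        // Hypothetical cluster centroids and mean within-cluster distances:
        double[][] centroids = { { 0, 0 }, { 4, 0 }, { 0, 5 } };
        double[] scatter = { 1.0, 0.8, 1.2 };
        double sum = 0;
        for (int i = 0; i < centroids.length; i++) {
            double max = 0;
            for (int j = 0; j < centroids.length; j++) {
                if (i == j) {
                    continue;
                }
                // Within-to-between spread of the pair (i, j):
                double d = (scatter[i] + scatter[j]) / dist(centroids[i], centroids[j]);
                max = d > max ? d : max;
            }
            sum += max;
        }
        // Davies-Bouldin index: mean of the per-cluster maxima (lower is better).
        System.out.println("DB index: " + sum / centroids.length);
    }
}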

Example 63 with NumberVector

Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

The class EvaluatePBMIndex, method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return PBM
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    int ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseHandling);
    // Build global centroid and cluster count:
    final int dim = RelationUtil.dimensionality(rel);
    Centroid overallCentroid = new Centroid(dim);
    EvaluateVarianceRatioCriteria.globalCentroid(overallCentroid, rel, clusters, centroids, noiseHandling);
    // Maximum distance between centroids:
    double max = 0;
    for (int i = 0; i < centroids.length; i++) {
        if (centroids[i] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) {
            continue;
        }
        for (int j = i + 1; j < centroids.length; j++) {
            if (centroids[j] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) {
                continue;
            }
            if (centroids[i] == null && centroids[j] == null) {
                // Need to compute pairwise distances of noise clusters.
                for (DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) {
                    for (DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) {
                        double dist = distanceFunction.distance(rel.get(iti), rel.get(itj));
                        max = dist > max ? dist : max;
                    }
                }
            } else if (centroids[i] == null) {
                for (DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) {
                    double dist = distanceFunction.distance(rel.get(iti), centroids[j]);
                    max = dist > max ? dist : max;
                }
            } else if (centroids[j] == null) {
                for (DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) {
                    double dist = distanceFunction.distance(centroids[i], rel.get(itj));
                    max = dist > max ? dist : max;
                }
            } else {
                double dist = distanceFunction.distance(centroids[i], centroids[j]);
                max = dist > max ? dist : max;
            }
        }
    }
    // a: Distance to own centroid
    // b: Distance to overall centroid
    double a = 0, b = 0;
    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
        Cluster<?> cluster = ci.next();
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseHandling) {
                case IGNORE_NOISE:
                    // Ignored
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // Singletons: a = 0 by definition.
                    for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
                        b += SquaredEuclideanDistanceFunction.STATIC.distance(overallCentroid, rel.get(it));
                    }
                    // with NEXT cluster.
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster below:
                    break;
            }
        }
        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
            NumberVector obj = rel.get(it);
            a += distanceFunction.distance(centroids[i], obj);
            b += distanceFunction.distance(overallCentroid, obj);
        }
    }
    final double pbm = FastMath.pow((1. / centroids.length) * (b / a) * max, 2.);
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".pbm", pbm));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("PBM-Index", pbm, 0., Double.POSITIVE_INFINITY, 0., false);
    db.getHierarchy().resultChanged(ev);
    return pbm;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
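
The PBM index combines three aggregates that the code above accumulates: the number of clusters K, the ratio of the summed distances to the overall centroid (often written E_1) over the summed distances to the objects' own cluster centroids (E_K), and the maximum distance between cluster centroids (D_K); the index is ((1/K) * (E_1/E_K) * D_K)^2, and larger is better. A tiny sketch with hypothetical, pre-aggregated values named like the variables above:

public class PBMSketch {
    public static void main(String[] args) {
        // Hypothetical aggregates, matching the variables in the code above:
        int k = 3;          // number of clusters (centroids.length)
        double b = 120.0;   // sum of distances to the overall centroid (E_1)
        double a = 30.0;    // sum of distances to the own cluster centroid (E_K)
        double max = 6.5;   // maximum distance between cluster centroids (D_K)
        // PBM index: larger values indicate better separation and compactness.
        double pbm = Math.pow((1. / k) * (b / a) * max, 2.);
        System.out.println("PBM: " + pbm);
    }
}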

Example 64 with NumberVector

Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

The class EvaluateSimplifiedSilhouette, method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return Mean simplified silhouette
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    int ignorednoise = centroids(rel, clusters, centroids, noiseOption);
    MeanVariance mssil = new MeanVariance();
    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
        Cluster<?> cluster = ci.next();
        if (cluster.size() <= 1) {
            // As suggested in Rousseeuw, we use 0 for singletons.
            mssil.put(0., cluster.size());
            continue;
        }
        if (cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    // Ignore elements
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // As suggested in Rousseeuw, we use 0 for singletons.
                    mssil.put(0., cluster.size());
                    continue;
                case MERGE_NOISE:
                    // Treat as cluster below
                    break;
            }
        }
        // Cluster center:
        final NumberVector center = centroids[i];
        assert (center != null);
        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
            NumberVector obj = rel.get(it);
            // a: Distance to own centroid
            double a = distance.distance(center, obj);
            // b: Distance to other clusters centroids:
            double min = Double.POSITIVE_INFINITY;
            Iterator<? extends Cluster<?>> cj = clusters.iterator();
            for (int j = 0; cj.hasNext(); j++) {
                Cluster<?> ocluster = cj.next();
                if (i == j) {
                    continue;
                }
                NumberVector other = centroids[j];
                if (other == null) {
                    // Noise!
                    switch(noiseOption) {
                        case IGNORE_NOISE:
                            continue;
                        case TREAT_NOISE_AS_SINGLETONS:
                            // Treat each object like a centroid!
                            for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
                                double dist = distance.distance(rel.get(it2), obj);
                                min = dist < min ? dist : min;
                            }
                            continue;
                        case MERGE_NOISE:
                            // Treat as cluster below, but should not be reachable.
                            break;
                    }
                }
                // Clusters: use centroid.
                double dist = distance.distance(other, obj);
                min = dist < min ? dist : min;
            }
            // One 'real' cluster only?
            min = min < Double.POSITIVE_INFINITY ? min : a;
            mssil.put((min - a) / (min > a ? min : a));
        }
    }
    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
        penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meanssil = penalty * mssil.getMean();
    final double stdssil = penalty * mssil.getSampleStddev();
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
        LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meanssil;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
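
Per object, the simplified silhouette replaces the pairwise averages of the original silhouette with centroid distances: a is the distance to the object's own centroid, b the smallest distance to any other centroid, and the score is (b - a) / max(a, b). A small self-contained sketch with hypothetical points and centroids (Euclidean distance, no noise handling):

public class SimplifiedSilhouetteSketch {
    // Euclidean distance between two vectors.
    static double dist(double[] a, double[] b) {
        double sum = 0;
        for (int k = 0; k < a.length; k++) {
            double d = a[k] - b[k];
            sum += d * d;
        }
        return Math.sqrt(sum);
    }

    public static void main(String[] args) {
        // Hypothetical centroids and one object assigned to cluster 0:
        double[][] centroids = { { 0, 0 }, { 4, 0 } };
        double[] obj = { 1, 0 };
        int own = 0;
        double a = dist(centroids[own], obj);
        double b = Double.POSITIVE_INFINITY;
        for (int j = 0; j < centroids.length; j++) {
            if (j == own) {
                continue;
            }
            double d = dist(centroids[j], obj);
            b = d < b ? d : b;
        }
        // Simplified silhouette of this object; the evaluator averages this over all objects.
        double s = (b - a) / (b > a ? b : a);
        System.out.println("s(obj) = " + s); // (3 - 1) / 3 = 0.667
    }
}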

Example 65 with NumberVector

Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

The class EvaluateSquaredErrors, method processNewResult.

@Override
public void processNewResult(ResultHierarchy hier, Result result) {
    List<Clustering<?>> crs = Clustering.getClusteringResults(result);
    if (crs.isEmpty()) {
        return;
    }
    Database db = ResultUtil.findDatabase(hier);
    Relation<NumberVector> rel = db.getRelation(distance.getInputTypeRestriction());
    for (Clustering<?> c : crs) {
        evaluateClustering(db, rel, c);
    }
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) Database(de.lmu.ifi.dbs.elki.database.Database) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)

Aggregations

NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 85 usages
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 40 usages
ArrayList (java.util.ArrayList): 16 usages
LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic): 9 usages
DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector): 8 usages
MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle): 8 usages
AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException): 8 usages
Database (de.lmu.ifi.dbs.elki.database.Database): 7 usages
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 7 usages
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 7 usages
Random (java.util.Random): 7 usages
Test (org.junit.Test): 7 usages
VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation): 5 usages
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 5 usages
MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance): 5 usages
EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult): 5 usages
MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup): 5 usages
List (java.util.List): 5 usages
SparseNumberVector (de.lmu.ifi.dbs.elki.data.SparseNumberVector): 4 usages
RandomProjectionFamily (de.lmu.ifi.dbs.elki.data.projection.random.RandomProjectionFamily): 4 usages