Search in sources :

Example 1 with DoubleMaxHeap

use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap in project elki by elki-project.

In class DistanceQuantileSampler, method run:

/**
 * Run the distance quantile sampler.
 *
 * Draws random pairs of distinct positions from the relation, computes their
 * distances with the configured distance function, and reports a single
 * distance value taken from the top of a size-bounded heap of the samples.
 *
 * @param database Database (not referenced by this method)
 * @param rel Relation to draw the object pairs from
 * @return Distances sample: a result with one row holding one distance value
 * @throws IllegalArgumentException if samples are requested but the relation
 *         contains fewer than two objects
 */
public CollectionResult<double[]> run(Database database, Relation<O> rel) {
    DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
    int size = rel.size();
    long pairs = (size * (long) size) >> 1;
    // A sampling value of at most 1 is a fraction of the pairs; larger values
    // are used as an absolute number of samples.
    final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
    if (ssize > Integer.MAX_VALUE) {
        throw new AbortException("Sampling size too large.");
    }
    // Drawing a pair needs two different positions. Without this guard the
    // failure came from Random.nextInt with an unhelpful "bound" message.
    if (ssize > 0 && size < 2) {
        throw new IllegalArgumentException("Cannot sample pairwise distances: relation contains only " + size + " object(s).");
    }
    final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);
    DoubleMaxHeap heap = new DoubleMaxHeap(qsize);
    ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
    DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
    Random r = rand.getSingleThreadedRandom();
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
    for (long i = 0; i < ssize; i++) {
        // x is in [1, size) and y in [0, x), so the two positions always differ.
        int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
        double dist = dq.distance(i1.seek(x), i2.seek(y));
        // Count every drawn pair, including skipped ones, so that the progress
        // reaches the total (ssize) it was created with.
        LOG.incrementProcessed(prog);
        // Skip NaN, and/or zeros.
        if (dist != dist || (nozeros && dist < Double.MIN_NORMAL)) {
            continue;
        }
        // Presumably keeps at most qsize values (the smallest seen), which
        // makes the heap top the quantile estimate - confirm in DoubleMaxHeap.
        heap.add(dist, qsize);
    }
    LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
    LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
    // NOTE(review): if every sample was skipped (or ssize is 0) the heap is
    // empty here; verify what DoubleMaxHeap.peek() yields in that case.
    LOG.statistics(new DoubleStatistic(PREFIX + ".distance", heap.peek()));
    LOG.ensureCompleted(prog);
    Collection<String> header = Arrays.asList(new String[] { "Distance" });
    Collection<double[]> data = Arrays.asList(new double[][] { new double[] { heap.peek() } });
    return new CollectionResult<double[]>("Distances sample", "distance-sample", data, header);
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) DoubleMaxHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) CollectionResult(de.lmu.ifi.dbs.elki.result.CollectionResult) Random(java.util.Random) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 2 with DoubleMaxHeap

use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap in project elki by elki-project.

In class EvaluateCIndex, method evaluateClustering:

/**
 * Evaluate a single clustering.
 *
 * Computes the C-Index as (theta - min) / (max - min), where theta is the sum
 * returned by the per-cluster processing, and min / max are the sums of the
 * values collected in the two bounded heaps. The result is also logged as a
 * statistic and attached to the clustering as an evaluation measure.
 *
 * @param db Database
 * @param rel Data relation
 * @param dq Distance query used to compute the distances
 * @param c Clustering
 * @return C-Index
 * @throws ArithmeticException if the number of within-cluster distances does
 *         not fit into an int
 */
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    // Count ignored noise, and within-cluster distances
    int ignorednoise = 0;
    // Counted as long: the int product size * (size - 1) silently wraps for
    // clusters with more than 65536 members, which yields a wrong heap size.
    long withinPairs = 0L;
    for (Cluster<?> cluster : clusters) {
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    ignorednoise += cluster.size();
                    // Ignore
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // No within-cluster distances!
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster
                    break;
                default:
                    LOG.warning("Unknown noise handling option: " + noiseOption);
            }
        }
        final long n = cluster.size();
        withinPairs += (n * (n - 1)) >>> 1;
        // The count is used as an int heap size below; fail fast instead of
        // continuing with a wrapped-around value.
        if (withinPairs > Integer.MAX_VALUE) {
            throw new ArithmeticException("Too many within-cluster distances for C-Index: " + withinPairs);
        }
    }
    final int w = (int) withinPairs;
    // TODO: for small k=2, and balanced clusters, it may be more efficient to
    // just build a long array with all distances, and select the quantiles.
    // The heaps used below pay off in memory consumption for k > 2
    // Yes, maxDists is supposed to be a min heap, and the other way.
    // Because we want to replace the smallest of the current k-largest
    // distances.
    DoubleHeap maxDists = new DoubleMinHeap(w);
    DoubleHeap minDists = new DoubleMaxHeap(w);
    // Sum of within-cluster distances
    double theta = 0.;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null;
    for (int i = 0; i < clusters.size(); i++) {
        Cluster<?> cluster = clusters.get(i);
        if (cluster.size() <= 1 || cluster.isNoise()) {
            // Unknown options fall through to regular cluster processing; the
            // warning for them was already emitted in the counting loop above.
            switch(noiseOption) {
                case IGNORE_NOISE:
                    LOG.incrementProcessed(prog);
                    // Ignore
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    processSingleton(cluster, rel, dq, maxDists, minDists, w);
                    LOG.incrementProcessed(prog);
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster, below
                    break;
            }
        }
        theta += processCluster(cluster, clusters, i, dq, maxDists, minDists, w);
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Simulate best and worst cases:
    // Sum of largest and smallest
    double min = 0, max = 0;
    assert (minDists.size() == w);
    assert (maxDists.size() == w);
    for (DoubleHeap.UnsortedIter it = minDists.unsortedIter(); it.valid(); it.advance()) {
        min += it.get();
    }
    for (DoubleHeap.UnsortedIter it = maxDists.unsortedIter(); it.valid(); it.advance()) {
        max += it.get();
    }
    assert (max >= min);
    // When max does not exceed min the ratio is undefined; 1 is reported then.
    double cIndex = (max > min) ? (theta - min) / (max - min) : 1.;
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("C-Index", cIndex, 0., 1., 0., true);
    db.getHierarchy().resultChanged(ev);
    return cIndex;
}
Also used : DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DoubleHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleHeap) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DoubleMaxHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Aggregations

FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)2 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)2 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)2 DoubleMaxHeap (de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap)2 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)1 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)1 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)1 CollectionResult (de.lmu.ifi.dbs.elki.result.CollectionResult)1 EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult)1 MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup)1 DoubleHeap (de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleHeap)1 DoubleMinHeap (de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap)1 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)1 Random (java.util.Random)1