Search in sources:

Example 1 with ByLabelOrAllInOneClustering

Use of de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering in the project elki by elki-project.

In the class DistanceStatisticsWithClasses, method run.

/**
 * Builds a histogram of pairwise distances, counting in-cluster and
 * other-cluster distances separately, using the clusters produced by
 * {@link ByLabelOrAllInOneClustering}.
 *
 * Every bin holds a {@code long[2]}: index 0 counts in-cluster distances,
 * index 1 counts other-cluster distances. The result rows contain the bin
 * center and four normalized frequencies (see the "Column description"
 * header added to the result).
 *
 * @param database Database to process
 * @return Histogram result with per-bin frequencies and summary headers
 */
@Override
public HistogramResult run(Database database) {
    final Relation<O> relation = database.getRelation(getInputTypeRestriction()[0]);
    final DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    final StepProgress stepprog = LOG.isVerbose() ? new StepProgress("Distance statistics", 2) : null;
    // determine binning ranges.
    DoubleMinMax gminmax = new DoubleMinMax();
    // Cluster by labels
    Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
    // global in-cluster min/max
    DoubleMinMax giminmax = new DoubleMinMax();
    // global other-cluster min/max
    DoubleMinMax gominmax = new DoubleMinMax();
    // in-cluster distances
    MeanVariance mimin = new MeanVariance();
    MeanVariance mimax = new MeanVariance();
    MeanVariance midif = new MeanVariance();
    // other-cluster distances
    MeanVariance momin = new MeanVariance();
    MeanVariance momax = new MeanVariance();
    MeanVariance modif = new MeanVariance();
    // Histogram
    final ObjHistogram<long[]> histogram;
    LOG.beginStep(stepprog, 1, "Prepare histogram.");
    if (exact) {
        gminmax = exactMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else if (sampling) {
        gminmax = sampleMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else {
        // No range known in advance: use a dynamic histogram of two-element
        // counters.
        histogram = new AbstractObjDynamicHistogram<long[]>(numbin) {

            // Sum the two counters of all non-null entries in [start, end).
            @Override
            protected long[] downsample(Object[] data, int start, int end, int size) {
                long[] ret = new long[2];
                for (int i = start; i < end; i++) {
                    long[] existing = (long[]) data[i];
                    if (existing != null) {
                        for (int c = 0; c < 2; c++) {
                            ret[c] += existing[c];
                        }
                    }
                }
                return ret;
            }

            // Add the second counter pair onto the first, in place.
            @Override
            protected long[] aggregate(long[] first, long[] second) {
                for (int c = 0; c < 2; c++) {
                    first[c] += second[c];
                }
                return first;
            }

            @Override
            protected long[] cloneForCache(long[] data) {
                return data.clone();
            }

            @Override
            protected long[] makeObject() {
                return new long[2];
            }
        };
    }
    LOG.beginStep(stepprog, 2, "Build histogram.");
    final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), LOG) : null;
    // iterate per cluster
    // Increments for the in-cluster (index 0) and other-cluster (index 1)
    // counters.
    final long[] incFirst = new long[] { 1L, 0L };
    final long[] incSecond = new long[] { 0L, 1L };
    for (Cluster<?> c1 : split) {
        for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) {
            // in-cluster distances
            DoubleMinMax iminmax = new DoubleMinMax();
            for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) {
                // skip the point itself.
                if (DBIDUtil.equal(id1, iter2)) {
                    continue;
                }
                double d = distFunc.distance(id1, iter2);
                histogram.putData(d, incFirst);
                iminmax.put(d);
            }
            // aggregate
            // NOTE(review): for a singleton cluster iminmax has seen no value
            // here, so these statistics receive whatever an empty DoubleMinMax
            // reports - confirm whether such objects should be skipped.
            mimin.put(iminmax.getMin());
            mimax.put(iminmax.getMax());
            midif.put(iminmax.getDiff());
            // min/max: merge the range as a whole rather than feeding its
            // endpoints in one by one, so that a range that has seen no value
            // does not leak its initial endpoints into the global range.
            giminmax.put(iminmax);
            // other-cluster distances
            DoubleMinMax ominmax = new DoubleMinMax();
            for (Cluster<?> c2 : split) {
                if (c2 == c1) {
                    continue;
                }
                for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) {
                    // skip the point itself (shouldn't happen though)
                    if (DBIDUtil.equal(id1, iter2)) {
                        continue;
                    }
                    double d = distFunc.distance(id1, iter2);
                    histogram.putData(d, incSecond);
                    ominmax.put(d);
                }
            }
            // aggregate
            // NOTE(review): with a single cluster ominmax has seen no value
            // (same caveat as for the in-cluster statistics above).
            momin.put(ominmax.getMin());
            momax.put(ominmax.getMax());
            modif.put(ominmax.getDiff());
            // min/max: merged as a whole, see the in-cluster case.
            gominmax.put(ominmax);
            LOG.incrementProcessed(progress);
        }
    }
    LOG.ensureCompleted(progress);
    // Update values (needed whenever the range was not computed exactly).
    // Both the in-cluster and the other-cluster ranges contribute to the
    // absolute minimum / maximum distance.
    gminmax.put(giminmax);
    gminmax.put(gominmax);
    LOG.setCompleted(stepprog);
    // count the number of samples we have in the data
    long inum = 0;
    long onum = 0;
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        inum += iter.getValue()[0];
        onum += iter.getValue()[1];
    }
    long bnum = inum + onum;
    Collection<double[]> binstat = new ArrayList<>(numbin);
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        final long[] value = iter.getValue();
        final double binsize = histogram.getBinsize();
        // "only" frequencies are relative to the counts of the same kind,
        // "all" frequencies are relative to the total count; a zero
        // denominator yields 0 instead of NaN.
        final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / binsize;
        final double icaf = (bnum == 0) ? 0 : ((double) value[0]) / bnum / binsize;
        final double ocof = (onum == 0) ? 0 : ((double) value[1]) / onum / binsize;
        final double ocaf = (bnum == 0) ? 0 : ((double) value[1]) / bnum / binsize;
        binstat.add(new double[] { iter.getCenter(), icof, icaf, ocof, ocaf });
    }
    HistogramResult result = new HistogramResult("Distance Histogram", "distance-histogram", binstat);
    result.addHeader("Absolute minimum distance (abs): " + gminmax.getMin());
    result.addHeader("Absolute maximum distance (abs): " + gminmax.getMax());
    result.addHeader("In-Cluster minimum distance (abs, avg, stddev): " + giminmax.getMin() + " " + mimin.getMean() + " " + mimin.getSampleStddev());
    result.addHeader("In-Cluster maximum distance (abs, avg, stddev): " + giminmax.getMax() + " " + mimax.getMean() + " " + mimax.getSampleStddev());
    result.addHeader("Other-Cluster minimum distance (abs, avg, stddev): " + gominmax.getMin() + " " + momin.getMean() + " " + momin.getSampleStddev());
    result.addHeader("Other-Cluster maximum distance (abs, avg, stddev): " + gominmax.getMax() + " " + momax.getMean() + " " + momax.getSampleStddev());
    result.addHeader("Column description: bin center, in-cluster only frequency, in-cluster all frequency, other-cluster only frequency, other cluster all frequency");
    result.addHeader("In-cluster value count: " + inum + " other cluster value count: " + onum);
    return result;
}
Also used : ObjHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram) HistogramResult(de.lmu.ifi.dbs.elki.result.HistogramResult) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) AbstractObjDynamicHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) ByLabelOrAllInOneClustering(de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) LongArrayStaticHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.LongArrayStaticHistogram) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax)

Example 2 with ByLabelOrAllInOneClustering

Use of de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering in the project elki by elki-project.

In the class RankingQualityHistogram, method run.

/**
 * Process a database: for every object, evaluate how well its nearest
 * neighbor ranking agrees with the cluster it belongs to, and collect the
 * scores in a histogram over [0, 1].
 *
 * Clusters are obtained from {@link ByLabelOrAllInOneClustering}. Each object
 * contributes a weight of {@code 1 / relation.size()} to the bin of its
 * score.
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Histogram of ranking qualities
 */
public HistogramResult run(Database database, Relation<O> relation) {
    // The relation size is used as neighborhood size and for weighting.
    final int size = relation.size();
    final DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<O> knnQuery = database.getKNNQuery(distanceQuery, size);
    if (LOG.isVerbose()) {
        LOG.verbose("Preprocessing clusters...");
    }
    // Cluster by labels
    Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
    DoubleStaticHistogram hist = new DoubleStaticHistogram(numbins, 0.0, 1.0);
    if (LOG.isVerbose()) {
        LOG.verbose("Processing points...");
    }
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", size, LOG) : null;
    ROCEvaluation roc = new ROCEvaluation();
    MeanVariance mv = new MeanVariance();
    // Weight contributed by a single object to its histogram bin.
    final double weight = 1. / size;
    // sort neighbors
    for (Cluster<?> clus : split) {
        for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
            // Request all objects as neighbors to obtain a complete ranking.
            KNNList knn = knnQuery.getKNNForDBID(iter, size);
            double result = EvaluateClustering.evaluateRanking(roc, clus, knn);
            mv.put(result);
            hist.increment(result, weight);
            LOG.incrementProcessed(progress);
        }
    }
    LOG.ensureCompleted(progress);
    // Transform Histogram into a Double Vector array.
    // One row per histogram bin, so presize by the bin count rather than by
    // the (potentially much larger) relation size.
    Collection<double[]> res = new ArrayList<>(numbins);
    for (DoubleStaticHistogram.Iter iter = hist.iter(); iter.valid(); iter.advance()) {
        res.add(new double[] { iter.getCenter(), iter.getValue() });
    }
    HistogramResult result = new HistogramResult("Ranking Quality Histogram", "ranking-histogram", res);
    result.addHeader("Mean: " + mv.getMean() + " Variance: " + mv.getSampleVariance());
    return result;
}
Also used : ROCEvaluation(de.lmu.ifi.dbs.elki.evaluation.scores.ROCEvaluation) HistogramResult(de.lmu.ifi.dbs.elki.result.HistogramResult) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ArrayList(java.util.ArrayList) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) ByLabelOrAllInOneClustering(de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) DoubleStaticHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.DoubleStaticHistogram)

Example 3 with ByLabelOrAllInOneClustering

Use of de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering in the project elki by elki-project.

In the class EvaluateRankingQuality, method run.

/**
 * Evaluates the nearest neighbor ranking quality of every object, binned by
 * the object's relative position within its cluster.
 *
 * For each cluster from {@link ByLabelOrAllInOneClustering}, the members are
 * ordered by the value {@code mahalanobisDistance} yields for them with
 * respect to the cluster mean and covariance. The ranking score of each
 * member is then recorded at position {@code offset / clusterSize} in a
 * histogram over [0, 1].
 *
 * @param database Database to process
 * @return Histogram result with rows of bin center, count, mean and sample
 *         variance of the ranking scores
 */
@Override
public HistogramResult run(Database database) {
    final Relation<V> relation = database.getRelation(getInputTypeRestriction()[0]);
    // The relation size is used as neighborhood size.
    final int size = relation.size();
    final DistanceQuery<V> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<V> knnQuery = database.getKNNQuery(distQuery, size);
    if (LOG.isVerbose()) {
        LOG.verbose("Preprocessing clusters...");
    }
    // Cluster by labels
    Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
    // Compute cluster averages and covariance matrix
    HashMap<Cluster<?>, double[]> averages = new HashMap<>(split.size());
    HashMap<Cluster<?>, double[][]> covmats = new HashMap<>(split.size());
    for (Cluster<?> clus : split) {
        CovarianceMatrix covmat = CovarianceMatrix.make(relation, clus.getIDs());
        // Fetch the mean first; presumably the covariance object must not be
        // used after destroyToPopulationMatrix() - TODO confirm.
        averages.put(clus, covmat.getMeanVector());
        covmats.put(clus, covmat.destroyToPopulationMatrix());
    }
    MeanVarianceStaticHistogram hist = new MeanVarianceStaticHistogram(numbins, 0.0, 1.0);
    if (LOG.isVerbose()) {
        LOG.verbose("Processing points...");
    }
    FiniteProgress rocloop = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", size, LOG) : null;
    ROCEvaluation roc = new ROCEvaluation();
    // sort neighbors
    for (Cluster<?> clus : split) {
        final int csize = clus.size();
        ModifiableDoubleDBIDList cmem = DBIDUtil.newDistanceDBIDList(csize);
        double[] av = averages.get(clus);
        double[][] covm = covmats.get(clus);
        // Attach to every cluster member its distance to the cluster mean.
        for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
            double d = mahalanobisDistance(covm, relation.get(iter).toArray(), av);
            cmem.add(d, iter);
        }
        cmem.sort();
        for (DBIDArrayIter it = cmem.iter(); it.valid(); it.advance()) {
            // Request all objects as neighbors to obtain a complete ranking.
            KNNList knn = knnQuery.getKNNForDBID(it, size);
            double result = EvaluateClustering.evaluateRanking(roc, clus, knn);
            // Relative position of the member within the sorted cluster.
            hist.put(((double) it.getOffset()) / csize, result);
            LOG.incrementProcessed(rocloop);
        }
    }
    LOG.ensureCompleted(rocloop);
    // Transform Histogram into a Double Vector array.
    // One row per histogram bin, so presize by the bin count rather than by
    // the (potentially much larger) relation size.
    Collection<double[]> res = new ArrayList<>(numbins);
    for (ObjHistogram.Iter<MeanVariance> iter = hist.iter(); iter.valid(); iter.advance()) {
        final MeanVariance bin = iter.getValue();
        res.add(new double[] { iter.getCenter(), bin.getCount(), bin.getMean(), bin.getSampleVariance() });
    }
    return new HistogramResult("Ranking Quality Histogram", "ranking-histogram", res);
}
Also used : ObjHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram) HistogramResult(de.lmu.ifi.dbs.elki.result.HistogramResult) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ROCEvaluation(de.lmu.ifi.dbs.elki.evaluation.scores.ROCEvaluation) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) MeanVarianceStaticHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.MeanVarianceStaticHistogram) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) ByLabelOrAllInOneClustering(de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList)

Aggregations

ByLabelOrAllInOneClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering)3 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)3 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)3 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)3 HistogramResult (de.lmu.ifi.dbs.elki.result.HistogramResult)3 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)2 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)2 ROCEvaluation (de.lmu.ifi.dbs.elki.evaluation.scores.ROCEvaluation)2 ObjHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram)2 ArrayList (java.util.ArrayList)2 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)1 ModifiableDoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList)1 StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress)1 DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax)1 CovarianceMatrix (de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix)1 AbstractObjDynamicHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram)1 DoubleStaticHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.DoubleStaticHistogram)1 LongArrayStaticHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.LongArrayStaticHistogram)1 MeanVarianceStaticHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.MeanVarianceStaticHistogram)1 HashMap (java.util.HashMap)1