Search in sources :

Example 1 with ObjHistogram

use of de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram in project elki by elki-project.

the class DistanceStatisticsWithClasses method run.

@Override
public HistogramResult run(Database database) {
    final Relation<O> relation = database.getRelation(getInputTypeRestriction()[0]);
    final DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    final StepProgress stepprog = LOG.isVerbose() ? new StepProgress("Distance statistics", 2) : null;
    // determine binning ranges.
    DoubleMinMax gminmax = new DoubleMinMax();
    // Cluster by labels
    Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
    // global in-cluster min/max
    DoubleMinMax giminmax = new DoubleMinMax();
    // global other-cluster min/max
    DoubleMinMax gominmax = new DoubleMinMax();
    // in-cluster distances
    MeanVariance mimin = new MeanVariance();
    MeanVariance mimax = new MeanVariance();
    MeanVariance midif = new MeanVariance();
    // other-cluster distances
    MeanVariance momin = new MeanVariance();
    MeanVariance momax = new MeanVariance();
    MeanVariance modif = new MeanVariance();
    // Histogram
    final ObjHistogram<long[]> histogram;
    LOG.beginStep(stepprog, 1, "Prepare histogram.");
    if (exact) {
        gminmax = exactMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else if (sampling) {
        gminmax = sampleMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else {
        histogram = new AbstractObjDynamicHistogram<long[]>(numbin) {

            @Override
            protected long[] downsample(Object[] data, int start, int end, int size) {
                long[] ret = new long[2];
                for (int i = start; i < end; i++) {
                    long[] existing = (long[]) data[i];
                    if (existing != null) {
                        for (int c = 0; c < 2; c++) {
                            ret[c] += existing[c];
                        }
                    }
                }
                return ret;
            }

            @Override
            protected long[] aggregate(long[] first, long[] second) {
                for (int c = 0; c < 2; c++) {
                    first[c] += second[c];
                }
                return first;
            }

            @Override
            protected long[] cloneForCache(long[] data) {
                return data.clone();
            }

            @Override
            protected long[] makeObject() {
                return new long[2];
            }
        };
    }
    LOG.beginStep(stepprog, 2, "Build histogram.");
    final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), LOG) : null;
    // iterate per cluster
    final long[] incFirst = new long[] { 1L, 0L };
    final long[] incSecond = new long[] { 0L, 1L };
    for (Cluster<?> c1 : split) {
        for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) {
            // in-cluster distances
            DoubleMinMax iminmax = new DoubleMinMax();
            for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) {
                // skip the point itself.
                if (DBIDUtil.equal(id1, iter2)) {
                    continue;
                }
                double d = distFunc.distance(id1, iter2);
                histogram.putData(d, incFirst);
                iminmax.put(d);
            }
            // aggregate
            mimin.put(iminmax.getMin());
            mimax.put(iminmax.getMax());
            midif.put(iminmax.getDiff());
            // min/max
            giminmax.put(iminmax.getMin());
            giminmax.put(iminmax.getMax());
            // other-cluster distances
            DoubleMinMax ominmax = new DoubleMinMax();
            for (Cluster<?> c2 : split) {
                if (c2 == c1) {
                    continue;
                }
                for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) {
                    // skip the point itself (shouldn't happen though)
                    if (DBIDUtil.equal(id1, iter2)) {
                        continue;
                    }
                    double d = distFunc.distance(id1, iter2);
                    histogram.putData(d, incSecond);
                    ominmax.put(d);
                }
            }
            // aggregate
            momin.put(ominmax.getMin());
            momax.put(ominmax.getMax());
            modif.put(ominmax.getDiff());
            // min/max
            gominmax.put(ominmax.getMin());
            gominmax.put(ominmax.getMax());
            LOG.incrementProcessed(progress);
        }
    }
    LOG.ensureCompleted(progress);
    // Update values (only needed for sampling case).
    gminmax.put(gominmax);
    LOG.setCompleted(stepprog);
    // count the number of samples we have in the data
    long inum = 0;
    long onum = 0;
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        inum += iter.getValue()[0];
        onum += iter.getValue()[1];
    }
    long bnum = inum + onum;
    Collection<double[]> binstat = new ArrayList<>(numbin);
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        final long[] value = iter.getValue();
        final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / histogram.getBinsize();
        final double icaf = ((double) value[0]) / bnum / histogram.getBinsize();
        final double ocof = (onum == 0) ? 0 : ((double) value[1]) / onum / histogram.getBinsize();
        final double ocaf = ((double) value[1]) / bnum / histogram.getBinsize();
        binstat.add(new double[] { iter.getCenter(), icof, icaf, ocof, ocaf });
    }
    HistogramResult result = new HistogramResult("Distance Histogram", "distance-histogram", binstat);
    result.addHeader("Absolute minimum distance (abs): " + gminmax.getMin());
    result.addHeader("Absolute maximum distance (abs): " + gminmax.getMax());
    result.addHeader("In-Cluster minimum distance (abs, avg, stddev): " + giminmax.getMin() + " " + mimin.getMean() + " " + mimin.getSampleStddev());
    result.addHeader("In-Cluster maximum distance (abs, avg, stddev): " + giminmax.getMax() + " " + mimax.getMean() + " " + mimax.getSampleStddev());
    result.addHeader("Other-Cluster minimum distance (abs, avg, stddev): " + gominmax.getMin() + " " + momin.getMean() + " " + momin.getSampleStddev());
    result.addHeader("Other-Cluster maximum distance (abs, avg, stddev): " + gominmax.getMax() + " " + momax.getMean() + " " + momax.getSampleStddev());
    result.addHeader("Column description: bin center, in-cluster only frequency, in-cluster all frequency, other-cluster only frequency, other cluster all frequency");
    result.addHeader("In-cluster value count: " + inum + " other cluster value count: " + onum);
    return result;
}
Also used : ObjHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram) HistogramResult(de.lmu.ifi.dbs.elki.result.HistogramResult) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) AbstractObjDynamicHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) ByLabelOrAllInOneClustering(de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) LongArrayStaticHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.LongArrayStaticHistogram) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax)

Example 2 with ObjHistogram

use of de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram in project elki by elki-project.

the class ComputeOutlierHistogram method evaluateOutlierResult.

/**
 * Evaluate a single outlier result as histogram.
 *
 * @param database Database to process
 * @param or Outlier result
 * @return Result
 */
public HistogramResult evaluateOutlierResult(Database database, OutlierResult or) {
    if (scaling instanceof OutlierScalingFunction) {
        OutlierScalingFunction oscaling = (OutlierScalingFunction) scaling;
        oscaling.prepare(or);
    }
    ModifiableDBIDs ids = DBIDUtil.newHashSet(or.getScores().getDBIDs());
    DBIDs outlierIds = DatabaseUtil.getObjectsByLabelMatch(database, positiveClassName);
    // first value for outliers, second for each object
    // If we have useful (finite) min/max, use these for binning.
    double min = scaling.getMin();
    double max = scaling.getMax();
    final ObjHistogram<DoubleDoublePair> hist;
    if (Double.isInfinite(min) || Double.isNaN(min) || Double.isInfinite(max) || Double.isNaN(max)) {
        hist = new AbstractObjDynamicHistogram<DoubleDoublePair>(bins) {

            @Override
            public DoubleDoublePair aggregate(DoubleDoublePair first, DoubleDoublePair second) {
                first.first += second.first;
                first.second += second.second;
                return first;
            }

            @Override
            protected DoubleDoublePair makeObject() {
                return new DoubleDoublePair(0., 0.);
            }

            @Override
            protected DoubleDoublePair cloneForCache(DoubleDoublePair data) {
                return new DoubleDoublePair(data.first, data.second);
            }

            @Override
            protected DoubleDoublePair downsample(Object[] data, int start, int end, int size) {
                DoubleDoublePair sum = new DoubleDoublePair(0, 0);
                for (int i = start; i < end; i++) {
                    DoubleDoublePair p = (DoubleDoublePair) data[i];
                    if (p != null) {
                        sum.first += p.first;
                        sum.second += p.second;
                    }
                }
                return sum;
            }
        };
    } else {
        hist = new AbstractObjStaticHistogram<DoubleDoublePair>(bins, min, max) {

            @Override
            protected DoubleDoublePair makeObject() {
                return new DoubleDoublePair(0., 0.);
            }

            @Override
            public void putData(double coord, DoubleDoublePair data) {
                DoubleDoublePair exist = get(coord);
                exist.first += data.first;
                exist.second += data.second;
            }
        };
    }
    // first fill histogram only with values of outliers
    DoubleDoublePair negative, positive;
    if (!splitfreq) {
        negative = new DoubleDoublePair(1. / ids.size(), 0);
        positive = new DoubleDoublePair(0, 1. / ids.size());
    } else {
        negative = new DoubleDoublePair(1. / (ids.size() - outlierIds.size()), 0);
        positive = new DoubleDoublePair(0, 1. / outlierIds.size());
    }
    ids.removeDBIDs(outlierIds);
    // fill histogram with values of each object
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        double result = or.getScores().doubleValue(iter);
        result = scaling.getScaled(result);
        if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
            hist.putData(result, negative);
        }
    }
    for (DBIDIter iter = outlierIds.iter(); iter.valid(); iter.advance()) {
        double result = or.getScores().doubleValue(iter);
        result = scaling.getScaled(result);
        if (result > Double.NEGATIVE_INFINITY && result < Double.POSITIVE_INFINITY) {
            hist.putData(result, positive);
        }
    }
    Collection<double[]> collHist = new ArrayList<>(hist.getNumBins());
    for (ObjHistogram.Iter<DoubleDoublePair> iter = hist.iter(); iter.valid(); iter.advance()) {
        DoubleDoublePair data = iter.getValue();
        collHist.add(new double[] { iter.getCenter(), data.first, data.second });
    }
    return new HistogramResult("Outlier Score Histogram", "outlier-histogram", collHist);
}
Also used : ObjHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram) HistogramResult(de.lmu.ifi.dbs.elki.result.HistogramResult) OutlierScalingFunction(de.lmu.ifi.dbs.elki.utilities.scaling.outlier.OutlierScalingFunction) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleDoublePair(de.lmu.ifi.dbs.elki.utilities.pairs.DoubleDoublePair) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Aggregations

HistogramResult (de.lmu.ifi.dbs.elki.result.HistogramResult)2 ObjHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram)2 ByLabelOrAllInOneClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering)1 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)1 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)1 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)1 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)1 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)1 StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress)1 DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax)1 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)1 AbstractObjDynamicHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram)1 LongArrayStaticHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.LongArrayStaticHistogram)1 DoubleDoublePair (de.lmu.ifi.dbs.elki.utilities.pairs.DoubleDoublePair)1 OutlierScalingFunction (de.lmu.ifi.dbs.elki.utilities.scaling.outlier.OutlierScalingFunction)1 ArrayList (java.util.ArrayList)1