Search in sources :

Example 11 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class CTLuZTestOutlier method run.

/**
 * Main method.
 *
 * @param database Database
 * @param nrel Neighborhood relation
 * @param relation Data relation (1d!)
 * @return Outlier detection result
 */
public OutlierResult run(Database database, Relation<N> nrel, Relation<? extends NumberVector> relation) {
    final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, nrel);
    WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    MeanVariance zmv = new MeanVariance();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        DBIDs neighbors = npred.getNeighborDBIDs(iditer);
        // Compute Mean of neighborhood
        Mean localmean = new Mean();
        for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
            if (DBIDUtil.equal(iditer, iter)) {
                continue;
            }
            localmean.put(relation.get(iter).doubleValue(0));
        }
        final double localdiff;
        if (localmean.getCount() > 0) {
            localdiff = relation.get(iditer).doubleValue(0) - localmean.getMean();
        } else {
            localdiff = 0.0;
        }
        scores.putDouble(iditer, localdiff);
        zmv.put(localdiff);
    }
    // Normalize scores using mean and variance
    DoubleMinMax minmax = new DoubleMinMax();
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        double score = Math.abs(scores.doubleValue(iditer) - zmv.getMean()) / zmv.getSampleStddev();
        minmax.put(score);
        scores.putDouble(iditer, score);
    }
    // Wrap result
    DoubleRelation scoreResult = new MaterializedDoubleRelation("ZTest", "Z Test score", scores, relation.getDBIDs());
    OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0);
    OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
    or.addChildResult(npred);
    return or;
}
Also used : Mean(de.lmu.ifi.dbs.elki.math.Mean) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) NeighborSetPredicate(de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.NeighborSetPredicate) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) BasicOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) BasicOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)

Example 12 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class SpatialApproximationMaterializeKNNPreprocessor method preprocess.

@Override
protected void preprocess() {
    DistanceQuery<O> distanceQuery = relation.getDistanceQuery(distanceFunction);
    SpatialIndexTree<N, E> index = getSpatialIndex(relation);
    storage = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, KNNList.class);
    MeanVariance pagesize = new MeanVariance();
    MeanVariance ksize = new MeanVariance();
    final Logging log = getLogger();
    if (log.isVerbose()) {
        log.verbose("Approximating nearest neighbor lists to database objects");
    }
    List<E> leaves = index.getLeaves();
    FiniteProgress progress = log.isVerbose() ? new FiniteProgress("Processing leaf nodes", leaves.size(), log) : null;
    for (E leaf : leaves) {
        N node = index.getNode(leaf);
        int size = node.getNumEntries();
        pagesize.put(size);
        if (log.isDebuggingFinest()) {
            log.debugFinest("NumEntires = " + size);
        }
        // Collect the ids in this node.
        ArrayModifiableDBIDs ids = DBIDUtil.newArray(size);
        for (int i = 0; i < size; i++) {
            ids.add(((LeafEntry) node.getEntry(i)).getDBID());
        }
        Object2DoubleOpenHashMap<DBIDPair> cache = new Object2DoubleOpenHashMap<>((size * size * 3) >> 3);
        for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
            KNNHeap kNN = DBIDUtil.newHeap(k);
            for (DBIDIter id2 = ids.iter(); id2.valid(); id2.advance()) {
                DBIDPair key = DBIDUtil.newPair(id, id2);
                double d = cache.removeDouble(key);
                if (d == d) {
                    // Not NaN
                    // consume the previous result.
                    kNN.insert(d, id2);
                } else {
                    // compute new and store the previous result.
                    d = distanceQuery.distance(id, id2);
                    kNN.insert(d, id2);
                    // put it into the cache, but with the keys reversed
                    key = DBIDUtil.newPair(id2, id);
                    cache.put(key, d);
                }
            }
            ksize.put(kNN.size());
            storage.put(id, kNN.toKNNList());
        }
        if (log.isDebugging() && cache.size() > 0) {
            log.warning("Cache should be empty after each run, but still has " + cache.size() + " elements.");
        }
        log.incrementProcessed(progress);
    }
    log.ensureCompleted(progress);
    if (log.isVerbose()) {
        log.verbose("Average page size = " + pagesize.getMean() + " +- " + pagesize.getSampleStddev());
        log.verbose("On average, " + ksize.getMean() + " +- " + ksize.getSampleStddev() + " neighbors returned.");
    }
}
Also used : Logging(de.lmu.ifi.dbs.elki.logging.Logging) Object2DoubleOpenHashMap(it.unimi.dsi.fastutil.objects.Object2DoubleOpenHashMap) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance)

Example 13 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class PerplexityAffinityMatrixBuilder method computePij.

/**
 * Compute the pij from the distance matrix.
 *
 * @param dist Distance matrix.
 * @param perplexity Desired perplexity
 * @param initialScale Initial scale
 * @return Affinity matrix pij
 */
protected static double[][] computePij(double[][] dist, double perplexity, double initialScale) {
    final int size = dist.length;
    final double logPerp = FastMath.log(perplexity);
    double[][] pij = new double[size][size];
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Optimizing perplexities", size, LOG) : null;
    Duration timer = LOG.isStatistics() ? LOG.newDuration(PerplexityAffinityMatrixBuilder.class.getName() + ".runtime.pijmatrix").begin() : null;
    MeanVariance mv = LOG.isStatistics() ? new MeanVariance() : null;
    for (int i = 0; i < size; i++) {
        double beta = computePi(i, dist[i], pij[i], perplexity, logPerp);
        if (mv != null) {
            // Sigma
            mv.put(beta > 0 ? FastMath.sqrt(.5 / beta) : 0.);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    if (LOG.isStatistics()) {
        // timer != null, mv != null
        LOG.statistics(timer.end());
        LOG.statistics(new DoubleStatistic(PerplexityAffinityMatrixBuilder.class.getName() + ".sigma.average", mv.getMean()));
        LOG.statistics(new DoubleStatistic(PerplexityAffinityMatrixBuilder.class.getName() + ".sigma.stddev", mv.getSampleStddev()));
    }
    // Scale pij to have the desired sum EARLY_EXAGGERATION
    double sum = 0.;
    for (int i = 1; i < size; i++) {
        final double[] pij_i = pij[i];
        for (int j = 0; j < i; j++) {
            // Nur über halbe Matrix!
            // Symmetrie herstellen
            sum += (pij_i[j] += pij[j][i]);
        }
    }
    // Scaling taken from original tSNE code:
    final double scale = initialScale / (2. * sum);
    for (int i = 1; i < size; i++) {
        final double[] pij_i = pij[i];
        for (int j = 0; j < i; j++) {
            pij_i[j] = pij[j][i] = MathUtil.max(pij_i[j] * scale, MIN_PIJ);
        }
    }
    return pij;
}
Also used : DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration)

Example 14 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class DistanceStatisticsWithClasses method run.

@Override
public HistogramResult run(Database database) {
    final Relation<O> relation = database.getRelation(getInputTypeRestriction()[0]);
    final DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    final StepProgress stepprog = LOG.isVerbose() ? new StepProgress("Distance statistics", 2) : null;
    // determine binning ranges.
    DoubleMinMax gminmax = new DoubleMinMax();
    // Cluster by labels
    Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
    // global in-cluster min/max
    DoubleMinMax giminmax = new DoubleMinMax();
    // global other-cluster min/max
    DoubleMinMax gominmax = new DoubleMinMax();
    // in-cluster distances
    MeanVariance mimin = new MeanVariance();
    MeanVariance mimax = new MeanVariance();
    MeanVariance midif = new MeanVariance();
    // other-cluster distances
    MeanVariance momin = new MeanVariance();
    MeanVariance momax = new MeanVariance();
    MeanVariance modif = new MeanVariance();
    // Histogram
    final ObjHistogram<long[]> histogram;
    LOG.beginStep(stepprog, 1, "Prepare histogram.");
    if (exact) {
        gminmax = exactMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else if (sampling) {
        gminmax = sampleMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else {
        histogram = new AbstractObjDynamicHistogram<long[]>(numbin) {

            @Override
            protected long[] downsample(Object[] data, int start, int end, int size) {
                long[] ret = new long[2];
                for (int i = start; i < end; i++) {
                    long[] existing = (long[]) data[i];
                    if (existing != null) {
                        for (int c = 0; c < 2; c++) {
                            ret[c] += existing[c];
                        }
                    }
                }
                return ret;
            }

            @Override
            protected long[] aggregate(long[] first, long[] second) {
                for (int c = 0; c < 2; c++) {
                    first[c] += second[c];
                }
                return first;
            }

            @Override
            protected long[] cloneForCache(long[] data) {
                return data.clone();
            }

            @Override
            protected long[] makeObject() {
                return new long[2];
            }
        };
    }
    LOG.beginStep(stepprog, 2, "Build histogram.");
    final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), LOG) : null;
    // iterate per cluster
    final long[] incFirst = new long[] { 1L, 0L };
    final long[] incSecond = new long[] { 0L, 1L };
    for (Cluster<?> c1 : split) {
        for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) {
            // in-cluster distances
            DoubleMinMax iminmax = new DoubleMinMax();
            for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) {
                // skip the point itself.
                if (DBIDUtil.equal(id1, iter2)) {
                    continue;
                }
                double d = distFunc.distance(id1, iter2);
                histogram.putData(d, incFirst);
                iminmax.put(d);
            }
            // aggregate
            mimin.put(iminmax.getMin());
            mimax.put(iminmax.getMax());
            midif.put(iminmax.getDiff());
            // min/max
            giminmax.put(iminmax.getMin());
            giminmax.put(iminmax.getMax());
            // other-cluster distances
            DoubleMinMax ominmax = new DoubleMinMax();
            for (Cluster<?> c2 : split) {
                if (c2 == c1) {
                    continue;
                }
                for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) {
                    // skip the point itself (shouldn't happen though)
                    if (DBIDUtil.equal(id1, iter2)) {
                        continue;
                    }
                    double d = distFunc.distance(id1, iter2);
                    histogram.putData(d, incSecond);
                    ominmax.put(d);
                }
            }
            // aggregate
            momin.put(ominmax.getMin());
            momax.put(ominmax.getMax());
            modif.put(ominmax.getDiff());
            // min/max
            gominmax.put(ominmax.getMin());
            gominmax.put(ominmax.getMax());
            LOG.incrementProcessed(progress);
        }
    }
    LOG.ensureCompleted(progress);
    // Update values (only needed for sampling case).
    gminmax.put(gominmax);
    LOG.setCompleted(stepprog);
    // count the number of samples we have in the data
    long inum = 0;
    long onum = 0;
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        inum += iter.getValue()[0];
        onum += iter.getValue()[1];
    }
    long bnum = inum + onum;
    Collection<double[]> binstat = new ArrayList<>(numbin);
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        final long[] value = iter.getValue();
        final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / histogram.getBinsize();
        final double icaf = ((double) value[0]) / bnum / histogram.getBinsize();
        final double ocof = (onum == 0) ? 0 : ((double) value[1]) / onum / histogram.getBinsize();
        final double ocaf = ((double) value[1]) / bnum / histogram.getBinsize();
        binstat.add(new double[] { iter.getCenter(), icof, icaf, ocof, ocaf });
    }
    HistogramResult result = new HistogramResult("Distance Histogram", "distance-histogram", binstat);
    result.addHeader("Absolute minimum distance (abs): " + gminmax.getMin());
    result.addHeader("Absolute maximum distance (abs): " + gminmax.getMax());
    result.addHeader("In-Cluster minimum distance (abs, avg, stddev): " + giminmax.getMin() + " " + mimin.getMean() + " " + mimin.getSampleStddev());
    result.addHeader("In-Cluster maximum distance (abs, avg, stddev): " + giminmax.getMax() + " " + mimax.getMean() + " " + mimax.getSampleStddev());
    result.addHeader("Other-Cluster minimum distance (abs, avg, stddev): " + gominmax.getMin() + " " + momin.getMean() + " " + momin.getSampleStddev());
    result.addHeader("Other-Cluster maximum distance (abs, avg, stddev): " + gominmax.getMax() + " " + momax.getMean() + " " + momax.getSampleStddev());
    result.addHeader("Column description: bin center, in-cluster only frequency, in-cluster all frequency, other-cluster only frequency, other cluster all frequency");
    result.addHeader("In-cluster value count: " + inum + " other cluster value count: " + onum);
    return result;
}
Also used : ObjHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram) HistogramResult(de.lmu.ifi.dbs.elki.result.HistogramResult) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) AbstractObjDynamicHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) ByLabelOrAllInOneClustering(de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) LongArrayStaticHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.LongArrayStaticHistogram) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax)

Example 15 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class KNNBenchmarkAlgorithm method run.

/**
 * Run the algorithm.
 *
 * @param database Database
 * @param relation Relation
 * @return Null result
 */
public Result run(Database database, Relation<O> relation) {
    // Get a distance and kNN query instance.
    DistanceQuery<O> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
    KNNQuery<O> knnQuery = database.getKNNQuery(distQuery, k);
    // No query set - use original database.
    if (queries == null) {
        final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), sampling, random);
        FiniteProgress prog = LOG.isVeryVerbose() ? new FiniteProgress("kNN queries", sample.size(), LOG) : null;
        int hash = 0;
        MeanVariance mv = new MeanVariance(), mvdist = new MeanVariance();
        for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
            KNNList knns = knnQuery.getKNNForDBID(iditer, k);
            int ichecksum = 0;
            for (DBIDIter it = knns.iter(); it.valid(); it.advance()) {
                ichecksum += DBIDUtil.asInteger(it);
            }
            hash = Util.mixHashCodes(hash, ichecksum);
            mv.put(knns.size());
            mvdist.put(knns.getKNNDistance());
            LOG.incrementProcessed(prog);
        }
        LOG.ensureCompleted(prog);
        if (LOG.isStatistics()) {
            LOG.statistics("Result hashcode: " + hash);
            LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev());
            if (mvdist.getCount() > 0) {
                LOG.statistics("Mean k-distance: " + mvdist.getMean() + " +- " + mvdist.getNaiveStddev());
            }
        }
    } else {
        // Separate query set.
        TypeInformation res = getDistanceFunction().getInputTypeRestriction();
        MultipleObjectsBundle bundle = queries.loadData();
        int col = -1;
        for (int i = 0; i < bundle.metaLength(); i++) {
            if (res.isAssignableFromType(bundle.meta(i))) {
                col = i;
                break;
            }
        }
        if (col < 0) {
            throw new IncompatibleDataException("No compatible data type in query input was found. Expected: " + res.toString());
        }
        // Random sampling is a bit of hack, sorry.
        // But currently, we don't (yet) have an "integer random sample" function.
        DBIDRange sids = DBIDUtil.generateStaticDBIDRange(bundle.dataLength());
        final DBIDs sample = DBIDUtil.randomSample(sids, sampling, random);
        FiniteProgress prog = LOG.isVeryVerbose() ? new FiniteProgress("kNN queries", sample.size(), LOG) : null;
        int hash = 0;
        MeanVariance mv = new MeanVariance(), mvdist = new MeanVariance();
        for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
            int off = sids.binarySearch(iditer);
            assert (off >= 0);
            @SuppressWarnings("unchecked") O o = (O) bundle.data(off, col);
            KNNList knns = knnQuery.getKNNForObject(o, k);
            int ichecksum = 0;
            for (DBIDIter it = knns.iter(); it.valid(); it.advance()) {
                ichecksum += DBIDUtil.asInteger(it);
            }
            hash = Util.mixHashCodes(hash, ichecksum);
            mv.put(knns.size());
            mvdist.put(knns.getKNNDistance());
            LOG.incrementProcessed(prog);
        }
        LOG.ensureCompleted(prog);
        if (LOG.isStatistics()) {
            LOG.statistics("Result hashcode: " + hash);
            LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev());
            if (mvdist.getCount() > 0) {
                LOG.statistics("Mean k-distance: " + mvdist.getMean() + " +- " + mvdist.getNaiveStddev());
            }
        }
    }
    return null;
}
Also used : DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) TypeInformation(de.lmu.ifi.dbs.elki.data.type.TypeInformation) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) IncompatibleDataException(de.lmu.ifi.dbs.elki.utilities.exceptions.IncompatibleDataException) DBIDRange(de.lmu.ifi.dbs.elki.database.ids.DBIDRange)

Aggregations

MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)61 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)32 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)17 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)17 DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax)15 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)13 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)9 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)9 MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)9 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)9 OutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta)9 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)8 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)8 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)7 Mean (de.lmu.ifi.dbs.elki.math.Mean)7 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)6 DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter)6 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)5 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)5 AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)5