Example 16 with MeanVariance

Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project, from the class GaussianAffinityMatrixBuilder, method computePij.

/**
 * Compute the pij from the distance matrix.
 *
 * @param dist Distance matrix.
 * @param sigma Kernel bandwidth sigma
 * @param initialScale Initial scale
 * @return Affinity matrix pij
 */
protected static double[][] computePij(double[][] dist, double sigma, double initialScale) {
    final int size = dist.length;
    final double msigmasq = -.5 / (sigma * sigma);
    double[][] pij = new double[size][size];
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Computing affinities", size, LOG) : null;
    Duration timer = LOG.isStatistics() ? LOG.newDuration(GaussianAffinityMatrixBuilder.class.getName() + ".runtime.pijmatrix").begin() : null;
    MeanVariance mv = LOG.isStatistics() ? new MeanVariance() : null;
    for (int i = 0; i < size; i++) {
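        // computeH fills row pij[i] and returns the log perplexity (entropy) of that row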
        double logP = computeH(i, dist[i], pij[i], msigmasq);
        if (mv != null) {
            mv.put(FastMath.exp(logP));
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    if (LOG.isStatistics()) {
        // timer != null, mv != null
        LOG.statistics(timer.end());
        LOG.statistics(new DoubleStatistic(GaussianAffinityMatrixBuilder.class.getName() + ".perplexity.average", mv.getMean()));
        LOG.statistics(new DoubleStatistic(GaussianAffinityMatrixBuilder.class.getName() + ".perplexity.stddev", mv.getSampleStddev()));
    }
    // Scale pij to have the desired total sum (initialScale; EARLY_EXAGGERATION in t-SNE)
    double sum = 0.;
    for (int i = 1; i < size; i++) {
        final double[] pij_i = pij[i];
        for (int j = 0; j < i; j++) {
            // Only sum over half the matrix!
            // Establish symmetry:
            sum += (pij_i[j] += pij[j][i]);
        }
    }
    // Scaling taken from original tSNE code:
    final double scale = initialScale / (2. * sum);
    for (int i = 1; i < size; i++) {
        final double[] pij_i = pij[i];
        for (int j = 0; j < i; j++) {
            pij_i[j] = pij[j][i] = MathUtil.max(pij_i[j] * scale, MIN_PIJ);
        }
    }
    return pij;
}
Also used: DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration)
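
The examples on this page all lean on the same small MeanVariance surface: put(double) streams a value in, getMean() and getSampleStddev() read the aggregates out, so only the running moments are retained. A minimal, self-contained sketch using just those calls (the class name MeanVarianceSketch is ours, for illustration):

import de.lmu.ifi.dbs.elki.math.MeanVariance;

public class MeanVarianceSketch {
    public static void main(String[] args) {
        MeanVariance mv = new MeanVariance();
        for (double v : new double[] { 1., 2., 4., 8. }) {
            // Streaming update; the individual values need not be stored
            mv.put(v);
        }
        System.out.println("mean   = " + mv.getMean()); // 3.75
        System.out.println("stddev = " + mv.getSampleStddev());
    }
}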

Example 17 with MeanVariance

Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project, from the class NearestNeighborAffinityMatrixBuilder, method computePij.

/**
 * Compute the sparse pij using the nearest neighbors only.
 *
 * @param ids ID range
 * @param knnq kNN query
 * @param square Use squared distances
 * @param numberOfNeighbours Number of neighbors to get
 * @param pij Output array for the affinity values
 * @param indices Output array for the neighbor indexes
 * @param initialScale Initial scaling factor
 */
protected void computePij(DBIDRange ids, KNNQuery<?> knnq, boolean square, int numberOfNeighbours, double[][] pij, int[][] indices, double initialScale) {
    Duration timer = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".runtime.neighborspijmatrix").begin() : null;
    final double logPerp = FastMath.log(perplexity);
    // Scratch arrays, resizable
    DoubleArray dists = new DoubleArray(numberOfNeighbours + 10);
    IntegerArray inds = new IntegerArray(numberOfNeighbours + 10);
    // Compute nearest-neighbor sparse affinity matrix
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Finding neighbors and optimizing perplexity", ids.size(), LOG) : null;
    MeanVariance mv = LOG.isStatistics() ? new MeanVariance() : null;
    for (DBIDArrayIter ix = ids.iter(); ix.valid(); ix.advance()) {
        dists.clear();
        inds.clear();
        KNNList neighbours = knnq.getKNNForDBID(ix, numberOfNeighbours + 1);
        convertNeighbors(ids, ix, square, neighbours, dists, inds);
        double beta = computeSigma(ix.getOffset(), dists, perplexity, logPerp, //
                pij[ix.getOffset()] = new double[dists.size()]);
        if (mv != null) {
            // Record sigma: beta is the precision 1/(2*sigma^2), so sigma = sqrt(.5 / beta)
            mv.put(beta > 0 ? FastMath.sqrt(.5 / beta) : 0.);
        }
        indices[ix.getOffset()] = inds.toArray();
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Sum of the sparse affinity matrix:
    double sum = 0.;
    for (int i = 0; i < pij.length; i++) {
        final double[] pij_i = pij[i];
        for (int j = 0; j < pij_i.length; j++) {
            sum += pij_i[j];
        }
    }
    final double scale = initialScale / (2 * sum);
    for (int i = 0; i < pij.length; i++) {
        final double[] pij_i = pij[i];
        for (int offi = 0; offi < pij_i.length; offi++) {
            int j = indices[i][offi];
            assert (i != j);
            int offj = containsIndex(indices[j], i);
            if (offj >= 0) {
                // Found
                assert (indices[j][offj] == i);
                // Exploit symmetry:
                if (i < j) {
                    // Symmetrize
                    final double val = pij_i[offi] + pij[j][offj];
                    pij_i[offi] = pij[j][offj] = MathUtil.max(val * scale, MIN_PIJ);
                }
            } else {
                // Not found
                // TODO: the original code produces a symmetric matrix
                // And it will now not sum to EARLY_EXAGGERATION anymore.
                pij_i[offi] = MathUtil.max(pij_i[offi] * scale, MIN_PIJ);
            }
        }
    }
    if (LOG.isStatistics()) {
        // timer != null, mv != null
        LOG.statistics(timer.end());
        LOG.statistics(new DoubleStatistic(NearestNeighborAffinityMatrixBuilder.class.getName() + ".sigma.average", mv.getMean()));
        LOG.statistics(new DoubleStatistic(NearestNeighborAffinityMatrixBuilder.class.getName() + ".sigma.stddev", mv.getSampleStddev()));
    }
}
Also used: DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration) DoubleArray(de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.DoubleArray) IntegerArray(de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray)
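
The symmetrization step above hinges on containsIndex(indices[j], i), which has to return the offset of i within the neighbor index list of j, or a negative value when i is absent. The helper itself is not shown on this page; the following linear-scan version is a hypothetical sketch of that contract only (the real ELKI helper may well use binary search on sorted index arrays):

// Hypothetical illustration of the assumed contract, not the ELKI source.
static int containsIndex(int[] indices, int i) {
    for (int off = 0; off < indices.length; off++) {
        if (indices[off] == i) {
            // Offset of i within the neighbor list
            return off;
        }
    }
    // Not found
    return -1;
}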

Example 18 with MeanVariance

Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project, from the class DistanceStddevOutlier, method run.

/**
 * Run the outlier detection algorithm
 *
 * @param database Database to use
 * @param relation Relation to analyze
 * @return Outlier score result
 */
public OutlierResult run(Database database, Relation<O> relation) {
    // Get a nearest neighbor query on the relation.
    KNNQuery<O> knnq = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k);
    // Output data storage
    WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB);
    // Track minimum and maximum scores
    DoubleMinMax minmax = new DoubleMinMax();
    // Iterate over all objects
    for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
        KNNList neighbors = knnq.getKNNForDBID(iter, k);
        // Aggregate distances
        MeanVariance mv = new MeanVariance();
        for (DoubleDBIDListIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
            // Skip the object itself. The 0 is not very informative.
            if (DBIDUtil.equal(iter, neighbor)) {
                continue;
            }
            mv.put(neighbor.doubleValue());
        }
        // Store the score and track the observed range for the score meta
        final double stddev = mv.getSampleStddev();
        scores.putDouble(iter, stddev);
        minmax.put(stddev);
    }
    // Wrap the result in the standard containers
    // Actual min-max, theoretical min-max!
    OutlierScoreMeta meta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0, Double.POSITIVE_INFINITY);
    DoubleRelation rel = new MaterializedDoubleRelation(relation.getDBIDs(), "stddev-outlier", scores);
    return new OutlierResult(meta, rel);
}
Also used: DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) BasicOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList)
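
DoubleMinMax follows the same streaming pattern as MeanVariance and feeds the observed score range into BasicOutlierScoreMeta. A minimal sketch restricted to the calls visible in the snippet (the helper name observedRange is ours, for illustration):

import de.lmu.ifi.dbs.elki.math.DoubleMinMax;

static double[] observedRange(double[] scores) {
    DoubleMinMax minmax = new DoubleMinMax();
    for (double score : scores) {
        // Track the observed score range for the result metadata
        minmax.put(score);
    }
    return new double[] { minmax.getMin(), minmax.getMax() };
}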

Example 19 with MeanVariance

Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project, from the class IndexPurity, method processNewResult.

@Override
public void processNewResult(ResultHierarchy hier, Result newResult) {
    Database database = ResultUtil.findDatabase(hier);
    final ArrayList<SpatialIndexTree<?, ?>> indexes = ResultUtil.filterResults(hier, newResult, SpatialIndexTree.class);
    if (indexes == null || indexes.isEmpty()) {
        return;
    }
    Relation<String> lblrel = DatabaseUtil.guessLabelRepresentation(database);
    for (SpatialIndexTree<?, ?> index : indexes) {
        List<? extends SpatialEntry> leaves = index.getLeaves();
        MeanVariance mv = new MeanVariance();
        for (SpatialEntry e : leaves) {
            SpatialDirectoryEntry leaf = (SpatialDirectoryEntry) e;
            Node<?> n = index.getNode(leaf.getPageID());
            final int total = n.getNumEntries();
            HashMap<String, Integer> map = new HashMap<>(total);
            for (int i = 0; i < total; i++) {
                DBID id = ((SpatialPointLeafEntry) n.getEntry(i)).getDBID();
                String label = lblrel.get(id);
                // Count occurrences of each class label on this page
                map.merge(label, 1, Integer::sum);
            }
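            // Sum of squared relative label frequencies (Simpson index); 1.0 for a pure page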
            double gini = 0.0;
            for (Entry<String, Integer> ent : map.entrySet()) {
                double rel = ent.getValue() / (double) total;
                gini += rel * rel;
            }
            mv.put(gini);
        }
        Collection<double[]> col = new ArrayList<>();
        col.add(new double[] { mv.getMean(), mv.getSampleStddev() });
        database.getHierarchy().add((Result) index, new CollectionResult<>("Gini coefficient of index", "index-gini", col));
    }
}
Also used: SpatialPointLeafEntry(de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialPointLeafEntry) HashMap(java.util.HashMap) DBID(de.lmu.ifi.dbs.elki.database.ids.DBID) ArrayList(java.util.ArrayList) SpatialIndexTree(de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialIndexTree) SpatialEntry(de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialEntry) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) SpatialDirectoryEntry(de.lmu.ifi.dbs.elki.index.tree.spatial.SpatialDirectoryEntry) Database(de.lmu.ifi.dbs.elki.database.Database)
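
A note on naming: the per-leaf value the loop accumulates is the sum of squared relative label frequencies, i.e. the Simpson index, a purity measure that equals 1.0 exactly when the page holds a single label; the Gini impurity would be 1 minus that sum. Pulled out as a stand-alone helper for clarity (hypothetical, not part of ELKI):

import java.util.Map;

// Purity of one leaf page: sum of squared relative label frequencies.
static double pagePurity(Map<String, Integer> counts, int total) {
    double sum = 0.;
    for (int c : counts.values()) {
        double rel = c / (double) total;
        sum += rel * rel;
    }
    // 1.0 iff every entry on the page carries the same label
    return sum;
}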

Example 20 with MeanVariance

Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project, from the class HopkinsStatisticClusteringTendency, method run.

/**
 * Runs the algorithm in the timed evaluation part.
 *
 * @param database Database context
 * @param relation Relation to analyze
 */
public Result run(Database database, Relation<NumberVector> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    final DistanceQuery<NumberVector> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<NumberVector> knnQuery = database.getKNNQuery(distanceQuery, k + 1);
    final double[] min = new double[dim], extend = new double[dim];
    initializeDataExtends(relation, dim, min, extend);
    if (!LOG.isStatistics()) {
        LOG.warning("This algorithm must be used with at least logging level " + Level.STATISTICS);
    }
    MeanVariance hmean = new MeanVariance(), umean = new MeanVariance(), wmean = new MeanVariance();
    // Repeat the experiment to obtain a more stable result
    for (int j = 0; j < this.rep; j++) {
        // Compute NN distances for random objects from within the database
        double w = computeNNForRealData(knnQuery, relation, dim);
        // Compute NN distances for randomly created new uniform objects
        double u = computeNNForUniformData(knnQuery, min, extend);
        // Compute the Hopkins statistic h = u / (u + w), i.e. a / (1 + a) with a = u / w
        double h = u / (u + w);
        hmean.put(h);
        umean.put(u);
        wmean.put(w);
    }
    final String prefix = this.getClass().getName();
    LOG.statistics(new LongStatistic(prefix + ".samplesize", sampleSize));
    LOG.statistics(new LongStatistic(prefix + ".dim", dim));
    LOG.statistics(new LongStatistic(prefix + ".hopkins.nearest-neighbor", k));
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.h.mean", hmean.getMean()));
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.u.mean", umean.getMean()));
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.w.mean", wmean.getMean()));
    if (rep > 1) {
        LOG.statistics(new DoubleStatistic(prefix + ".hopkins.h.std", hmean.getSampleStddev()));
        LOG.statistics(new DoubleStatistic(prefix + ".hopkins.u.std", umean.getSampleStddev()));
        LOG.statistics(new DoubleStatistic(prefix + ".hopkins.w.std", wmean.getSampleStddev()));
    }
    // Evaluate:
    double x = hmean.getMean();
    // See Hopkins' paper for the claim that x is Beta distributed under the null.
    double ix = BetaDistribution.regularizedIncBeta(x, sampleSize, sampleSize);
    double p = (x > .5) ? (1. - ix) : ix;
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.p", p));
    return null;
}
Also used: DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
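
The closing p-value computation is self-contained: the mean Hopkins statistic x is tested against a Beta(sampleSize, sampleSize) null distribution through the regularized incomplete Beta function. A minimal sketch of just that step (hopkinsPValue is our name; we assume BetaDistribution lives in de.lmu.ifi.dbs.elki.math.statistics.distribution, as is usual for ELKI):

import de.lmu.ifi.dbs.elki.math.statistics.distribution.BetaDistribution;

// Tail probability of the mean Hopkins statistic x under the
// Beta(sampleSize, sampleSize) null hypothesis of uniform, unclustered data.
static double hopkinsPValue(double x, int sampleSize) {
    double ix = BetaDistribution.regularizedIncBeta(x, sampleSize, sampleSize);
    return (x > .5) ? (1. - ix) : ix;
}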

Aggregations

MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance): 61
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 32
DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation): 17
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 17
DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax): 15
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 13
WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore): 9
KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList): 9
MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation): 9
OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult): 9
OutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta): 9
MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle): 8
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 8
DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector): 7
Mean (de.lmu.ifi.dbs.elki.math.Mean): 7
NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 6
DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter): 6
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 5
DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter): 5
AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest): 5