Search in sources :

Example 1 with DoubleMinHeap

use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.

the class LBABOD method run.

/**
 * Run LB-ABOD on the data set.
 *
 * First computes, for every object, an approximate score from its nearest
 * neighbors (adjusted to a lower bound), then sorts the objects by this value
 * and refines them one by one with {@code computeABOF} until the stopping
 * condition on the heap of top scores is met.
 *
 * @param db Database, used to obtain the similarity query for the kernel
 * @param relation Relation to process
 * @return Outlier detection result
 */
@Override
public OutlierResult run(Database db, Relation<V> relation) {
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    // Two array iterators that are reused for the inner loops and handed to
    // computeABOF during refinement.
    DBIDArrayIter pB = ids.iter(), pC = ids.iter();
    SimilarityQuery<V> sq = db.getSimilarityQuery(relation, kernelFunction);
    KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
    // Output storage.
    WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmaxabod = new DoubleMinMax();
    // Largest approximate (lower bound) value seen in the first phase.
    double max = 0.;
    // Storage for squared distances (will be reused!)
    WritableDoubleDataStore sqDists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
    // Nearest neighbor heap (will be reused!)
    KNNHeap nn = DBIDUtil.newHeap(k);
    // Priority queue for candidates
    ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList(relation.size());
    // get Candidate Ranking
    for (DBIDIter pA = relation.iterDBIDs(); pA.valid(); pA.advance()) {
        // Compute nearest neighbors and distances.
        nn.clear();
        double simAA = kernelMatrix.getSimilarity(pA, pA);
        // Sum of 1./(|AB|) and 1./(|AB|^2); for computing R2.
        double sumid = 0., sumisqd = 0.;
        for (pB.seek(0); pB.valid(); pB.advance()) {
            if (DBIDUtil.equal(pB, pA)) {
                continue;
            }
            double simBB = kernelMatrix.getSimilarity(pB, pB);
            double simAB = kernelMatrix.getSimilarity(pA, pB);
            // Squared distance from similarities: <A,A> + <B,B> - 2 <A,B>
            double sqdAB = simAA + simBB - simAB - simAB;
            sqDists.putDouble(pB, sqdAB);
            // NOTE(review): there is no guard for sqdAB == 0 here (e.g. duplicate
            // points), so isqdAB becomes infinite and propagates into both sums;
            // the neighbor loop below does skip non-positive distances - confirm
            // whether this asymmetry is intended.
            final double isqdAB = 1. / sqdAB;
            sumid += FastMath.sqrt(isqdAB);
            sumisqd += isqdAB;
            // Update heap
            nn.insert(sqdAB, pB);
        }
        // Compute FastABOD approximation, adjust for lower bound.
        // LB-ABOF is defined via a numerically unstable formula.
        // Variance as E(X^2)-E(X)^2 suffers from catastrophic cancellation!
        // TODO: ensure numerical precision!
        double nnsum = 0., nnsumsq = 0., nnsumisqd = 0.;
        KNNList nl = nn.toKNNList();
        DoubleDBIDListIter iB = nl.iter(), iC = nl.iter();
        for (; iB.valid(); iB.advance()) {
            double sqdAB = iB.doubleValue();
            double simAB = kernelMatrix.getSimilarity(pA, iB);
            // Written as !(x > 0) so that NaN is skipped as well as zero and
            // negative values.
            if (!(sqdAB > 0.)) {
                continue;
            }
            // iC starts one position after iB, so each unordered pair (B, C) of
            // neighbors is visited exactly once.
            for (iC.seek(iB.getOffset() + 1); iC.valid(); iC.advance()) {
                double sqdAC = iC.doubleValue();
                double simAC = kernelMatrix.getSimilarity(pA, iC);
                if (!(sqdAC > 0.)) {
                    continue;
                }
                // Exploit bilinearity of scalar product:
                // <B-A, C-A> = <B, C-A> - <A,C-A>
                // = <B,C> - <B,A> - <A,C> + <A,A>
                double simBC = kernelMatrix.getSimilarity(iB, iC);
                double numerator = simBC - simAB - simAC + simAA;
                // sqweight = 1 / (|AB|^2 |AC|^2), weight = 1 / (|AB| |AC|)
                double sqweight = 1. / (sqdAB * sqdAC);
                double weight = FastMath.sqrt(sqweight);
                double val = numerator * sqweight;
                nnsum += val * weight;
                nnsumsq += val * val * weight;
                nnsumisqd += sqweight;
            }
        }
        // Remaining weight, term R2:
        double r2 = sumisqd * sumisqd - 2. * nnsumisqd;
        double tmp = (2. * nnsum + r2) / (sumid * sumid);
        double lbabof = 2. * nnsumsq / (sumid * sumid) - tmp * tmp;
        // Track maximum?
        if (lbabof > max) {
            max = lbabof;
        }
        // Store the approximate value; it is overwritten below for every object
        // that gets refined.
        abodvalues.putDouble(pA, lbabof);
        candidates.add(lbabof, pA);
    }
    // Put maximum from approximate values.
    minmaxabod.put(max);
    candidates.sort();
    // refine Candidates
    int refinements = 0;
    // NOTE(review): the heap is created with size l, but the size checks below
    // compare against k - confirm which parameter is intended.
    DoubleMinHeap topscores = new DoubleMinHeap(l);
    // Reusable statistics accumulator passed to computeABOF.
    MeanVariance s = new MeanVariance();
    for (DoubleDBIDListIter pA = candidates.iter(); pA.valid(); pA.advance()) {
        // Stop refining
        if (topscores.size() >= k && pA.doubleValue() > topscores.peek()) {
            break;
        }
        final double abof = computeABOF(kernelMatrix, pA, pB, pC, s);
        // Store refined score:
        abodvalues.putDouble(pA, abof);
        minmaxabod.put(abof);
        // Update the heap tracking the top scores.
        if (topscores.size() < k) {
            topscores.add(abof);
        } else {
            if (topscores.peek() > abof) {
                topscores.replaceTopElement(abof);
            }
        }
        refinements += 1;
    }
    if (LOG.isStatistics()) {
        // NOTE(review): this changes the logging verbosity as a side effect of
        // running the algorithm; presumably a global setting - confirm that this
        // is intended and not a debugging leftover.
        LoggingConfiguration.setVerbose(Level.VERYVERBOSE);
        LOG.statistics(new LongStatistic("lb-abod.refinements", refinements));
    }
    // Build result representation.
    DoubleRelation scoreResult = new MaterializedDoubleRelation("Angle-based Outlier Detection", "abod-outlier", abodvalues, ids);
    OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
    return new OutlierResult(scoreMeta, scoreResult);
}
Also used : DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) InvertedOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta) KNNHeap(de.lmu.ifi.dbs.elki.database.ids.KNNHeap) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) InvertedOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) KernelMatrix(de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)

Example 2 with DoubleMinHeap

use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.

the class SURFINGDependenceMeasure method dependence.

/**
 * Dependence of two attributes, based on how much the k-nearest-neighbor
 * distances in the 2d projection deviate from their mean.
 *
 * For every point, the Euclidean distance to its k-th nearest other point
 * is computed, with {@code k = max(1, len / 10)}. The result is the sum of
 * absolute deviations from the mean k-distance, divided by
 * {@code 2 * mean * below}, where {@code below} is the number of points
 * whose k-distance is smaller than the mean.
 *
 * The previous version put all distances (including the zero distance of
 * each point to itself) into a min-heap and read its top immediately, so
 * the "k-distance" was always 0 and the method always returned 0.
 *
 * @param adapter1 Array adapter for the first attribute
 * @param data1 Data of the first attribute
 * @param adapter2 Array adapter for the second attribute
 * @param data2 Data of the second attribute
 * @return dependence value; 0 if there are fewer than two points or no
 *         k-distance lies below the mean
 */
@Reference(authors = "Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and Peer Kröger", //
    title = "Subspace Selection for Clustering High-Dimensional Data", //
    booktitle = "IEEE International Conference on Data Mining, 2004", url = "http://dx.doi.org/10.1109/ICDM.2004.10112")
@Override
public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
    final int len = size(adapter1, data1, adapter2, data2);
    // Without a second point there is no neighbor distance to evaluate
    // (the original code also returned 0 for these inputs).
    if (len < 2) {
        return 0;
    }
    // For len >= 2 this is always at most len - 1, the number of neighbors.
    final int k = Math.max(1, len / 10);
    double[] knns = new double[len];
    DoubleMinHeap heap = new DoubleMinHeap(k);
    double kdistmean = 0.;
    for (int i = 0; i < len; ++i) {
        double ix = adapter1.getDouble(data1, i), iy = adapter2.getDouble(data2, i);
        heap.clear();
        for (int j = 0; j < len; ++j) {
            // A point is not its own neighbor; its zero self-distance would
            // otherwise always be the smallest entry.
            if (j == i) {
                continue;
            }
            double jx = adapter1.getDouble(data1, j), jy = adapter2.getDouble(data2, j);
            double dx = ix - jx, dy = iy - jy;
            // Squared Euclidean.
            heap.add(dx * dx + dy * dy);
        }
        // Drop the k-1 nearest neighbors, so that the smallest remaining
        // element is the distance to the k-th nearest neighbor.
        for (int r = 1; r < k; ++r) {
            heap.poll();
        }
        // Euclidean
        double kdist = FastMath.sqrt(heap.peek());
        knns[i] = kdist;
        kdistmean += kdist;
    }
    kdistmean /= len;
    // Deviation from mean:
    double diff = 0.;
    int below = 0;
    for (int i = 0; i < knns.length; i++) {
        diff += Math.abs(kdistmean - knns[i]);
        if (knns[i] < kdistmean) {
            below++;
        }
    }
    // below == 0 happens when all k-distances are identical (including the
    // all-zero case); avoid the division by zero then.
    return (below > 0) ? diff / (2. * kdistmean * below) : 0;
}
Also used : DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) Reference(de.lmu.ifi.dbs.elki.utilities.documentation.Reference)

Example 3 with DoubleMinHeap

use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.

the class KMeansMinusMinus method run.

/**
 * Run the k-means-- clustering on the given relation.
 *
 * In every iteration the points are assigned to their nearest cluster
 * while a heap (of size {@code heapsize}) collects distances; the value at
 * the top of this heap is then used as threshold when recomputing the means
 * and, if {@code noiseFlag} is set, for moving points into a separate noise
 * cluster at the end.
 *
 * @param database Database, passed to the initialization method
 * @param relation Relation to cluster
 * @return Clustering with one top-level cluster per non-empty cluster, plus a
 *         noise cluster if noise was requested and any point was moved to it
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    // Initialize the means
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Initialize the heap: a rate below 1 is relative to the data set size,
    // otherwise it is used as an absolute number.
    final int heapsize = (int) (rate < 1. ? Math.ceil(relation.size() * rate) : rate);
    DoubleMinHeap minHeap = new DoubleMinHeap(heapsize);
    // Setup cluster assignment store
    List<ModifiableDoubleDBIDList> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
    // Otherwise, the vartotal break below will fail!
    assert (varstat != null);
    int iteration = 0;
    double prevvartotal = Double.POSITIVE_INFINITY;
    // A non-positive maxiter means "no iteration limit".
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        minHeap.clear();
        for (int i = 0; i < k; i++) {
            clusters.get(i).clear();
        }
        LOG.incrementProcessed(prog);
        boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, minHeap, heapsize);
        double vartotal = logVarstat(varstat, varsum);
        // Stop if no assignment changed, or if the variance sum is larger
        // than the previous value.
        if (!changed || vartotal > prevvartotal) {
            break;
        }
        prevvartotal = vartotal;
        // Recompute means.
        means = meansWithTreshhold(clusters, means, relation, heapsize > 0 ? minHeap.peek() : Double.POSITIVE_INFINITY);
    }
    // Create the noise cluster, if requested
    ModifiableDoubleDBIDList noiseids = null;
    if (noiseFlag && heapsize > 0) {
        clusters.add(noiseids = DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
        double tresh = minHeap.peek();
        for (int i = 0; i < k; i++) {
            for (DoubleDBIDListMIter it = clusters.get(i).iter(); it.valid(); it.advance()) {
                final double dist = it.doubleValue();
                // Add to the noise cluster:
                if (dist >= tresh) {
                    noiseids.add(dist, it);
                    assignment.putInt(it, k);
                    it.remove();
                }
            }
        }
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < k; i++) {
        DBIDs ids = clusters.get(i);
        if (ids.size() == 0) {
            continue;
        }
        KMeansModel model = new KMeansModel(means[i], varsum[i]);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    // Noise Cluster
    // noiseids is only allocated when noiseFlag is set AND heapsize > 0, so
    // test the list itself rather than noiseFlag: with noiseFlag set and
    // heapsize == 0 the previous check dereferenced null.
    if (noiseids != null && noiseids.size() > 0) {
        KMeansModel model = new KMeansModel(null, 0);
        DBIDs ids = noiseids;
        result.addToplevelCluster(new Cluster<>(ids, true, model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) DoubleDBIDListMIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 4 with DoubleMinHeap

use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.

the class EvaluateCIndex method evaluateClustering.

/**
 * Compute the C-Index of one clustering, log it, and attach it to the result
 * hierarchy of the database.
 *
 * @param db Database
 * @param rel Data relation
 * @param dq Distance query, passed on to the per-cluster processing
 * @param c Clustering
 * @return C-Index
 */
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    final List<? extends Cluster<?>> clusters = c.getAllClusters();
    // First pass: number of within-cluster distances (w), and the number of
    // objects that are skipped as noise.
    int skippedNoise = 0, w = 0;
    for (Cluster<?> cl : clusters) {
        final int size = cl.size();
        boolean counted = true;
        if (size <= 1 || cl.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    // Not evaluated at all.
                    skippedNoise += size;
                    counted = false;
                    break;
                case TREAT_NOISE_AS_SINGLETONS:
                    // Singletons have no within-cluster distances.
                    counted = false;
                    break;
                case MERGE_NOISE:
                    // Handled like a regular cluster.
                    break;
                default:
                    LOG.warning("Unknown noise handling option: " + noiseOption);
            }
        }
        if (counted) {
            w += (size * (size - 1)) >>> 1;
        }
    }
    // TODO: for small k=2, and balanced clusters, it may be more efficient to
    // just build a long array with all distances, and select the quantiles.
    // The heaps used below pay off in memory consumption for k > 2
    // The heap types are swapped on purpose: to keep the w largest distances
    // we need to replace the smallest of them (a min heap), and vice versa.
    DoubleHeap maxDists = new DoubleMinHeap(w);
    DoubleHeap minDists = new DoubleMaxHeap(w);
    FiniteProgress prog = null;
    if (LOG.isVerbose()) {
        prog = new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG);
    }
    // Second pass: accumulate the sum of within-cluster distances.
    double theta = 0.;
    for (int idx = 0; idx < clusters.size(); idx++) {
        final Cluster<?> cl = clusters.get(idx);
        boolean regular = true;
        if (cl.size() <= 1 || cl.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    regular = false;
                    break;
                case TREAT_NOISE_AS_SINGLETONS:
                    processSingleton(cl, rel, dq, maxDists, minDists, w);
                    regular = false;
                    break;
                case MERGE_NOISE:
                    // Processed as a regular cluster below.
                    break;
            }
        }
        if (regular) {
            theta += processCluster(cl, clusters, idx, dq, maxDists, minDists, w);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Best and worst case: sums of the w smallest and the w largest distances.
    assert (minDists.size() == w);
    assert (maxDists.size() == w);
    double min = 0;
    for (DoubleHeap.UnsortedIter iter = minDists.unsortedIter(); iter.valid(); iter.advance()) {
        min += iter.get();
    }
    double max = 0;
    for (DoubleHeap.UnsortedIter iter = maxDists.unsortedIter(); iter.valid(); iter.advance()) {
        max += iter.get();
    }
    assert (max >= min);
    // Degenerate case (no spread between best and worst case) yields 1.
    final double cIndex;
    if (max > min) {
        cIndex = (theta - min) / (max - min);
    } else {
        cIndex = 1.;
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
        if (skippedNoise > 0) {
            LOG.statistics(new LongStatistic(key + ".c-index.ignored", skippedNoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
    }
    // Publish the measure in the evaluation result of this clustering.
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup group = ev.findOrCreateGroup("Distance-based Evaluation");
    group.addMeasure("C-Index", cIndex, 0., 1., 0., true);
    db.getHierarchy().resultChanged(ev);
    return cIndex;
}
Also used : DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DoubleHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleHeap) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DoubleMaxHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Aggregations

DoubleMinHeap (de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap)4 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)3 ModifiableDoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList)2 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)2 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)2 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)1 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)1 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)1 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)1 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)1 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)1 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)1 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)1 DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter)1 DoubleDBIDListMIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter)1 KNNHeap (de.lmu.ifi.dbs.elki.database.ids.KNNHeap)1 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)1 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)1 MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)1 KernelMatrix (de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix)1