Example 1 with FiniteProgress

Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.

The class NaiveAgglomerativeHierarchicalClustering2, method run:

/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public Result run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();
    if (size > 0x10000) {
        throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter();
    // Position counter - must agree with computeOffset!
    int pos = 0;
    for (int x = 0; ix.valid(); x++, ix.advance()) {
        iy.seek(0);
        for (int y = 0; y < x; y++, iy.advance()) {
            scratch[pos] = dq.distance(ix, iy);
            pos++;
        }
    }
    // Initialize space for result:
    double[] height = new double[size];
    Arrays.fill(height, Double.POSITIVE_INFINITY);
    // Parent node, to track merges
    // have every object point to itself initially
    ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
    // Active clusters, when not trivial.
    Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
    // Repeat until everything merged, except the desired number of clusters:
    final int stop = size - numclusters;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
    for (int i = 0; i < stop; i++) {
        double min = Double.POSITIVE_INFINITY;
        int minx = -1, miny = -1;
        for (int x = 0; x < size; x++) {
            if (height[x] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int xbase = triangleSize(x);
            for (int y = 0; y < x; y++) {
                if (height[y] < Double.POSITIVE_INFINITY) {
                    continue;
                }
                final int idx = xbase + y;
                if (scratch[idx] < min) {
                    min = scratch[idx];
                    minx = x;
                    miny = y;
                }
            }
        }
        assert (minx >= 0 && miny >= 0);
        // Avoid allocating memory, by reusing existing iterators:
        ix.seek(minx);
        iy.seek(miny);
        // Perform merge in data structure: x -> y
        // Since y < x, prefer keeping y, dropping x.
        height[minx] = min;
        parent.set(minx, iy);
        // Merge into cluster
        ModifiableDBIDs cx = clusters.get(minx);
        ModifiableDBIDs cy = clusters.get(miny);
        if (cy == null) {
            cy = DBIDUtil.newHashSet();
            cy.add(iy);
        }
        if (cx == null) {
            cy.add(ix);
        } else {
            cy.addDBIDs(cx);
            clusters.remove(minx);
        }
        clusters.put(miny, cy);
        // Update distance matrix. Note: miny < minx
        final int xbase = triangleSize(minx), ybase = triangleSize(miny);
        // Write to (y, j), with j < y
        for (int j = 0; j < miny; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            scratch[ybase + j] = Math.min(scratch[xbase + j], scratch[ybase + j]);
        }
        // Write to (j, y), with y < j < x
        for (int j = miny + 1; j < minx; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[xbase + j], scratch[jbase + miny]);
        }
        // Write to (j, y), with y < x < j
        for (int j = minx + 1; j < size; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[jbase + minx], scratch[jbase + miny]);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Build the clustering result
    final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
    for (int x = 0; x < size; x++) {
        // Only roots (clusters never merged away) still have infinite height;
        // merged entries got a finite height above, so the test must be ==.
        if (height[x] == Double.POSITIVE_INFINITY) {
            DBIDs cids = clusters.get(x);
            if (cids == null) {
                ix.seek(x);
                cids = DBIDUtil.deref(ix);
            }
            Cluster<Model> cluster = new Cluster<>("Cluster", cids);
            dendrogram.addToplevelCluster(cluster);
        }
    }
    return dendrogram;
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), Clustering (de.lmu.ifi.dbs.elki.data.Clustering), Model (de.lmu.ifi.dbs.elki.data.model.Model), AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException), Int2ReferenceOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ReferenceOpenHashMap)
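
The excerpt addresses the packed lower-triangular distance matrix through a triangleSize helper that is not shown above; per the comment in the initialization loop, the running position counter "must agree with computeOffset", i.e. the distance d(x, y) with y < x lives at triangleSize(x) + y. A minimal sketch of that addressing scheme, assuming the standard x*(x-1)/2 row-offset formula (the class name TriangleIndex is illustrative, not ELKI's):

public final class TriangleIndex {

    /** Offset of row x: number of cells in rows 0..x-1 of the packed matrix. */
    static int triangleSize(int x) {
        return (x * (x - 1)) >>> 1;
    }

    /** Index of d(x, y) in the packed array; requires y < x. */
    static int computeOffset(int x, int y) {
        assert y < x;
        return triangleSize(x) + y;
    }

    public static void main(String[] args) {
        // For size = 4 objects the packed matrix holds 6 entries, written in
        // the order (1,0), (2,0), (2,1), (3,0), (3,1), (3,2) - exactly the
        // order the initialization loop above fills them.
        int size = 4, pos = 0;
        for (int x = 0; x < size; x++) {
            for (int y = 0; y < x; y++, pos++) {
                assert computeOffset(x, y) == pos;
            }
        }
        System.out.println(pos == triangleSize(size)); // true
    }
}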

Example 2 with FiniteProgress

Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.

The class MiniMaxAnderberg, method run:

/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = DatabaseUtil.precomputedDistanceQuery(db, relation, getDistanceFunction(), LOG);
    final DBIDs ids = relation.getDBIDs();
    final int size = ids.size();
    // Initialize space for result:
    PointerHierarchyRepresentationBuilder builder = new PointerHierarchyRepresentationBuilder(ids, dq.getDistanceFunction().isSquared());
    Int2ObjectOpenHashMap<ModifiableDBIDs> clusters = new Int2ObjectOpenHashMap<>();
    // Compute the initial (lower triangular) distance matrix.
    MatrixParadigm mat = new MatrixParadigm(ids);
    ArrayModifiableDBIDs prots = DBIDUtil.newArray(MatrixParadigm.triangleSize(size));
    DBIDArrayMIter protiter = prots.iter();
    MiniMax.initializeMatrices(mat, prots, dq);
    // Arrays used for caching:
    double[] bestd = new double[size];
    int[] besti = new int[size];
    initializeNNCache(mat.matrix, bestd, besti);
    // Repeat until everything merged into 1 cluster
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
    DBIDArrayIter ix = mat.ix;
    for (int i = 1, end = size; i < size; i++) {
        end = AGNES.shrinkActiveSet(ix, builder, end,
                findMerge(end, mat, protiter, builder, clusters, bestd, besti, dq));
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    return (PointerPrototypeHierarchyRepresentationResult) builder.complete();
}
Also used: Int2ObjectOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap), FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)
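
initializeNNCache is called above but not shown. The point of Anderberg's acceleration is to cache, per row of the packed matrix, the best merge candidate found so far, so the global minimum search scans one array of length n instead of the full O(n^2) matrix. A hedged reconstruction of such a cache initialization (a sketch of the idea, not ELKI's actual method):

static void initializeNNCache(double[] matrix, double[] bestd, int[] besti) {
    // bestd[x] caches the smallest distance in row x of the packed
    // lower-triangular matrix, besti[x] the matching column index.
    java.util.Arrays.fill(bestd, Double.POSITIVE_INFINITY);
    java.util.Arrays.fill(besti, -1);
    // Row x starts at offset triangleSize(x) and holds d(x,0)..d(x,x-1);
    // walking x and y in order traverses the packed array sequentially.
    for (int x = 1, p = 0; x < bestd.length; x++) {
        for (int y = 0; y < x; y++, p++) {
            if (matrix[p] < bestd[x]) {
                bestd[x] = matrix[p];
                besti[x] = y;
            }
        }
    }
}

After a merge, only rows whose cached partner involved one of the merged clusters need a rescan, which is what keeps Anderberg's variant fast in practice.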

Example 3 with FiniteProgress

Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.

The class NNChain, method nnChainCore:

/**
 * Uses NNChain as in "Modern hierarchical, agglomerative clustering
 * algorithms" by Daniel Müllner
 *
 * @param mat Matrix view
 * @param builder Result builder
 */
private void nnChainCore(MatrixParadigm mat, PointerHierarchyRepresentationBuilder builder) {
    final DBIDArrayIter ix = mat.ix;
    final double[] distances = mat.matrix;
    final int size = mat.size;
    // The maximum chain size = number of ids + 1
    IntegerArray chain = new IntegerArray(size + 1);
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running NNChain", size - 1, LOG) : null;
    for (int k = 1, end = size; k < size; k++) {
        int a = -1, b = -1;
        if (chain.size() <= 3) {
            // Accessing two arbitrary not yet merged elements could be optimized to
            // work in O(1) like in Müllner;
            // however this usually does not have a huge impact (empirically just
            // about 1/5000 of total performance)
            a = findUnlinked(0, end, ix, builder);
            b = findUnlinked(a + 1, end, ix, builder);
            chain.clear();
            chain.add(a);
        } else {
            // Chain is expected to look like (.... a, b, c, b) with b and c merged.
            int lastIndex = chain.size;
            int c = chain.get(lastIndex - 2);
            b = chain.get(lastIndex - 3);
            a = chain.get(lastIndex - 4);
            // Ensure we had a loop at the end:
            assert (chain.get(lastIndex - 1) == c || chain.get(lastIndex - 1) == b);
            // if c < b, then we merged b -> c, otherwise c -> b
            b = c < b ? c : b;
            // Cut the tail:
            chain.size -= 3;
        }
        // For ties, always prefer the second-last element b:
        double minDist = mat.get(a, b);
        do {
            int c = b;
            final int ta = MatrixParadigm.triangleSize(a);
            for (int i = 0; i < a; i++) {
                if (i != b && !builder.isLinked(ix.seek(i))) {
                    double dist = distances[ta + i];
                    if (dist < minDist) {
                        minDist = dist;
                        c = i;
                    }
                }
            }
            for (int i = a + 1; i < size; i++) {
                if (i != b && !builder.isLinked(ix.seek(i))) {
                    double dist = distances[MatrixParadigm.triangleSize(i) + a];
                    if (dist < minDist) {
                        minDist = dist;
                        c = i;
                    }
                }
            }
            b = a;
            a = c;
            chain.add(a);
        } while (chain.size() < 3 || a != chain.get(chain.size - 1 - 2));
        // We always merge the larger into the smaller index:
        if (a < b) {
            int tmp = a;
            a = b;
            b = tmp;
        }
        assert (minDist == mat.get(a, b));
        assert (b < a);
        merge(size, mat, builder, minDist, a, b);
        // Shrink working set
        end = AGNES.shrinkActiveSet(ix, builder, end, a);
        LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter), IntegerArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray)
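
findUnlinked, used above to start a fresh chain, is not part of the excerpt. Judging from the call sites, it returns the first position in [start, end) whose object the builder has not merged yet; a sketch under that assumption (signature inferred from the calls above, not copied from ELKI):

static int findUnlinked(int start, int end, DBIDArrayIter ix,
        PointerHierarchyRepresentationBuilder builder) {
    // Scan positions [start, end); builder.isLinked(ix.seek(i)) is the same
    // "already merged" test the chain-following loops above use.
    for (int i = start; i < end; i++) {
        if (!builder.isLinked(ix.seek(i))) {
            return i;
        }
    }
    return -1; // no unmerged object left in the range
}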

Example 4 with FiniteProgress

Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.

The class SLINKHDBSCANLinearMemory, method run:

/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public PointerDensityHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    final DistanceQuery<O> distQ = db.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<O> knnQ = db.getKNNQuery(distQ, minPts);
    // We need array addressing later.
    final ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    // Compute the core distances
    // minPts + 1: ignore query point.
    final WritableDoubleDataStore coredists = computeCoreDists(ids, knnQ, minPts);
    WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore lambda = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Double.POSITIVE_INFINITY);
    // Temporary storage for m.
    WritableDoubleDataStore m = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running HDBSCAN*-SLINK", ids.size(), LOG) : null;
    // has to be an array for monotonicity reasons!
    ModifiableDBIDs processedIDs = DBIDUtil.newArray(ids.size());
    for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
        // Steps 1,3,4 are exactly as in SLINK
        step1(id, pi, lambda);
        // Step 2 is modified to use a different distance
        step2(id, processedIDs, distQ, coredists, m);
        step3(id, pi, lambda, processedIDs, m);
        step4(id, pi, lambda, processedIDs);
        processedIDs.add(id);
        LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);
    return new PointerDensityHierarchyRepresentationResult(ids, pi, lambda, distQ.getDistanceFunction().isSquared(), coredists);
}
Also used: WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore), FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs), ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs), WritableDBIDDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore), DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)
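
The comment "Step 2 is modified to use a different distance" refers to HDBSCAN*'s mutual reachability distance, which combines the core distances computed above with the raw distance. The defining formula as a one-line sketch (the method name is illustrative; ELKI folds this computation into step2):

static double mutualReachability(double coreA, double coreB, double dist) {
    // d_mreach(a, b) = max(core(a), core(b), d(a, b)), per the HDBSCAN* paper.
    return Math.max(Math.max(coreA, coreB), dist);
}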

Example 5 with FiniteProgress

Use of de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress in project elki by elki-project.

The class LSDBC, method fillDensities:

/**
 * Collect all densities into an array for sorting.
 *
 * @param knnq kNN query
 * @param ids DBIDs to process
 * @param dens Density storage
 */
private void fillDensities(KNNQuery<O> knnq, DBIDs ids, WritableDoubleDataStore dens) {
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Densities", ids.size(), LOG) : null;
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        final KNNList neighbors = knnq.getKNNForDBID(iter, k);
        dens.putDouble(iter, neighbors.getKNNDistance());
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)
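
All five examples share the same null-safe progress idiom: allocate a FiniteProgress only when verbose logging is enabled, hand the possibly-null reference to incrementProcessed inside the loop, and finish with ensureCompleted. Distilled into a template (the task name, items count, and loop body are placeholders):

FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("My task", items, LOG) : null;
for (int i = 0; i < items; i++) {
    // ... per-item work goes here ...
    LOG.incrementProcessed(prog); // null-safe, so no explicit guard is needed
}
LOG.ensureCompleted(prog); // marks the progress as 100% complete

Because both logging calls tolerate a null progress object, the verbosity check happens exactly once, at allocation time.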

Aggregations

FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 145 uses
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 78
KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList): 34
WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore): 33
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 29
DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax): 25
AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException): 25
MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation): 23
OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult): 23
OutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta): 23
DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation): 21
DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter): 20
ArrayList (java.util.ArrayList): 18
DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter): 17
MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance): 17
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 16
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 16
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 14
Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration): 13
ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs): 12