Search in sources :

Example 66 with AbortException

use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

the class NaiveAgglomerativeHierarchicalClustering4 method run.

/**
 * Run the algorithm: naive agglomerative hierarchical clustering on a
 * linearized lower triangular distance matrix.
 *
 * All pairwise distances are computed up front. Then {@code size - 1} merges
 * are performed; each merge scans the matrix for the smallest remaining entry,
 * records the merge in the pointer representation (parent / height) and
 * updates the affected matrix entries via {@code linkage.combine}.
 *
 * @param db Database, used to obtain the distance query
 * @param relation Relation to cluster
 * @return Clustering hierarchy
 * @throws AbortException if the relation contains more than 0x10000 objects
 */
public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();
    // Refuse inputs whose triangular matrix could not be indexed with an int.
    if (size > 0x10000) {
        throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    if (Linkage.SINGLE.equals(linkage)) {
        LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    }
    // Compute the initial (lower triangular) distance matrix.
    // Entry (x, y) with y < x is later addressed as triangleSize(x) + y.
    double[] scratch = new double[triangleSize(size)];
    // Three iterators over the same array; they are re-seeked and reused below.
    DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
    // Position counter - must agree with computeOffset!
    // (i.e. with the triangleSize(x) + y addressing used in the merge loop)
    int pos = 0;
    // Only square the distances for Ward linkage, and only if the distance
    // function does not already report squared values.
    boolean square = Linkage.WARD.equals(linkage) && !getDistanceFunction().isSquared();
    for (int x = 0; ix.valid(); x++, ix.advance()) {
        iy.seek(0);
        for (int y = 0; y < x; y++, iy.advance()) {
            scratch[pos] = dq.distance(ix, iy);
            // Ward uses variances -- i.e. squared values
            if (square) {
                scratch[pos] *= scratch[pos];
            }
            pos++;
        }
    }
    // Initialize space for result:
    WritableDBIDDataStore parent = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableDoubleDataStore height = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    // Initially every object is its own parent, has infinite merge height
    // (the marker for "not merged yet") and forms a cluster of size 1.
    for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
        parent.put(it, it);
        height.put(it, Double.POSITIVE_INFINITY);
        csize.put(it, 1);
    }
    // Perform size - 1 merges, one per iteration:
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
    for (int i = 1; i < size; i++) {
        // Find the pair (minx, miny), miny < minx, with the smallest distance
        // among the objects that have not been merged away yet.
        double min = Double.POSITIVE_INFINITY;
        int minx = -1, miny = -1;
        for (ix.seek(0); ix.valid(); ix.advance()) {
            // A finite height means this object was already merged into another.
            if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int xbase = triangleSize(ix.getOffset());
            for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
                if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
                    continue;
                }
                final int idx = xbase + iy.getOffset();
                // "<=": infinite entries are accepted too, and on ties the pair
                // scanned last wins.
                if (scratch[idx] <= min) {
                    min = scratch[idx];
                    minx = ix.getOffset();
                    miny = iy.getOffset();
                }
            }
        }
        assert (minx >= 0 && miny >= 0);
        // Avoid allocating memory, by reusing existing iterators:
        ix.seek(minx);
        iy.seek(miny);
        // Perform merge in data structure: x -> y
        // Since y < x, prefer keeping y, dropping x.
        int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
        // NOTE(review): if min is +infinity here, x keeps an infinite height and
        // is still treated as unmerged by the checks above - confirm whether
        // that case can occur in practice.
        height.put(ix, min);
        parent.put(ix, iy);
        csize.put(iy, sizex + sizey);
        // Update distance matrix. Note: miny < minx
        // The merged cluster keeps y's slots; x's row/column is only read.
        final int xbase = triangleSize(minx), ybase = triangleSize(miny);
        // Write to (y, j), with j < y
        for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
            if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int sizej = csize.intValue(ij);
            scratch[ybase + ij.getOffset()] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, min);
        }
        // Write to (j, y), with y < j < x
        // Here d(x, j) lives in x's row, but d(j, y) lives in j's row.
        for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
            if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(ij.getOffset());
            final int sizej = csize.intValue(ij);
            scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min);
        }
        // Write to (j, y), with y < x < j
        // Both d(j, x) and d(j, y) live in j's row.
        for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
            if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(ij.getOffset());
            final int sizej = csize.intValue(ij);
            scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // The last argument passes on whether the distance function is squared.
    return new PointerHierarchyRepresentationResult(ids, parent, height, dq.getDistanceFunction().isSquared());
}
Also used : PointerHierarchyRepresentationResult(de.lmu.ifi.dbs.elki.algorithm.clustering.hierarchical.PointerHierarchyRepresentationResult) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 67 with AbortException

use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

the class StratifiedCrossValidation method initialize.

/**
 * Prepares the stratified fold assignment for the given bundle.
 *
 * Every instance is placed into a bucket according to its class label; the
 * members of each bucket are then dealt out to the folds round-robin, so that
 * each class is spread across the folds.
 *
 * @param bundle Data to partition
 * @throws AbortException if an instance has no label, or its label is not
 *         contained in the label list
 */
@Override
public void initialize(MultipleObjectsBundle bundle) {
    super.initialize(bundle);
    fold = 0;
    // One bucket of instance indexes per known class label.
    IntArrayList[] perClass = new IntArrayList[this.labels.size()];
    int c = 0;
    while (c < this.labels.size()) {
        perClass[c++] = new IntArrayList();
    }
    final int total = bundle.dataLength();
    for (int row = 0; row < total; ++row) {
        final ClassLabel rowLabel = (ClassLabel) bundle.data(row, labelcol);
        if (rowLabel == null) {
            throw new AbortException("Unlabeled instances currently not supported.");
        }
        // Position of the label in the label list; negative if absent.
        final int bucketIndex = Collections.binarySearch(labels, rowLabel);
        if (bucketIndex < 0) {
            throw new AbortException("Label not in label list: " + rowLabel);
        }
        perClass[bucketIndex].add(row);
    }
    // TODO: shuffle the class buckets?
    // NOTE(review): sizes is allocated but not filled in this method - confirm
    // whether it is populated (or needed) elsewhere.
    sizes = new int[nfold];
    assignment = new int[bundle.dataLength()];
    // Deal the members of each class out to the folds in turn.
    for (IntArrayList members : perClass) {
        int rank = 0;
        while (rank < members.size()) {
            assignment[members.getInt(rank)] = rank % nfold;
            rank++;
        }
    }
}
Also used : ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 68 with AbortException

use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

the class BestOfMultipleKMeans method run.

/**
 * Runs the inner k-means algorithm {@code trials} times and returns the
 * candidate clustering that the quality measure rates best.
 *
 * @param database Database handed to the inner k-means algorithm
 * @param relation Relation to cluster
 * @return Best clustering found; {@code null} if no trial was run or the
 *         quality measure never accepted a candidate as better
 * @throws AbortException if the distance function of the inner algorithm is
 *         not a PrimitiveDistanceFunction, or not a
 *         NumberVectorDistanceFunction (which the quality measure requires)
 */
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
    // Typed as Object on purpose: it is only used for type checks and the cast.
    final Object distFunc = innerkMeans.getDistanceFunction();
    if (!(distFunc instanceof PrimitiveDistanceFunction)) {
        throw new AbortException("K-Means results can only be evaluated for primitive distance functions, got: " + (distFunc != null ? distFunc.getClass() : null));
    }
    // The cast below needs a NumberVectorDistanceFunction. Check explicitly, so
    // that an unsupported function is reported as an AbortException instead of
    // surfacing as a ClassCastException.
    if (!(distFunc instanceof NumberVectorDistanceFunction)) {
        throw new AbortException("K-Means results can only be evaluated for number vector distance functions, got: " + distFunc.getClass());
    }
    // Unchecked only with respect to the type parameter; the raw type was
    // verified above.
    @SuppressWarnings("unchecked") final NumberVectorDistanceFunction<? super NumberVector> df = (NumberVectorDistanceFunction<? super NumberVector>) distFunc;
    Clustering<M> bestResult = null;
    // NaN marks "no candidate yet"; presumably isBetter treats any cost as
    // better than NaN - verify against the quality measure implementation.
    double bestCost = Double.NaN;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null;
    for (int i = 0; i < trials; i++) {
        Clustering<M> currentCandidate = innerkMeans.run(database, relation);
        double currentCost = qualityMeasure.quality(currentCandidate, df, relation);
        if (LOG.isVerbose()) {
            LOG.verbose("Cost of candidate " + i + ": " + currentCost);
        }
        if (qualityMeasure.isBetter(currentCost, bestCost)) {
            bestResult = currentCandidate;
            bestCost = currentCost;
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    return bestResult;
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) NumberVectorDistanceFunction(de.lmu.ifi.dbs.elki.distance.distancefunction.NumberVectorDistanceFunction) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) PrimitiveDistanceFunction(de.lmu.ifi.dbs.elki.distance.distancefunction.PrimitiveDistanceFunction) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 69 with AbortException

use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

the class CLARANS method run.

/**
 * Run CLARANS clustering: randomized search for k medoids.
 *
 * Performs {@code numlocal} restarts. Each restart begins with a random
 * sample of k medoids and then repeatedly tries to swap a random medoid with
 * a random non-medoid, keeping the swap whenever it lowers the total cost.
 * A restart ends once the retry limit of consecutive non-improving attempts
 * is reached. The best assignment over all restarts is returned.
 *
 * @param database Database, used to obtain the distance query
 * @param relation Relation to cluster
 * @return Clustering with one top-level cluster per medoid; an empty
 *         clustering if the relation is empty
 * @throws AbortException if no non-medoid candidate could be sampled within
 *         the attempt limit
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    // Nothing to cluster: return an empty result.
    if (relation.size() <= 0) {
        return new Clustering<>("CLARANS Clustering", "clarans-clustering");
    }
    if (k * 2 >= relation.size()) {
        // Random sampling of non-medoids will be slow for huge k
        LOG.warning("A very large k was chosen. This implementation is not optimized for this case.");
    }
    DBIDs ids = relation.getDBIDs();
    DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
    final boolean metric = getDistanceFunction().isMetric();
    // Number of retries, relative rate, or absolute count:
    // values below 1 are scaled by the data set size, others are used as is.
    final int retries = (int) Math.ceil(maxneighbor < 1 ? maxneighbor * ids.size() : maxneighbor);
    Random rnd = random.getSingleThreadedRandom();
    // Might copy!
    DBIDArrayIter cand = DBIDUtil.ensureArray(ids).iter();
    // Setup cluster assignment store
    // Three buffers that are swapped by reference: the best solution so far,
    // the current solution, and a scratch buffer for the candidate swap.
    Assignment best = new Assignment(distQ, ids, DBIDUtil.newArray(k));
    Assignment curr = new Assignment(distQ, ids, DBIDUtil.newArray(k));
    Assignment scratch = new Assignment(distQ, ids, DBIDUtil.newArray(k));
    // 1. initialize
    double bestscore = Double.POSITIVE_INFINITY;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("CLARANS sampling restarts", numlocal, LOG) : null;
    for (int i = 0; i < numlocal; i++) {
        // 2. choose random initial medoids
        // TODO: should we always use uniform sampling, to be closer to the paper?
        curr.medoids.clear();
        curr.medoids.addDBIDs(DBIDUtil.randomSample(ids, k, random));
        // Cost of initial solution:
        double total = curr.assignToNearestCluster();
        // 3. Set j to 1.
        // j counts attempts since the last improvement; it is reset on success.
        int j = 1;
        step: while (j < retries) {
            // 4 part a. choose a random non-medoid (~ neighbor in G):
            for (int r = 0; ; r++) {
                // Random point
                cand.seek(rnd.nextInt(ids.size()));
                // presumably curr.nearest holds the distance to the nearest
                // medoid, so a positive value rules out a medoid - confirm.
                if (curr.nearest.doubleValue(cand) > 0) {
                    // Good: not a medoid.
                    break;
                }
                // We may have many duplicate points
                if (metric && curr.second.doubleValue(cand) == 0) {
                    // Cannot yield an improvement if we are metric.
                    // Counts as a failed attempt; restart the outer while loop.
                    ++j;
                    continue step;
                } else if (!metric && !curr.medoids.contains(cand)) {
                    // Probably not a good candidate, but try nevertheless
                    break;
                }
                // Give up instead of looping forever when sampling keeps failing.
                if (r >= 1000) {
                    throw new AbortException("Failed to choose a non-medoid in 1000 attempts. Choose k << N.");
                }
            // else: this must be the medoid.
            // Draw another random point.
            }
            // 4 part b. choose a random medoid to replace:
            final int otherm = rnd.nextInt(k);
            // 5. check lower cost
            // presumably the swapped solution is written into scratch - confirm.
            double cost = curr.computeCostDifferential(cand, otherm, scratch);
            // Written as !(cost < 0) so that a NaN cost is also rejected.
            if (!(cost < 0)) {
                // 6. try again
                ++j;
                continue;
            }
            // cost is negative!
            total += cost;
            // Swap:
            // The scratch buffer becomes the current solution.
            Assignment tmp = curr;
            curr = scratch;
            scratch = tmp;
            j = 1;
        }
        // New best:
        if (total < bestscore) {
            // Swap:
            // Keep the current solution as best; the old best buffer is reused.
            Assignment tmp = curr;
            curr = best;
            best = tmp;
            bestscore = total;
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Split the ids into k partitions according to the best assignment.
    ArrayModifiableDBIDs[] clusters = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, best.assignment, k);
    // Wrap result
    Clustering<MedoidModel> result = new Clustering<>("CLARANS Clustering", "clarans-clustering");
    // The partition index matches the medoid's position in best.medoids.
    for (DBIDArrayIter it = best.medoids.iter(); it.valid(); it.advance()) {
        MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
        result.addToplevelCluster(new Cluster<>(clusters[it.getOffset()], model));
    }
    return result;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) Random(java.util.Random) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 70 with AbortException

use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

the class PAMInitialMeans method chooseInitialMedoids.

/**
 * Chooses k initial medoids greedily.
 *
 * The first medoid is the object with the smallest sum of distances to all
 * objects. Each further medoid is the non-medoid object that minimizes the
 * sum, over all objects, of the smaller of "distance to this object" and
 * "distance to the closest medoid chosen so far".
 *
 * @param k Number of medoids to choose
 * @param ids Candidate objects
 * @param distQ Distance query
 * @return The chosen medoids
 * @throws AbortException if no further medoid with a finite criterion value
 *         can be found
 */
@Override
public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
    final ArrayModifiableDBIDs chosen = DBIDUtil.newArray(k);
    final DBIDVar winner = DBIDUtil.newVar();
    // Three temporary distance buffers, rotated by reference below:
    // mindist = distance to the closest chosen medoid,
    // bestd = values of the best candidate in the current round,
    // tempd = values of the candidate currently being evaluated.
    final int hints = DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP;
    WritableDoubleDataStore mindist = DataStoreUtil.makeDoubleStorage(ids, hints);
    WritableDoubleDataStore bestd = DataStoreUtil.makeDoubleStorage(ids, hints);
    WritableDoubleDataStore tempd = DataStoreUtil.makeDoubleStorage(ids, hints);

    // Round one: the object with the smallest distance sum to all others.
    double lowestTotal = Double.POSITIVE_INFINITY;
    final FiniteProgress firstProg = LOG.isVerbose() ? new FiniteProgress("Choosing initial mean", ids.size(), LOG) : null;
    for (DBIDIter candidate = ids.iter(); candidate.valid(); candidate.advance()) {
        double total = 0;
        for (DBIDIter other = ids.iter(); other.valid(); other.advance()) {
            final double dist = distQ.distance(candidate, other);
            total += dist;
            tempd.putDouble(other, dist);
        }
        if (total < lowestTotal) {
            lowestTotal = total;
            winner.set(candidate);
            // Keep this candidate's distances as mindist; recycle the old buffer.
            final WritableDoubleDataStore recycled = mindist;
            mindist = tempd;
            tempd = recycled;
        }
        LOG.incrementProcessed(firstProg);
    }
    LOG.ensureCompleted(firstProg);
    chosen.add(winner);
    assert (mindist != null);

    // Remaining rounds: optimize the full criterion.
    final FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Choosing initial centers", k, LOG) : null;
    // Account for the medoid chosen in round one.
    LOG.incrementProcessed(prog);
    int numChosen = 1;
    while (numChosen < k) {
        double lowest = Double.POSITIVE_INFINITY;
        winner.unset();
        for (DBIDIter candidate = ids.iter(); candidate.valid(); candidate.advance()) {
            // Existing medoids are not candidates.
            if (!chosen.contains(candidate)) {
                double total = 0.;
                for (DBIDIter other = ids.iter(); other.valid(); other.advance()) {
                    final double capped = MathUtil.min(distQ.distance(candidate, other), mindist.doubleValue(other));
                    total += capped;
                    tempd.put(other, capped);
                }
                if (total < lowest) {
                    lowest = total;
                    winner.set(candidate);
                    // Keep this candidate's values as bestd; recycle the old buffer.
                    final WritableDoubleDataStore recycled = bestd;
                    bestd = tempd;
                    tempd = recycled;
                }
            }
        }
        if (!winner.isSet()) {
            throw new AbortException("No median found that improves the criterion function?!? Too many infinite distances.");
        }
        chosen.add(winner);
        // The winner's values become the new minimum distances.
        final WritableDoubleDataStore previous = bestd;
        bestd = mindist;
        mindist = previous;
        LOG.incrementProcessed(prog);
        numChosen++;
    }
    LOG.ensureCompleted(prog);
    // Release the temporary buffers.
    mindist.destroy();
    bestd.destroy();
    tempd.destroy();
    return chosen;
}
Also used : DBIDVar(de.lmu.ifi.dbs.elki.database.ids.DBIDVar) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Aggregations

AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)99 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)25 IOException (java.io.IOException)24 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)22 ArrayList (java.util.ArrayList)16 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)13 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)13 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)10 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)9 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)9 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)8 Model (de.lmu.ifi.dbs.elki.data.model.Model)8 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)8 Database (de.lmu.ifi.dbs.elki.database.Database)8 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)8 DBIDRange (de.lmu.ifi.dbs.elki.database.ids.DBIDRange)8 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)8 MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)6 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)5 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)5