Search in sources :

Example 36 with ArrayModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs in project elki by elki-project.

In the class SameSizeKMeansAlgorithm, method refineResult.

/**
 * Perform k-means style iterations to improve the clustering result.
 * <p>
 * Each iteration refreshes the per-object distances, sorts the objects with a
 * priority comparator, and then tries to move every object to a cluster it
 * prefers. A move happens either as a pairwise swap with an object queued in
 * the destination's transfer list (when the combined gain is positive), or as
 * a single move when the size bounds {@code tids.size() / k} (rounded down)
 * and {@code tids.size() / k} (rounded up) permit it. Iteration stops when no
 * transfer was performed, or after {@code maxiter} iterations if
 * {@code maxiter} is positive.
 *
 * @param relation Data relation
 * @param means Means list
 * @param clusters Cluster list; handed to {@code transfer}, which presumably
 *        moves objects between these sets in place (TODO confirm)
 * @param metas Metadata storage
 * @param tids DBIDs array; re-sorted at the start of every iteration
 * @return final means
 */
protected double[][] refineResult(Relation<V> relation, double[][] means, List<ModifiableDBIDs> clusters, final WritableDataStore<Meta> metas, ArrayModifiableDBIDs tids) {
    NumberVectorDistanceFunction<? super V> df = getDistanceFunction();
    // Desired cluster size bounds:
    // lower bound, n / k rounded down
    final int minsize = tids.size() / k;
    // upper bound, n / k rounded up
    final int maxsize = (tids.size() + k - 1) / k;
    // Comparator: orders objects by ascending Meta.priority().
    // NOTE(review): the original comment says "sort by largest gain by
    // transfer"; confirm that the sign of priority() yields that order.
    final Comparator<DBIDRef> comp = new Comparator<DBIDRef>() {

        @Override
        public int compare(DBIDRef o1, DBIDRef o2) {
            Meta c1 = metas.get(o1), c2 = metas.get(o2);
            return Double.compare(c1.priority(), c2.priority());
        }
    };
    // Cluster indexes, re-sorted per object into its order of preference
    // (presumably 0 .. k-1 initially; verify MathUtil.sequence bounds).
    final int[] preferences = MathUtil.sequence(0, k);
    // Comparator for this list; select(c) binds it to one object's metadata.
    final PreferenceComparator pcomp = new PreferenceComparator();
    // Initialize transfer lists: transfers[i] collects objects currently
    // assigned to cluster i that would rather be elsewhere.
    ArrayModifiableDBIDs[] transfers = new ArrayModifiableDBIDs[k];
    for (int i = 0; i < k; i++) {
        transfers[i] = DBIDUtil.newArray();
    }
    DBIDArrayIter id = tids.iter();
    // maxiter <= 0 means: no iteration limit, run until nothing changes.
    for (int iter = 0; maxiter <= 0 || iter < maxiter; iter++) {
        updateDistances(relation, means, metas, df);
        tids.sort(comp);
        // Number of objects moved in this iteration (0 = converged)
        int active = 0;
        for (id.seek(0); id.valid(); id.advance()) {
            Meta c = metas.get(id);
            // Order the cluster indexes by this object's preference.
            IntegerArrayQuickSort.sort(preferences, pcomp.select(c));
            ModifiableDBIDs source = clusters.get(c.primary);
            assert (source.contains(id));
            tloop: for (int i : preferences) {
                if (i == c.primary) {
                    // Already assigned here
                    continue;
                }
                ModifiableDBIDs dest = clusters.get(i);
                // Can we pair this transfer? Look for an object waiting to
                // leave cluster i such that swapping both is a net gain.
                final double gain = c.gain(i);
                for (DBIDMIter other = transfers[i].iter(); other.valid(); other.advance()) {
                    Meta c2 = metas.get(other);
                    if (gain + c2.gain(c.primary) > 0) {
                        // Swap: "other" goes to our cluster, we go to cluster i.
                        // The first call must run before the second, as it
                        // still reads c.primary as the old assignment
                        // (presumably transfer() updates it; TODO confirm).
                        transfer(metas, c2, dest, source, other, c.primary);
                        transfer(metas, c, source, dest, id, i);
                        active += 2;
                        // last, as this invalidates the reference!
                        other.remove();
                        // We are assigned here now.
                        source = dest;
                        // Can try another transfer, with next cluster.
                        continue tloop;
                    }
                }
                // If cluster sizes allow, move a single object: destination
                // must stay within maxsize, source must not drop below minsize.
                if (gain > 0 && (dest.size() < maxsize && source.size() > minsize)) {
                    transfer(metas, c, source, dest, id, i);
                    active += 1;
                    // We are assigned here now.
                    source = dest;
                    continue tloop;
                }
            }
            // Still not in the most preferred cluster, and strictly farther
            // from the current mean than from the preferred one: queue the
            // object in its current cluster's transfer list, so that later
            // objects can swap with it.
            if (c.primary != preferences[0] && c.dists[c.primary] > c.dists[preferences[0]]) {
                transfers[c.primary].add(id);
            }
        }
        // TODO: try to get more transfers out of the transfer lists done by
        // considering more than one object?
        // Count the queued objects that found no partner (for logging only).
        int pending = 0;
        // Clear transfer lists for next iteration.
        for (int i = 0; i < k; i++) {
            pending += transfers[i].size();
            transfers[i].clear();
        }
        if (LOG.isDebuggingFine()) {
            LOG.debugFine("Iteration #" + iter + ": performed " + active + " transfers skipped " + pending);
        }
        // Converged: no object changed its cluster in this iteration.
        if (active <= 0) {
            break;
        }
        // Recompute means after reassignment
        means = means(clusters, means, relation);
    }
    return means;
}
Also used : DBIDMIter(de.lmu.ifi.dbs.elki.database.ids.DBIDMIter) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) IntegerComparator(de.lmu.ifi.dbs.elki.utilities.datastructures.arrays.IntegerComparator) Comparator(java.util.Comparator) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDRef(de.lmu.ifi.dbs.elki.database.ids.DBIDRef) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 37 with ArrayModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs in project elki by elki-project.

In the class SameSizeKMeansAlgorithm, method run.

/**
 * Cluster the relation with k-means under cluster size constraints.
 * <p>
 * Steps: choose initial means, create {@code k} empty cluster sets, build the
 * per-object metadata, perform the initial assignment, recompute the means,
 * refine them iteratively, and wrap every cluster together with its mean
 * into the result.
 *
 * @param database Database
 * @param relation relation to use
 * @return result
 */
@Override
public Clustering<MeanModel> run(Database database, Relation<V> relation) {
    final DBIDs ids = relation.getDBIDs();
    final double[][] initial = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());

    // One hash set per cluster, presized slightly above the average size.
    final int capacity = relation.size() / k + 2;
    final List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int c = 0; c < k; c++) {
        clusters.add(DBIDUtil.newHashSet(capacity));
    }

    // Metadata, then the initial assignment of objects to clusters.
    final WritableDataStore<Meta> metas = initializeMeta(relation, initial);
    final ArrayModifiableDBIDs tids = initialAssignment(clusters, metas, ids);

    // Means after the initial assignment, then after iterative refinement.
    final double[][] reassigned = means(clusters, initial, relation);
    final double[][] refined = refineResult(relation, reassigned, clusters, metas, tids);

    // Pair every cluster with its mean in the result object.
    final Clustering<MeanModel> result = new Clustering<>("k-Means Samesize Clustering", "kmeans-samesize-clustering");
    int idx = 0;
    for (ModifiableDBIDs members : clusters) {
        result.addToplevelCluster(new Cluster<>(members, new MeanModel(refined[idx])));
        idx++;
    }
    return result;
}
Also used : ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) MeanModel(de.lmu.ifi.dbs.elki.data.model.MeanModel) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 38 with ArrayModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs in project elki by elki-project.

In the class AbstractBiclustering, method rowsBitsetToIDs.

/**
 * Collect the row ids whose bit is set in the given bitset.
 * <p>
 * The bitset is read word by word, least significant bit first, in step with
 * an iterator over {@code this.rowIDs}; the id at each set bit position is
 * added to the result. Reading stops once the iterator is exhausted.
 *
 * @param rows bitset of selected rows, as an array of 64-bit words
 * @return ids of the selected rows
 */
protected ArrayDBIDs rowsBitsetToIDs(long[] rows) {
    final ArrayModifiableDBIDs selected = DBIDUtil.newArray(BitsUtil.cardinality(rows));
    final DBIDArrayIter cursor = this.rowIDs.iter();
    for (long word : rows) {
        if (word == 0L) {
            // Empty word: skip its 64 positions at once.
            cursor.advance(Long.SIZE);
            continue;
        }
        for (int bit = 0; bit < Long.SIZE; bit++) {
            if (!cursor.valid()) {
                // Ran past the last row id; nothing more to collect.
                return selected;
            }
            if ((word & 1L) != 0L) {
                selected.add(cursor);
            }
            word >>>= 1;
            cursor.advance();
        }
    }
    return selected;
}
Also used : ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)

Example 39 with ArrayModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs in project elki by elki-project.

In the class NaiveMeanShiftClustering, method run.

/**
 * Run the mean-shift clustering algorithm.
 * <p>
 * Starting from each object, the position is repeatedly replaced by the
 * kernel-weighted centroid of its neighbors within {@code bandwidth}. The
 * object joins the nearest known cluster once the shifted position is close
 * enough to that cluster's center; it founds a new cluster when the shift
 * falls below the threshold or {@code MAXITER} iterations are used up.
 * Objects without sufficient neighbors are collected as noise.
 *
 * @param database Database
 * @param relation Data relation
 * @return Clustering result
 */
public Clustering<MeanModel> run(Database database, Relation<V> relation) {
    final DistanceQuery<V> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction());
    final RangeQuery<V> rangeQuery = database.getRangeQuery(distanceQuery);
    final NumberVector.Factory<V> factory = RelationUtil.getNumberVectorFactory(relation);
    final int dim = RelationUtil.dimensionality(relation);
    // Shifts smaller than this count as converged.
    final double threshold = bandwidth * 1E-10;
    // Known clusters (center, members) and the noise objects.
    final ArrayList<Pair<V, ModifiableDBIDs>> clusters = new ArrayList<>();
    final ModifiableDBIDs noise = DBIDUtil.newArray();
    final FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Mean-shift clustering", relation.size(), LOG) : null;
    for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
        // Start at the object itself.
        V current = relation.get(iter);
        for (int round = 1; round <= MAXITER; round++) {
            final DoubleDBIDList neighbors = rangeQuery.getRangeForObject(current, bandwidth);
            final int count = neighbors.size();
            // Too few neighbors: more than one is always required in the
            // first round, at least one in later rounds.
            if (count <= 1 && (count < 1 || round <= 1)) {
                noise.add(iter);
                break;
            }
            // Shift to the kernel-weighted centroid of the neighborhood.
            final Centroid centroid = new Centroid(dim);
            for (DoubleDBIDListIter n = neighbors.iter(); n.valid(); n.advance()) {
                final double w = kernel.density(n.doubleValue() / bandwidth);
                centroid.put(relation.get(n), w);
            }
            // TODO: detect 0 weight!
            final V shifted = factory.newNumberVector(centroid.getArrayRef());

            // Find the nearest known cluster center.
            Pair<V, ModifiableDBIDs> nearest = null;
            double nearestDist = Double.POSITIVE_INFINITY;
            for (Pair<V, ModifiableDBIDs> candidate : clusters) {
                final double d = distanceQuery.distance(shifted, candidate.first);
                if (d < nearestDist) {
                    nearestDist = d;
                    nearest = candidate;
                }
            }

            // Length of the current shift.
            final double delta = distanceQuery.distance(current, shifted);
            // Close enough to an existing cluster: join it.
            if (nearestDist < 10 * threshold || nearestDist * 2 < delta) {
                nearest.second.add(iter);
                break;
            }
            final boolean lastRound = (round == MAXITER);
            if (lastRound) {
                LOG.warning("No convergence after " + MAXITER + " iterations. Distance: " + delta);
            }
            if (Double.isNaN(delta)) {
                LOG.warning("Encountered NaN distance. Invalid center vector? " + shifted.toString());
                break;
            }
            // Converged (or out of rounds): this position starts a new cluster.
            if (lastRound || delta < threshold) {
                if (LOG.isDebuggingFine()) {
                    LOG.debugFine("New cluster:" + shifted + " delta: " + delta + " threshold: " + threshold + " bestd: " + nearestDist);
                }
                final ArrayModifiableDBIDs members = DBIDUtil.newArray();
                members.add(iter);
                clusters.add(new Pair<V, ModifiableDBIDs>(shifted, members));
                break;
            }
            current = shifted;
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);

    // Wrap the clusters, plus the noise cluster if there is any noise.
    final ArrayList<Cluster<MeanModel>> wrapped = new ArrayList<>(clusters.size());
    for (Pair<V, ModifiableDBIDs> entry : clusters) {
        final MeanModel model = new MeanModel(entry.first.toArray());
        wrapped.add(new Cluster<>(entry.second, model));
    }
    if (noise.size() > 0) {
        wrapped.add(new Cluster<MeanModel>(noise, true));
    }
    return new Clustering<>("Mean-shift Clustering", "mean-shift-clustering", wrapped);
}
Also used : ArrayList(java.util.ArrayList) MeanModel(de.lmu.ifi.dbs.elki.data.model.MeanModel) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList) Pair(de.lmu.ifi.dbs.elki.utilities.pairs.Pair) DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 40 with ArrayModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs in project elki by elki-project.

In the class PROCLUS, method greedy.

/**
 * Returns a piercing set of medoids from the specified sample set.
 * <p>
 * The first medoid is drawn at random. Every further medoid is the remaining
 * sample object whose distance to its closest already chosen medoid is
 * largest. If the sample set contains fewer than {@code m} objects (or is
 * empty), all available objects are returned instead of failing.
 *
 * @param distFunc the distance function
 * @param sampleSet the sample set
 * @param m the number of medoids to be returned
 * @param random random number generator
 * @return a piercing set of at most m medoids from the specified sample set
 */
private ArrayDBIDs greedy(DistanceQuery<V> distFunc, DBIDs sampleSet, int m, Random random) {
    ArrayModifiableDBIDs medoids = DBIDUtil.newArray(m);
    ArrayModifiableDBIDs s = DBIDUtil.newArray(sampleSet);
    // Candidates occupy offsets [0, size) of s.
    int size = s.size();
    if (size == 0) {
        // Nothing to choose from; random.nextInt(0) below would throw.
        return medoids;
    }
    DBIDArrayIter iter = s.iter();
    DBIDVar latest = DBIDUtil.newVar();
    // Move a random element to the end, then pop()
    s.swap(random.nextInt(size), --size);
    medoids.add(s.pop(latest));
    // Guard at the same level the message is logged at, so the string is
    // only built when it will actually be emitted.
    if (LOG.isDebuggingFiner()) {
        LOG.debugFiner("medoids " + medoids.toString());
    }
    // To track the current worst element:
    int worst = -1;
    double worstd = Double.NEGATIVE_INFINITY;
    // compute distances between each point in S and the first medoid
    WritableDoubleDataStore distances = DataStoreUtil.makeDoubleStorage(s, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    for (iter.seek(0); iter.getOffset() < size; iter.advance()) {
        final double dist = distFunc.distance(iter, latest);
        distances.putDouble(iter, dist);
        if (dist > worstd) {
            worstd = dist;
            worst = iter.getOffset();
        }
    }
    for (int i = 1; i < m; i++) {
        if (worst < 0) {
            // No candidate left (sample smaller than m), or no candidate with
            // a comparable distance: stop rather than swap an invalid offset.
            break;
        }
        // choose the next medoid to be far from previous medoids
        s.swap(worst, --size);
        medoids.add(s.pop(latest));
        // compute distances of each point to closest medoid; track worst.
        worst = -1;
        worstd = Double.NEGATIVE_INFINITY;
        for (iter.seek(0); iter.getOffset() < size; iter.advance()) {
            double dist_new = distFunc.distance(iter, latest);
            double dist_old = distances.doubleValue(iter);
            // Keep the ternary (not Math.min): a NaN dist_new must not
            // replace a valid stored distance.
            double dist = (dist_new < dist_old) ? dist_new : dist_old;
            distances.putDouble(iter, dist);
            if (dist > worstd) {
                worstd = dist;
                worst = iter.getOffset();
            }
        }
        if (LOG.isDebuggingFiner()) {
            LOG.debugFiner("medoids " + medoids.toString());
        }
    }
    return medoids;
}
Also used : DBIDVar(de.lmu.ifi.dbs.elki.database.ids.DBIDVar) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)

Aggregations

ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs): 49, DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 23, DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter): 12, ArrayList (java.util.ArrayList): 11, DBIDVar (de.lmu.ifi.dbs.elki.database.ids.DBIDVar): 10, DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 9, FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 8, ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 7, WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore): 6, ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 6, Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 5, DBIDRef (de.lmu.ifi.dbs.elki.database.ids.DBIDRef): 5, Relation (de.lmu.ifi.dbs.elki.database.relation.Relation): 5, NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 4, SortDBIDsBySingleDimension (de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension): 4, DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter): 4, KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList): 4, ModifiableDoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList): 4, SetDBIDs (de.lmu.ifi.dbs.elki.database.ids.SetDBIDs): 4, Pair (de.lmu.ifi.dbs.elki.utilities.pairs.Pair): 4