Search in sources :

Example 6 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class OPTICSCut method makeOPTICSCut.

/**
 * Cut an OPTICS cluster order at a fixed reachability threshold and turn it
 * into a flat partitioning.
 * <p>
 * Consecutive objects whose reachability is at most {@code epsilon} form one
 * cluster. The object directly in front of such a run (if there is one) is
 * moved from noise into that cluster. All remaining objects end up in one
 * final noise cluster, which is always added, even when empty.
 *
 * @param co Cluster order result
 * @param epsilon Epsilon value for cut
 * @return New partitioning clustering
 */
public static <E extends ClusterOrder> Clustering<Model> makeOPTICSCut(E co, double epsilon) {
    Clustering<Model> result = new Clustering<>("OPTICS Cut Clustering", "optics-cut");
    // Objects that are not (yet) assigned to any cluster.
    ModifiableDBIDs noiseIds = DBIDUtil.newHashSet();
    // Members of the cluster that is currently being collected.
    ModifiableDBIDs members = DBIDUtil.newHashSet();
    // Object visited in the previous step; unset during the first step.
    DBIDVar previous = DBIDUtil.newVar();
    double previousReach = Double.MAX_VALUE;
    DBIDIter iter = co.iter();
    while (iter.valid()) {
        final double reach = co.getReachability(iter);
        if (reach <= epsilon) {
            // The plot just dropped below the threshold: the predecessor
            // starts this cluster, so take it back out of the noise set.
            final boolean plotJustDropped = previousReach > epsilon && previous.isSet();
            if (plotJustDropped) {
                noiseIds.remove(previous);
                members.add(previous);
            }
            members.add(iter);
        } else {
            // Reachability above the threshold closes any open cluster.
            if (!members.isEmpty()) {
                result.addToplevelCluster(new Cluster<Model>(members, ClusterModel.CLUSTER));
                members = DBIDUtil.newHashSet();
            }
            // Treated as noise for now; may be reclaimed in the next step.
            noiseIds.add(iter);
        }
        // Remember this step for the one-element look-back.
        previousReach = reach;
        previous.set(iter);
        iter.advance();
    }
    // Flush a cluster that was still open when the order ended.
    if (!members.isEmpty()) {
        result.addToplevelCluster(new Cluster<Model>(members, ClusterModel.CLUSTER));
    }
    // The noise cluster is added last.
    result.addToplevelCluster(new Cluster<Model>(noiseIds, true, ClusterModel.CLUSTER));
    return result;
}
Also used : DBIDVar(de.lmu.ifi.dbs.elki.database.ids.DBIDVar) Model(de.lmu.ifi.dbs.elki.data.model.Model) ClusterModel(de.lmu.ifi.dbs.elki.data.model.ClusterModel) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 7 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class NaiveAgglomerativeHierarchicalClustering2 method run.

/**
 * Run the algorithm: naive single-linkage agglomerative clustering on a
 * lower triangular distance matrix, merging until {@code numclusters}
 * clusters remain.
 *
 * @param db Database
 * @param relation Relation
 * @return Flat clustering containing the clusters that remain after merging
 * @throws AbortException if the relation has more than 0x10000 objects
 */
public Result run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();
    if (size > 0x10000) {
        throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter();
    // Position counter - must agree with the triangleSize(x) + y indexing
    // used for all later accesses to scratch.
    int pos = 0;
    for (int x = 0; ix.valid(); x++, ix.advance()) {
        iy.seek(0);
        for (int y = 0; y < x; y++, iy.advance()) {
            scratch[pos] = dq.distance(ix, iy);
            pos++;
        }
    }
    // Merge height per object. Infinity means "still active"; a finite value
    // means the object was merged into another cluster at that distance.
    double[] height = new double[size];
    Arrays.fill(height, Double.POSITIVE_INFINITY);
    // Parent node, to track merges
    // have every object point to itself initially
    // (only written in this method, never read back)
    ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
    // Active clusters, when not trivial (singletons have no entry).
    Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
    // Repeat until everything merged, except the desired number of clusters:
    final int stop = size - numclusters;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
    for (int i = 0; i < stop; i++) {
        // Find the closest pair of still-active clusters.
        double min = Double.POSITIVE_INFINITY;
        int minx = -1, miny = -1;
        for (int x = 0; x < size; x++) {
            if (height[x] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int xbase = triangleSize(x);
            for (int y = 0; y < x; y++) {
                if (height[y] < Double.POSITIVE_INFINITY) {
                    continue;
                }
                final int idx = xbase + y;
                if (scratch[idx] < min) {
                    min = scratch[idx];
                    minx = x;
                    miny = y;
                }
            }
        }
        assert (minx >= 0 && miny >= 0);
        // Avoid allocating memory, by reusing existing iterators:
        ix.seek(minx);
        iy.seek(miny);
        // Perform merge in data structure: x -> y
        // Since y < x, prefer keeping y, dropping x.
        height[minx] = min;
        parent.set(minx, iy);
        // Merge into cluster
        ModifiableDBIDs cx = clusters.get(minx);
        ModifiableDBIDs cy = clusters.get(miny);
        if (cy == null) {
            // y was a singleton so far: materialize its member set.
            cy = DBIDUtil.newHashSet();
            cy.add(iy);
        }
        if (cx == null) {
            // x was a singleton: add just that object.
            cy.add(ix);
        } else {
            cy.addDBIDs(cx);
            clusters.remove(minx);
        }
        clusters.put(miny, cy);
        // Update distance matrix (single-link: keep the minimum).
        // Note: miny < minx
        final int xbase = triangleSize(minx), ybase = triangleSize(miny);
        // Write to (y, j), with j < y
        for (int j = 0; j < miny; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            scratch[ybase + j] = Math.min(scratch[xbase + j], scratch[ybase + j]);
        }
        // Write to (j, y), with y < j < x
        for (int j = miny + 1; j < minx; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[xbase + j], scratch[jbase + miny]);
        }
        // Write to (j, y), with y < x < j
        for (int j = minx + 1; j < size; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[jbase + minx], scratch[jbase + miny]);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Build the clustering result from the clusters that are still active.
    final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
    for (int x = 0; x < size; x++) {
        // A finite height marks an object that was merged into another
        // cluster; its members are already part of the surviving cluster.
        if (height[x] < Double.POSITIVE_INFINITY) {
            continue;
        }
        DBIDs cids = clusters.get(x);
        if (cids == null) {
            // Never merged with anything: emit as singleton cluster.
            ix.seek(x);
            cids = DBIDUtil.deref(ix);
        }
        Cluster<Model> cluster = new Cluster<>("Cluster", cids);
        dendrogram.addToplevelCluster(cluster);
    }
    return dendrogram;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) Model(de.lmu.ifi.dbs.elki.data.model.Model) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException) Int2ReferenceOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ReferenceOpenHashMap)

Example 8 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class XMeans method run.

/**
 * Run X-means on a database and relation.
 * <p>
 * Starts with an inner k-means run for {@code k_min} clusters, then
 * alternates between trying to split every cluster and re-running the inner
 * k-means on the split result. Stops once a round produces no additional
 * clusters or the cluster count exceeds {@code k_max}.
 *
 * @param database Database to process
 * @param relation Data relation
 * @return Clustering result.
 */
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
    final MutableProgress progress;
    if (LOG.isVerbose()) {
        progress = new MutableProgress("X-means number of clusters", k_max, LOG);
    } else {
        progress = null;
    }
    // Initial k-means run, to obtain the first k_min clusters.
    innerKMeans.setK(k_min);
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    splitInitializer.setInitialMeans(initializer.chooseInitialMeans(database, relation, k_min, getDistanceFunction()));
    Clustering<M> partition = innerKMeans.run(database, relation);
    if (progress != null) {
        progress.setProcessed(k_min, LOG);
    }
    ArrayList<Cluster<M>> current = new ArrayList<>(partition.getAllClusters());
    while (current.size() <= k_max) {
        // Improve-Structure: attempt to split each cluster.
        ArrayList<Cluster<M>> refined = new ArrayList<>();
        for (Cluster<M> candidate : current) {
            List<Cluster<M>> children = splitCluster(candidate, database, relation);
            refined.addAll(children);
            // Number of clusters gained by this split (0 if not split).
            final int gained = children.size() - 1;
            if (gained > 0) {
                k += gained;
                if (progress != null) {
                    // Grow the total so the progress never exceeds it.
                    if (k >= k_max) {
                        progress.setTotal(k + 1);
                    }
                    progress.setProcessed(k, LOG);
                }
            }
        }
        // Nothing was split in this round: converged.
        if (current.size() == refined.size()) {
            break;
        }
        // Improve-Params: rerun k-means, seeded with the split clusters.
        splitInitializer.setInitialClusters(refined);
        innerKMeans.setK(refined.size());
        partition = innerKMeans.run(database, relation);
        current.clear();
        current.addAll(partition.getAllClusters());
    }
    // Make sure the progress is reported as complete.
    if (progress != null) {
        progress.setTotal(k);
        progress.setProcessed(k, LOG);
    }
    if (LOG.isDebugging()) {
        LOG.debug("X-means returned k=" + k + " clusters.");
    }
    // Wrap the clusters of the last round into the result.
    return new Clustering<M>("X-Means Result", "X-Means", current);
}
Also used : StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) MutableProgress(de.lmu.ifi.dbs.elki.logging.progress.MutableProgress) ArrayList(java.util.ArrayList) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)

Example 9 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class ParallelLloydKMeans method run.

/**
 * Run k-means with the assignment step executed through
 * {@code ParallelExecutor}.
 * <p>
 * Iterates until no cluster assignment changed, or until {@code maxiter}
 * iterations were performed (a non-positive {@code maxiter} disables the
 * limit). Empty partitions are omitted from the result.
 *
 * @param database Database
 * @param relation Data relation
 * @return Clustering with one {@code KMeansModel} per non-empty cluster
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    DBIDs ids = relation.getDBIDs();
    // Initial cluster centers.
    double[][] centers = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Current cluster label per object; -1 is the default value.
    final int hints = DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT;
    WritableIntegerDataStore labels = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), hints, -1);
    double[] variances = new double[k];
    KMeansProcessor<V> processor = new KMeansProcessor<>(relation, distanceFunction, labels, variances);
    IndefiniteProgress progress = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    int round = 0;
    while (maxiter <= 0 || round < maxiter) {
        LOG.incrementProcessed(progress);
        processor.nextIteration(centers);
        ParallelExecutor.run(ids, processor);
        // Converged: no object switched clusters in this round.
        if (!processor.changed()) {
            break;
        }
        centers = processor.getMeans();
        round++;
    }
    LOG.setCompleted(progress);
    // Convert the integer labels into one cluster per non-empty partition.
    ArrayModifiableDBIDs[] partitions = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, labels, k);
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int c = 0; c < partitions.length; c++) {
        DBIDs memberIds = partitions[c];
        if (memberIds.size() != 0) {
            KMeansModel model = new KMeansModel(centers[c], variances[c]);
            result.addToplevelCluster(new Cluster<>(memberIds, model));
        }
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)

Example 10 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class BIRCHLeafClustering method run.

/**
 * Run the clustering algorithm: build a CF-tree over the relation and
 * report one cluster per leaf that received at least one point.
 *
 * @param relation Input data
 * @return Clustering
 */
public Clustering<MeanModel> run(Relation<NumberVector> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    CFTree tree = cffactory.newTree(relation.getDBIDs(), relation);
    // The CFTree does not store points, so every point is assigned again
    // here. This also gives better quality than the initial assignment would,
    // because centers move, in particular in the beginning, which would
    // leave many outliers.
    Map<ClusteringFeature, ModifiableDBIDs> members = new HashMap<>(tree.leaves);
    DBIDIter it = relation.iterDBIDs();
    while (it.valid()) {
        ClusteringFeature target = tree.findLeaf(relation.get(it));
        ModifiableDBIDs bucket = members.get(target);
        if (bucket == null) {
            // First point for this leaf: allocate with the leaf's point
            // count as initial size.
            bucket = DBIDUtil.newArray(target.n);
            members.put(target, bucket);
        }
        bucket.add(it);
        it.advance();
    }
    Clustering<MeanModel> result = new Clustering<>("BIRCH-leaves", "BIRCH leaves");
    for (Map.Entry<ClusteringFeature, ModifiableDBIDs> entry : members.entrySet()) {
        ClusteringFeature leaf = entry.getKey();
        // Copy the leaf centroid into a plain array for the model.
        double[] center = new double[dim];
        for (int d = 0; d < dim; d++) {
            center[d] = leaf.centroid(d);
        }
        MeanModel model = new MeanModel(center);
        result.addToplevelCluster(new Cluster<>(entry.getValue(), model));
    }
    return result;
}
Also used : HashMap(java.util.HashMap) MeanModel(de.lmu.ifi.dbs.elki.data.model.MeanModel) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Map(java.util.Map)

Aggregations

Clustering (de.lmu.ifi.dbs.elki.data.Clustering)68 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)32 ArrayList (java.util.ArrayList)27 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)23 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)21 Model (de.lmu.ifi.dbs.elki.data.model.Model)21 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)20 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)16 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)16 Database (de.lmu.ifi.dbs.elki.database.Database)14 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)14 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)14 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)13 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)12 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)12 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)9 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)8 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)6 MedoidModel (de.lmu.ifi.dbs.elki.data.model.MedoidModel)5