
Example 1 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class NaiveAgglomerativeHierarchicalClustering2, method run:

/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public Result run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();
    if (size > 0x10000) {
        throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
    }
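    // Memory arithmetic behind the check above: the packed lower-triangular matrix for
    // 0x10000 = 65536 points holds 65536 * 65535 / 2 = ~2.1e9 doubles (about 17 GB), and
    // with even one more point the packed index x*(x-1)/2 would exceed Integer.MAX_VALUE.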
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter();
    // Position counter - must agree with triangleSize()!
    int pos = 0;
    for (int x = 0; ix.valid(); x++, ix.advance()) {
        iy.seek(0);
        for (int y = 0; y < x; y++, iy.advance()) {
            scratch[pos] = dq.distance(ix, iy);
            pos++;
        }
    }
    // Initialize space for result:
    double[] height = new double[size];
    Arrays.fill(height, Double.POSITIVE_INFINITY);
    // Parent node, to track merges
    // have every object point to itself initially
    ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
    // Active clusters, when not trivial.
    Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
    // Repeat until everything merged, except the desired number of clusters:
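    // Each merge reduces the number of active clusters by one, so after
    // (size - numclusters) merges exactly numclusters clusters remain.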
    final int stop = size - numclusters;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
    for (int i = 0; i < stop; i++) {
        double min = Double.POSITIVE_INFINITY;
        int minx = -1, miny = -1;
        for (int x = 0; x < size; x++) {
            if (height[x] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int xbase = triangleSize(x);
            for (int y = 0; y < x; y++) {
                if (height[y] < Double.POSITIVE_INFINITY) {
                    continue;
                }
                final int idx = xbase + y;
                if (scratch[idx] < min) {
                    min = scratch[idx];
                    minx = x;
                    miny = y;
                }
            }
        }
        assert (minx >= 0 && miny >= 0);
        // Avoid allocating memory, by reusing existing iterators:
        ix.seek(minx);
        iy.seek(miny);
        // Perform merge in data structure: x -> y
        // Since y < x, prefer keeping y, dropping x.
        height[minx] = min;
        parent.set(minx, iy);
        // Merge into cluster
        ModifiableDBIDs cx = clusters.get(minx);
        ModifiableDBIDs cy = clusters.get(miny);
        if (cy == null) {
            cy = DBIDUtil.newHashSet();
            cy.add(iy);
        }
        if (cx == null) {
            cy.add(ix);
        } else {
            cy.addDBIDs(cx);
            clusters.remove(minx);
        }
        clusters.put(miny, cy);
        // Update distance matrix. Note: miny < minx
        final int xbase = triangleSize(minx), ybase = triangleSize(miny);
        // Write to (y, j), with j < y
        for (int j = 0; j < miny; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            scratch[ybase + j] = Math.min(scratch[xbase + j], scratch[ybase + j]);
        }
        // Write to (j, y), with y < j < x
        for (int j = miny + 1; j < minx; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[xbase + j], scratch[jbase + miny]);
        }
        // Write to (j, y), with y < x < j
        for (int j = minx + 1; j < size; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[jbase + minx], scratch[jbase + miny]);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Build the clustering result
    final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
    for (int x = 0; x < size; x++) {
        if (height[x] < Double.POSITIVE_INFINITY) {
            DBIDs cids = clusters.get(x);
            if (cids == null) {
                ix.seek(x);
                cids = DBIDUtil.deref(ix);
            }
            Cluster<Model> cluster = new Cluster<>("Cluster", cids);
            dendrogram.addToplevelCluster(cluster);
        }
    }
    return dendrogram;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) Model(de.lmu.ifi.dbs.elki.data.model.Model) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException) Int2ReferenceOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ReferenceOpenHashMap)
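
The snippet stores the pairwise distances in a packed lower-triangular array and addresses cell (x, y) with y < x as triangleSize(x) + y. The helper triangleSize itself is not part of the snippet; the following is a minimal sketch of how such a row-offset helper is commonly written (the method name follows the calls above, but the body is an assumption, not necessarily the project's exact code):

/**
 * Offset of row x in a packed lower-triangular matrix: the number of cells in all
 * rows before x, so that cell (x, y) with y < x is stored at triangleSize(x) + y.
 *
 * @param x Row index (0-based)
 * @return Offset of the first cell of row x in the packed array
 */
protected static int triangleSize(int x) {
    return (x * (x - 1)) >>> 1;
}

With this layout the full symmetric distance matrix needs only size * (size - 1) / 2 doubles, which is where the ~17 GB bound in the size check comes from.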

Example 2 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class XMeans, method run:

/**
 * Run the algorithm on a database and relation.
 *
 * @param database Database to process
 * @param relation Data relation
 * @return Clustering result.
 */
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
    MutableProgress prog = LOG.isVerbose() ? new MutableProgress("X-means number of clusters", k_max, LOG) : null;
    // Run initial k-means to find at least k_min clusters
    innerKMeans.setK(k_min);
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    splitInitializer.setInitialMeans(initializer.chooseInitialMeans(database, relation, k_min, getDistanceFunction()));
    Clustering<M> clustering = innerKMeans.run(database, relation);
    if (prog != null) {
        prog.setProcessed(k_min, LOG);
    }
    ArrayList<Cluster<M>> clusters = new ArrayList<>(clustering.getAllClusters());
    while (clusters.size() <= k_max) {
        // Improve-Structure:
        ArrayList<Cluster<M>> nextClusters = new ArrayList<>();
        for (Cluster<M> cluster : clusters) {
            // Try to split this cluster:
            List<Cluster<M>> childClusterList = splitCluster(cluster, database, relation);
            nextClusters.addAll(childClusterList);
            if (childClusterList.size() > 1) {
                k += childClusterList.size() - 1;
                if (prog != null) {
                    if (k >= k_max) {
                        prog.setTotal(k + 1);
                    }
                    prog.setProcessed(k, LOG);
                }
            }
        }
        if (clusters.size() == nextClusters.size()) {
            break;
        }
        // Improve-Params:
        splitInitializer.setInitialClusters(nextClusters);
        innerKMeans.setK(nextClusters.size());
        clustering = innerKMeans.run(database, relation);
        clusters.clear();
        clusters.addAll(clustering.getAllClusters());
    }
    // Ensure that the progress bar finished.
    if (prog != null) {
        prog.setTotal(k);
        prog.setProcessed(k, LOG);
    }
    if (LOG.isDebugging()) {
        LOG.debug("X-means returned k=" + k + " clusters.");
    }
    // add all current clusters to the result
    Clustering<M> result = new Clustering<>("X-Means Result", "X-Means", clusters);
    return result;
}
Also used : StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) MutableProgress(de.lmu.ifi.dbs.elki.logging.progress.MutableProgress) ArrayList(java.util.ArrayList) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)
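
The splitCluster call in the Improve-Structure phase is not shown above. In X-means, a cluster is tentatively split with 2-means and the split is only kept when a model selection criterion such as BIC improves; below is a minimal, self-contained sketch of that decision on plain arrays. It uses the common spherical-Gaussian BIC and invented names (XMeansSplitSketch, bic, keepSplit), so it illustrates the idea rather than ELKI's actual implementation.

// Illustrative sketch only: the X-means split decision on plain double[] data, using the
// spherical-Gaussian BIC (Schwarz criterion). Names do not mirror ELKI's internal API.
public class XMeansSplitSketch {
    /** BIC of a hard assignment of n d-dimensional points to k centers; larger is better. */
    static double bic(double[][] points, double[][] centers, int[] assignment, int k) {
        final int n = points.length, d = points[0].length;
        // Shared spherical variance estimate from the total within-cluster squared error.
        double sse = 0;
        int[] sizes = new int[k];
        for (int i = 0; i < n; i++) {
            sizes[assignment[i]]++;
            double[] c = centers[assignment[i]];
            for (int j = 0; j < d; j++) {
                double diff = points[i][j] - c[j];
                sse += diff * diff;
            }
        }
        double variance = Math.max(sse / Math.max(n - k, 1), Double.MIN_NORMAL);
        // Log-likelihood: fit term plus, per cluster, mixing weight and Gaussian normalization.
        double loglik = -0.5 * sse / variance;
        for (int c = 0; c < k; c++) {
            if (sizes[c] > 0) {
                loglik += sizes[c] * (Math.log((double) sizes[c] / n) - 0.5 * d * Math.log(2 * Math.PI * variance));
            }
        }
        // Free parameters: (k - 1) mixing weights, k * d center coordinates, one shared variance.
        double params = (k - 1) + k * d + 1;
        return loglik - 0.5 * params * Math.log(n);
    }

    /** Keep a 2-means split only if its BIC beats the BIC of the unsplit cluster. */
    static boolean keepSplit(double[][] points, double[] parentCenter, double[][] childCenters, int[] childAssignment) {
        double parentBic = bic(points, new double[][] { parentCenter }, new int[points.length], 1);
        double childBic = bic(points, childCenters, childAssignment, 2);
        return childBic > parentBic;
    }
}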

Example 3 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class LMCLUS, method run:

/**
 * The main LMCLUS (Linear manifold clustering algorithm) is processed in this
 * method.
 *
 * <PRE>
 * The algorithm samples random linear manifolds and tries to find clusters in them.
 * It calculates a distance histogram, searches for a threshold, and partitions the
 * points into two groups: the ones in the cluster and everything else.
 * Then the best-fitting linear manifold is searched and registered as a cluster.
 * The process is repeated until all points are clustered.
 * The last cluster should contain all the outliers (or the whole data set if no clusters have been found).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
    Clustering<Model> ret = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
    IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
    ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
    Random r = rnd.getSingleThreadedRandom();
    final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
    int cnum = 0;
    while (unclustered.size() > minsize) {
        DBIDs current = unclustered;
        int lmDim = 1;
        for (int k = 1; k <= maxdim; k++) {
            // Refine the candidate cluster on this manifold until no good separation is
            // found or the candidate becomes too small; there is no explicit criterion for
            // stopping at the appropriate dimensionality.
            while (true) {
                Separation separation = findSeparation(relation, current, k, r);
                // " threshold: " + separation.threshold);
                if (separation.goodness <= sensitivityThreshold) {
                    break;
                }
                ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
                for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
                    if (deviation(minusEquals(relation.get(iter).toArray(), separation.originV), separation.basis) < separation.threshold) {
                        subset.add(iter);
                    }
                }
                // logger.verbose("size:"+subset.size());
                if (subset.size() < minsize) {
                    break;
                }
                current = subset;
                lmDim = k;
            // System.out.println("Partition: " + subset.size());
            }
        }
        // No more clusters found
        if (current.size() < minsize || current == unclustered) {
            break;
        }
        // New cluster found
        // TODO: annotate cluster with dimensionality
        final Cluster<Model> cluster = new Cluster<>(current);
        cluster.setName("Cluster_" + lmDim + "d_" + cnum);
        cnum++;
        ret.addToplevelCluster(cluster);
        // Remove from main working set.
        unclustered.removeDBIDs(current);
        if (progress != null) {
            progress.setProcessed(relation.size() - unclustered.size(), LOG);
        }
        if (cprogress != null) {
            cprogress.setProcessed(cnum, LOG);
        }
    }
    // Remaining objects are noise
    if (unclustered.size() > 0) {
        ret.addToplevelCluster(new Cluster<>(unclustered, true));
    }
    if (progress != null) {
        progress.setProcessed(relation.size(), LOG);
        progress.ensureCompleted(LOG);
    }
    LOG.setCompleted(cprogress);
    return ret;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Random(java.util.Random) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) Model(de.lmu.ifi.dbs.elki.data.model.Model)
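
The deviation call above measures how far a point lies from the sampled linear manifold: the vector point minus origin is projected onto the manifold basis, and the length of the remainder orthogonal to the basis is the distance that gets compared against separation.threshold. A small self-contained sketch, assuming an orthonormal basis passed as rows of a double[][] (names and array layout are illustrative, not ELKI's exact signature):

/**
 * Distance of a point to an affine subspace (linear manifold).
 *
 * @param delta point minus the manifold origin
 * @param basis orthonormal basis of the manifold, one basis vector per row
 * @return Euclidean distance of the point to the manifold
 */
static double deviation(double[] delta, double[][] basis) {
    double total = 0;
    for (double v : delta) {
        total += v * v; // squared length of the difference vector
    }
    double projected = 0;
    for (double[] b : basis) {
        double dot = 0;
        for (int i = 0; i < delta.length; i++) {
            dot += b[i] * delta[i];
        }
        projected += dot * dot; // squared length of the projection onto the manifold
    }
    // The residual orthogonal to the manifold; clamp to guard against rounding below zero.
    return Math.sqrt(Math.max(total - projected, 0));
}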

Example 4 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class KMeansBisecting, method run:

@Override
public Clustering<M> run(Database database, Relation<V> relation) {
    ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
    // A linked list is preferable for scratch, as we will A) not need that many
    // clusters and B) be doing random removals of the largest cluster (often at
    // the head).
    LinkedList<Cluster<M>> currentClusterList = new LinkedList<>();
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Bisecting k-means", k - 1, LOG) : null;
    for (int j = 0; j < this.k - 1; j++) {
        // Choose a cluster to split and project database to cluster
        if (currentClusterList.isEmpty()) {
            proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
        } else {
            Cluster<M> largestCluster = null;
            for (Cluster<M> cluster : currentClusterList) {
                if (largestCluster == null || cluster.size() > largestCluster.size()) {
                    largestCluster = cluster;
                }
            }
            currentClusterList.remove(largestCluster);
            proxyDB.setDBIDs(largestCluster.getIDs());
        }
        // Run the inner k-means algorithm:
        // FIXME: ensure we run on the correct relation in a multirelational
        // setting!
        Clustering<M> innerResult = innerkMeans.run(proxyDB);
        // Add resulting clusters to current result.
        currentClusterList.addAll(innerResult.getAllClusters());
        LOG.incrementProcessed(prog);
        if (LOG.isVerbose()) {
            LOG.verbose("Iteration " + j);
        }
    }
    LOG.ensureCompleted(prog);
    // add all current clusters to the result
    Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means");
    for (Cluster<M> cluster : currentClusterList) {
        result.addToplevelCluster(cluster);
    }
    return result;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ProxyDatabase(de.lmu.ifi.dbs.elki.database.ProxyDatabase) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) LinkedList(java.util.LinkedList)
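
A side note on the largest-cluster scan: since Cluster exposes size(), the same selection can also be written with the standard Collections API (java.util.Collections and java.util.Comparator). A small equivalent sketch, purely a style alternative to the explicit loop above:

// Pick and remove the largest cluster, equivalent to the explicit scan in the snippet.
Cluster<M> largestCluster = Collections.max(currentClusterList, Comparator.comparingInt(Cluster::size));
currentClusterList.remove(largestCluster);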

Example 5 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class DiSH, method sortClusters:

/**
 * Returns a sorted list of the clusters w.r.t. the subspace dimensionality in
 * descending order.
 *
 * @param relation the database storing the objects
 * @param clustersMap the mapping of bits sets to clusters
 * @return a sorted list of the clusters
 */
private List<Cluster<SubspaceModel>> sortClusters(Relation<V> relation, Object2ObjectMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
    final int db_dim = RelationUtil.dimensionality(relation);
    // int num = 1;
    List<Cluster<SubspaceModel>> clusters = new ArrayList<>();
    for (long[] pv : clustersMap.keySet()) {
        List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
        for (int i = 0; i < parallelClusters.size(); i++) {
            ArrayModifiableDBIDs c = parallelClusters.get(i);
            Cluster<SubspaceModel> cluster = new Cluster<>(c);
            cluster.setModel(new SubspaceModel(new Subspace(pv), Centroid.make(relation, c).getArrayRef()));
            String subspace = BitsUtil.toStringLow(cluster.getModel().getSubspace().getDimensions(), db_dim);
            if (parallelClusters.size() > 1) {
                cluster.setName("Cluster_" + subspace + "_" + i);
            } else {
                cluster.setName("Cluster_" + subspace);
            }
            clusters.add(cluster);
        }
    }
    // Sort the clusters w.r.t. subspace dimensionality, in descending order.
    Comparator<Cluster<SubspaceModel>> comparator = new Comparator<Cluster<SubspaceModel>>() {

        @Override
        public int compare(Cluster<SubspaceModel> c1, Cluster<SubspaceModel> c2) {
            return c2.getModel().getSubspace().dimensionality() - c1.getModel().getSubspace().dimensionality();
        }
    };
    Collections.sort(clusters, comparator);
    return clusters;
}
Also used : ArrayList(java.util.ArrayList) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Comparator(java.util.Comparator) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) Subspace(de.lmu.ifi.dbs.elki.data.Subspace)
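
On Java 8 or newer, the anonymous Comparator at the end can be written more compactly with the same java.util.Comparator import listed above; a small equivalent sketch (a style alternative only, the descending ordering is unchanged):

// Sort by subspace dimensionality in descending order, matching the anonymous Comparator above.
clusters.sort(Comparator.comparingInt(
    (Cluster<SubspaceModel> c) -> c.getModel().getSubspace().dimensionality()).reversed());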

Aggregations

Cluster (de.lmu.ifi.dbs.elki.data.Cluster): 38 usages
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 21 usages
Model (de.lmu.ifi.dbs.elki.data.model.Model): 18 usages
ArrayList (java.util.ArrayList): 14 usages
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 13 usages
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 11 usages
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 11 usages
SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel): 8 usages
Subspace (de.lmu.ifi.dbs.elki.data.Subspace): 7 usages
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 7 usages
ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel): 6 usages
StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress): 6 usages
ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs): 5 usages
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 4 usages
HashMap (java.util.HashMap): 4 usages
ByLabelOrAllInOneClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering): 3 usages
NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 3 usages
ProxyDatabase (de.lmu.ifi.dbs.elki.database.ProxyDatabase): 3 usages
DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter): 3 usages
HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs): 3 usages