
Example 56 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class KMeansLloyd method run.

@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        LOG.incrementProcessed(prog);
        boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
        logVarstat(varstat, varsum);
        // Stop if no cluster assignment changed.
        if (!changed) {
            break;
        }
        // Recompute means.
        means = means(clusters, means, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
        DBIDs ids = clusters.get(i);
        if (ids.size() == 0) {
            continue;
        }
        KMeansModel model = new KMeansModel(means[i], varsum[i]);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
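
For orientation, here is a minimal, self-contained sketch of the assign-then-update loop that the run method above delegates to assignToNearestCluster and means. It is plain Java over a double[][] array, independent of the ELKI API; the sample data and k are made up for illustration.

import java.util.Arrays;

public class LloydSketch {

    public static void main(String[] args) {
        double[][] data = { { 1, 1 }, { 1.2, 0.9 }, { 8, 8 }, { 7.9, 8.1 }, { 0.8, 1.1 }, { 8.2, 7.8 } };
        final int k = 2, maxiter = 100;
        // Stand-in for the initializer: pick two data points as initial means.
        double[][] means = { data[0].clone(), data[2].clone() };
        int[] assignment = new int[data.length];
        for (int iter = 0; iter < maxiter; iter++) {
            boolean changed = false;
            // Assignment step: each point moves to its nearest mean.
            for (int i = 0; i < data.length; i++) {
                int best = 0;
                double bestDist = Double.POSITIVE_INFINITY;
                for (int c = 0; c < k; c++) {
                    double d = squaredDistance(data[i], means[c]);
                    if (d < bestDist) {
                        bestDist = d;
                        best = c;
                    }
                }
                if (assignment[i] != best) {
                    assignment[i] = best;
                    changed = true;
                }
            }
            if (!changed) {
                break; // Converged: no assignment changed, as in the ELKI loop above.
            }
            // Update step: recompute each mean from its assigned points.
            double[][] sums = new double[k][data[0].length];
            int[] counts = new int[k];
            for (int i = 0; i < data.length; i++) {
                counts[assignment[i]]++;
                for (int d = 0; d < data[i].length; d++) {
                    sums[assignment[i]][d] += data[i][d];
                }
            }
            for (int c = 0; c < k; c++) {
                if (counts[c] > 0) {
                    for (int d = 0; d < sums[c].length; d++) {
                        means[c][d] = sums[c][d] / counts[c];
                    }
                }
            }
        }
        System.out.println("means: " + Arrays.deepToString(means));
        System.out.println("assignment: " + Arrays.toString(assignment));
    }

    static double squaredDistance(double[] a, double[] b) {
        double sum = 0;
        for (int i = 0; i < a.length; i++) {
            double diff = a[i] - b[i];
            sum += diff * diff;
        }
        return sum;
    }
}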

Example 57 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class KMeansMinusMinus method run.

@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    // Initialize the means
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Initialize the heap
    final int heapsize = (int) (rate < 1. ? Math.ceil(relation.size() * rate) : rate);
    DoubleMinHeap minHeap = new DoubleMinHeap(heapsize);
    // Setup cluster assignment store
    List<ModifiableDoubleDBIDList> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    // The variance statistic is always created here (not only when statistics
    // logging is enabled), because the vartotal-based break condition below relies on it.
    DoubleStatistic varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
    assert (varstat != null);
    int iteration = 0;
    double prevvartotal = Double.POSITIVE_INFINITY;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        minHeap.clear();
        for (int i = 0; i < k; i++) {
            clusters.get(i).clear();
        }
        LOG.incrementProcessed(prog);
        boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, minHeap, heapsize);
        double vartotal = logVarstat(varstat, varsum);
        // Stop if no cluster assignment changed, or the new variance sum is higher
        // than the previous value.
        if (!changed || vartotal > prevvartotal) {
            break;
        }
        prevvartotal = vartotal;
        // Recompute means.
        means = meansWithTreshhold(clusters, means, relation, heapsize > 0 ? minHeap.peek() : Double.POSITIVE_INFINITY);
    }
    // Create the noise cluster, if requested
    ModifiableDoubleDBIDList noiseids = null;
    if (noiseFlag && heapsize > 0) {
        clusters.add(noiseids = DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
        double tresh = minHeap.peek();
        for (int i = 0; i < k; i++) {
            for (DoubleDBIDListMIter it = clusters.get(i).iter(); it.valid(); it.advance()) {
                final double dist = it.doubleValue();
                // Add to the noise cluster:
                if (dist >= tresh) {
                    noiseids.add(dist, it);
                    assignment.putInt(it, k);
                    it.remove();
                }
            }
        }
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < k; i++) {
        DBIDs ids = clusters.get(i);
        if (ids.size() == 0) {
            continue;
        }
        KMeansModel model = new KMeansModel(means[i], varsum[i]);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    // Noise Cluster
    if (noiseFlag) {
        KMeansModel model = new KMeansModel(null, 0);
        DBIDs ids = noiseids;
        if (ids.size() == 0) {
            return result;
        }
        result.addToplevelCluster(new Cluster<>(ids, true, model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) DoubleDBIDListMIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
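
The DoubleMinHeap in the method above keeps the heapsize largest point-to-mean distances of the current iteration, so that its smallest element (minHeap.peek()) becomes the cutoff beyond which points are moved to the noise cluster. A minimal, self-contained illustration of that bounded-heap idea, using java.util.PriorityQueue and made-up distances:

import java.util.PriorityQueue;

public class NoiseThresholdSketch {

    public static void main(String[] args) {
        double[] distances = { 0.3, 0.5, 9.0, 0.4, 7.5, 0.2, 8.2, 0.6 };
        final int heapsize = 3; // corresponds to ceil(relation.size() * rate) above
        // Min-heap bounded to the heapsize largest distances seen so far.
        PriorityQueue<Double> minHeap = new PriorityQueue<>(heapsize);
        for (double d : distances) {
            if (minHeap.size() < heapsize) {
                minHeap.offer(d);
            } else if (d > minHeap.peek()) {
                minHeap.poll();
                minHeap.offer(d);
            }
        }
        // The smallest retained distance is the noise threshold: points at least
        // this far from their nearest mean would be assigned to the noise cluster.
        double threshold = minHeap.peek();
        System.out.println("noise threshold = " + threshold); // 7.5 for these values
    }
}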

Example 58 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class XMeans method splitCluster.

/**
 * Conditionally splits the clusters based on the information criterion.
 *
 * @param parentCluster Cluster to split
 * @param database Database
 * @param relation Data relation
 * @return The parent cluster if the split decreases clustering quality, or the
 *         child clusters if the split improves it.
 */
protected List<Cluster<M>> splitCluster(Cluster<M> parentCluster, Database database, Relation<V> relation) {
    // Transform parent cluster into a clustering
    ArrayList<Cluster<M>> parentClusterList = new ArrayList<Cluster<M>>(1);
    parentClusterList.add(parentCluster);
    Clustering<M> parentClustering = new Clustering<>(parentCluster.getName(), parentCluster.getName(), parentClusterList);
    if (parentCluster.size() < 2) {
        // Split is not possible
        return parentClusterList;
    }
    ProxyDatabase proxyDB = new ProxyDatabase(parentCluster.getIDs(), database);
    splitInitializer.setInitialMeans(splitCentroid(parentCluster, relation));
    innerKMeans.setK(2);
    Clustering<M> childClustering = innerKMeans.run(proxyDB);
    double parentEvaluation = informationCriterion.quality(parentClustering, getDistanceFunction(), relation);
    double childrenEvaluation = informationCriterion.quality(childClustering, getDistanceFunction(), relation);
    if (LOG.isDebugging()) {
        LOG.debug("parentEvaluation: " + parentEvaluation);
        LOG.debug("childrenEvaluation: " + childrenEvaluation);
    }
    // Check if the split is an improvement; the XOR with ascending() handles both
    // higher-is-better and lower-is-better criteria:
    return (childrenEvaluation > parentEvaluation) ^ informationCriterion.ascending() ? parentClusterList : childClustering.getAllClusters();
}
Also used : ArrayList(java.util.ArrayList) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) ProxyDatabase(de.lmu.ifi.dbs.elki.database.ProxyDatabase) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)
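
The return expression combines the score comparison with informationCriterion.ascending() via XOR, so a single line serves criteria where larger values are better as well as criteria where smaller values are better. A small stand-alone restatement of that decision; the helper name keepSplit is ours, not part of ELKI:

public class SplitDecisionSketch {

    /**
     * Decide whether the two child clusters should replace the parent.
     * Mirrors: (childScore > parentScore) ^ ascending ? parent : children
     *
     * @param ascending true if larger criterion values indicate a better clustering
     */
    static boolean keepSplit(double parentScore, double childScore, boolean ascending) {
        return !((childScore > parentScore) ^ ascending);
    }

    public static void main(String[] args) {
        // Ascending criterion (larger is better): keep the split only if the children score higher.
        System.out.println(keepSplit(-120.0, -110.0, true)); // true
        System.out.println(keepSplit(-120.0, -130.0, true)); // false
        // Descending criterion (smaller is better): keep the split when the children do not score higher.
        System.out.println(keepSplit(42.0, 40.0, false)); // true
    }
}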

Example 59 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class KNNKernelDensityMinimaClustering method run.

/**
 * Run the clustering algorithm on a data relation.
 *
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<ClusterModel> run(Relation<V> relation) {
    ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
    final int size = ids.size();
    // Sort by the sole dimension
    ids.sort(new VectorUtil.SortDBIDsBySingleDimension(relation, dim));
    // Density storage.
    WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
    DBIDArrayIter iter = ids.iter(), iter2 = ids.iter();
    StepProgress sprog = LOG.isVerbose() ? new StepProgress("Clustering steps", 2) : null;
    LOG.beginStep(sprog, 1, "Kernel density estimation.");
    {
        double[] scratch = new double[2 * k];
        iter.seek(0);
        for (int i = 0; i < size; i++, iter.advance()) {
            // Current value.
            final double curv = relation.get(iter).doubleValue(dim);
            final int pre = Math.max(i - k, 0), prek = i - pre;
            final int pos = Math.min(i + k, size - 1), posk = pos - i;
            iter2.seek(pre);
            for (int j = 0; j < prek; j++, iter2.advance()) {
                scratch[j] = curv - relation.get(iter2).doubleValue(dim);
            }
            assert (iter2.getOffset() == i);
            iter2.advance();
            for (int j = 0; j < posk; j++, iter2.advance()) {
                scratch[prek + j] = relation.get(iter2).doubleValue(dim) - curv;
            }
            assert (prek + posk >= k);
            double kdist = QuickSelect.quickSelect(scratch, 0, prek + posk, k);
            switch(mode) {
                case BALLOON:
                    {
                        double dens = 0.;
                        if (kdist > 0.) {
                            for (int j = 0; j < prek + posk; j++) {
                                dens += kernel.density(scratch[j] / kdist);
                            }
                        } else {
                            dens = Double.POSITIVE_INFINITY;
                        }
                        assert (iter.getOffset() == i);
                        density.putDouble(iter, dens);
                        break;
                    }
                case SAMPLE:
                    {
                        if (kdist > 0.) {
                            iter2.seek(pre);
                            for (int j = 0; j < prek; j++, iter2.advance()) {
                                double delta = curv - relation.get(iter2).doubleValue(dim);
                                density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
                            }
                            assert (iter2.getOffset() == i);
                            iter2.advance();
                            for (int j = 0; j < posk; j++, iter2.advance()) {
                                double delta = relation.get(iter2).doubleValue(dim) - curv;
                                density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
                            }
                        } else {
                            iter2.seek(pre);
                            for (int j = 0; j < prek; j++, iter2.advance()) {
                                double delta = curv - relation.get(iter2).doubleValue(dim);
                                if (!(delta > 0.)) {
                                    density.putDouble(iter2, Double.POSITIVE_INFINITY);
                                }
                            }
                            assert (iter2.getOffset() == i);
                            iter2.advance();
                            for (int j = 0; j < posk; j++, iter2.advance()) {
                                double delta = relation.get(iter2).doubleValue(dim) - curv;
                                if (!(delta > 0.)) {
                                    density.putDouble(iter2, Double.POSITIVE_INFINITY);
                                }
                            }
                        }
                        break;
                    }
                default:
                    throw new UnsupportedOperationException("Unknown mode specified.");
            }
        }
    }
    LOG.beginStep(sprog, 2, "Local minima detection.");
    Clustering<ClusterModel> clustering = new Clustering<>("onedimensional-kde-clustering", "One-Dimensional clustering using kernel density estimation.");
    {
        double[] scratch = new double[2 * minwindow + 1];
        int begin = 0;
        int halfw = (minwindow + 1) >> 1;
        iter.seek(0);
        // Fill initial buffer.
        for (int i = 0; i < size; i++, iter.advance()) {
            final int m = i % scratch.length, t = (i - minwindow - 1) % scratch.length;
            scratch[m] = density.doubleValue(iter);
            if (i > scratch.length) {
                double min = Double.POSITIVE_INFINITY;
                for (int j = 0; j < scratch.length; j++) {
                    if (j != t && scratch[j] < min) {
                        min = scratch[j];
                    }
                }
                // Local minimum:
                if (scratch[t] < min) {
                    int end = i - minwindow + 1;
                    {
                        // Test on which side the kNN is
                        iter2.seek(end);
                        double curv = relation.get(iter2).doubleValue(dim);
                        iter2.seek(end - halfw);
                        double left = relation.get(iter2).doubleValue(dim) - curv;
                        iter2.seek(end + halfw);
                        double right = curv - relation.get(iter2).doubleValue(dim);
                        if (left < right) {
                            end++;
                        }
                    }
                    iter2.seek(begin);
                    ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
                    for (int j = 0; j < end - begin; j++, iter2.advance()) {
                        cids.add(iter2);
                    }
                    clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
                    begin = end;
                }
            }
        }
        // Extract last cluster
        int end = size;
        iter2.seek(begin);
        ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
        for (int j = 0; j < end - begin; j++, iter2.advance()) {
            cids.add(iter2);
        }
        clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
    }
    LOG.ensureCompleted(sprog);
    return clustering;
}
Also used : WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) VectorUtil(de.lmu.ifi.dbs.elki.data.VectorUtil) ClusterModel(de.lmu.ifi.dbs.elki.data.model.ClusterModel) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)
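
Step 1 of the method above is essentially a balloon density estimator: each point's bandwidth is the distance to its k-th nearest neighbor along the sorted dimension, and its density is the sum of kernel contributions scaled by that bandwidth. A compact, self-contained version for a small 1-D array follows; it uses a brute-force kNN search and an Epanechnikov-style kernel, both chosen only for illustration and not taken from the ELKI code.

import java.util.Arrays;

public class BalloonKdeSketch {

    // Epanechnikov kernel: nonzero only for |u| <= 1.
    static double kernel(double u) {
        return Math.abs(u) <= 1 ? 0.75 * (1 - u * u) : 0;
    }

    public static void main(String[] args) {
        double[] x = { 0.1, 0.2, 0.25, 0.9, 1.0, 1.05, 1.1, 2.0 }; // sorted 1-D data
        final int k = 3;
        double[] density = new double[x.length];
        for (int i = 0; i < x.length; i++) {
            // Distances to all other points; the k-th smallest is the bandwidth.
            double[] dist = new double[x.length - 1];
            for (int j = 0, m = 0; j < x.length; j++) {
                if (j != i) {
                    dist[m++] = Math.abs(x[j] - x[i]);
                }
            }
            Arrays.sort(dist);
            double kdist = dist[k - 1];
            if (kdist <= 0) {
                density[i] = Double.POSITIVE_INFINITY; // duplicate values, as in the BALLOON branch above
                continue;
            }
            double dens = 0;
            for (double d : dist) {
                dens += kernel(d / kdist);
            }
            density[i] = dens;
        }
        System.out.println(Arrays.toString(density));
        // Low-density points (x = 2.0 in this example) mark the valleys where
        // step 2 of the ELKI method places cluster boundaries.
    }
}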

Example 60 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class ByLabelClustering method run.

/**
 * Run the actual clustering algorithm.
 *
 * @param relation The data relation to cluster
 * @return Clustering result
 */
public Clustering<Model> run(Relation<?> relation) {
    HashMap<String, DBIDs> labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation);
    ModifiableDBIDs noiseids = DBIDUtil.newArray();
    Clustering<Model> result = new Clustering<>("By Label Clustering", "bylabel-clustering");
    for (Entry<String, DBIDs> entry : labelMap.entrySet()) {
        DBIDs ids = entry.getValue();
        if (ids.size() <= 1) {
            noiseids.addDBIDs(ids);
            continue;
        }
        // Build a cluster
        Cluster<Model> c = new Cluster<Model>(entry.getKey(), ids, ClusterModel.CLUSTER);
        if (noisepattern != null && noisepattern.matcher(entry.getKey()).find()) {
            c.setNoise(true);
        }
        result.addToplevelCluster(c);
    }
    // Collected noise IDs.
    if (noiseids.size() > 0) {
        Cluster<Model> c = new Cluster<Model>("Noise", noiseids, ClusterModel.CLUSTER);
        c.setNoise(true);
        result.addToplevelCluster(c);
    }
    return result;
}
Also used : DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ClusterModel(de.lmu.ifi.dbs.elki.data.model.ClusterModel) Model(de.lmu.ifi.dbs.elki.data.model.Model) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)
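
At its core, ByLabelClustering is a single pass that buckets object IDs by their label and collects singleton labels as noise. A plain-Java sketch of that grouping, with invented labels and integer indices standing in for DBIDs:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ByLabelSketch {

    public static void main(String[] args) {
        String[] labels = { "setosa", "setosa", "virginica", "virginica", "outlier-17" };
        // Bucket object indices by their label.
        Map<String, List<Integer>> labelMap = new HashMap<>();
        for (int id = 0; id < labels.length; id++) {
            labelMap.computeIfAbsent(labels[id], l -> new ArrayList<>()).add(id);
        }
        List<Integer> noise = new ArrayList<>();
        for (Map.Entry<String, List<Integer>> entry : labelMap.entrySet()) {
            List<Integer> ids = entry.getValue();
            if (ids.size() <= 1) {
                // Singleton labels become noise, mirroring the check in run() above.
                noise.addAll(ids);
                continue;
            }
            System.out.println("cluster '" + entry.getKey() + "': " + ids);
        }
        if (!noise.isEmpty()) {
            System.out.println("noise: " + noise);
        }
    }
}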

Aggregations

Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 68
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 32
ArrayList (java.util.ArrayList): 27
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 23
Cluster (de.lmu.ifi.dbs.elki.data.Cluster): 21
Model (de.lmu.ifi.dbs.elki.data.model.Model): 21
IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress): 20
WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore): 16
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 16
Database (de.lmu.ifi.dbs.elki.database.Database): 14
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 14
LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic): 14
StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic): 14
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 13
ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel): 12
KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel): 12
ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs): 9
AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException): 8
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 6
MedoidModel (de.lmu.ifi.dbs.elki.data.model.MedoidModel): 5