Example 11 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class LMCLUS method run.

/**
 * The main LMCLUS (linear manifold clustering) algorithm is run in this
 * method.
 *
 * <PRE>
 * The algorithm samples random linear manifolds and tries to find clusters in them.
 * It computes a distance histogram, searches for a threshold, and partitions the
 * points into two groups: those in the cluster and everything else.
 * Then the best-fitting linear manifold is determined and registered as a cluster.
 * The process is repeated until all points are clustered.
 * The last cluster should contain all the outliers (or the whole data set if no clusters were found).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
    Clustering<Model> ret = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
    IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
    ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
    Random r = rnd.getSingleThreadedRandom();
    final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
    int cnum = 0;
    while (unclustered.size() > minsize) {
        DBIDs current = unclustered;
        int lmDim = 1;
        for (int k = 1; k <= maxdim; k++) {
            // Refine the candidate cluster at dimensionality k until no further
            // improvement is found.
            while (true) {
                Separation separation = findSeparation(relation, current, k, r);
                // " threshold: " + separation.threshold);
                if (separation.goodness <= sensitivityThreshold) {
                    break;
                }
                ModifiableDBIDs subset = DBIDUtil.newArray(current.size());
                for (DBIDIter iter = current.iter(); iter.valid(); iter.advance()) {
                    if (deviation(minusEquals(relation.get(iter).toArray(), separation.originV), separation.basis) < separation.threshold) {
                        subset.add(iter);
                    }
                }
                // logger.verbose("size:"+subset.size());
                if (subset.size() < minsize) {
                    break;
                }
                current = subset;
                lmDim = k;
            // System.out.println("Partition: " + subset.size());
            }
        }
        // No more clusters found
        if (current.size() < minsize || current == unclustered) {
            break;
        }
        // New cluster found
        // TODO: annotate cluster with dimensionality
        final Cluster<Model> cluster = new Cluster<>(current);
        cluster.setName("Cluster_" + lmDim + "d_" + cnum);
        cnum++;
        ret.addToplevelCluster(cluster);
        // Remove from main working set.
        unclustered.removeDBIDs(current);
        if (progress != null) {
            progress.setProcessed(relation.size() - unclustered.size(), LOG);
        }
        if (cprogress != null) {
            cprogress.setProcessed(cnum, LOG);
        }
    }
    // Remaining objects are noise
    if (unclustered.size() > 0) {
        ret.addToplevelCluster(new Cluster<>(unclustered, true));
    }
    if (progress != null) {
        progress.setProcessed(relation.size(), LOG);
        progress.ensureCompleted(LOG);
    }
    LOG.setCompleted(cprogress);
    return ret;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Random(java.util.Random) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) Model(de.lmu.ifi.dbs.elki.data.model.Model)
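
The central geometric step in the snippet above is deviation(minusEquals(relation.get(iter).toArray(), separation.originV), separation.basis), which measures how far a point lies from the sampled linear manifold. Below is a minimal, self-contained sketch in plain Java of that distance computation, assuming the manifold is given by an origin point and rows of an orthonormal basis; the class and helper names are made up for illustration and are not ELKI API.

// Illustrative sketch (not ELKI code): orthogonal distance of a point to a
// linear manifold given by an origin and an orthonormal basis (one basis
// vector per row). Mirrors the role of deviation(...) in the snippet above.
public final class ManifoldDistance {
  /** Norm of the component of v orthogonal to the span of the basis rows. */
  static double deviation(double[] v, double[][] basis) {
    double sq = 0;
    for (double x : v) {
      sq += x * x;
    }
    // Subtract the squared projections onto each (orthonormal) basis vector.
    for (double[] b : basis) {
      double dot = 0;
      for (int i = 0; i < v.length; i++) {
        dot += v[i] * b[i];
      }
      sq -= dot * dot;
    }
    return Math.sqrt(Math.max(sq, 0));
  }

  public static void main(String[] args) {
    double[] point = { 1, 2, 3 };
    double[] origin = { 0, 0, 0 };
    double[][] basis = { { 1, 0, 0 } }; // 1-dimensional manifold: the x-axis
    double[] v = new double[point.length];
    for (int i = 0; i < v.length; i++) {
      v[i] = point[i] - origin[i];
    }
    System.out.println(deviation(v, basis)); // distance to the x-axis = sqrt(13)
  }
}

In the LMCLUS loop above, points whose deviation falls below separation.threshold are collected into the candidate cluster; everything else stays in the working set for later iterations.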

Example 12 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class EM method run.

/**
 * Performs the EM clustering algorithm on the given database.
 *
 * Finally, a hard clustering is provided: each cluster is assigned the points
 * that have the highest probability of belonging to it. Nevertheless, the
 * complete probability vector over all models remains associated with each
 * database object.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 */
public Clustering<M> run(Database database, Relation<V> relation) {
    if (relation.size() == 0) {
        throw new IllegalArgumentException("database empty: must contain elements");
    }
    // initial models
    List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
    if (LOG.isStatistics()) {
        LOG.statistics(likestat.setDouble(loglikelihood));
    }
    // iteration unless no change
    int it = 0, lastimprovement = 0;
    // For detecting instabilities.
    double bestloglikelihood = loglikelihood;
    for (++it; it < maxiter || maxiter < 0; it++) {
        final double oldloglikelihood = loglikelihood;
        recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
        // reassign probabilities
        loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isStatistics()) {
            LOG.statistics(likestat.setDouble(loglikelihood));
        }
        if (loglikelihood - bestloglikelihood > delta) {
            lastimprovement = it;
            bestloglikelihood = loglikelihood;
        }
        if (Math.abs(loglikelihood - oldloglikelihood) <= delta || lastimprovement < it >> 1) {
            break;
        }
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", it));
    }
    // fill result with clusters and models
    List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
    for (int i = 0; i < k; i++) {
        hardClusters.add(DBIDUtil.newArray());
    }
    // provide a hard clustering
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        hardClusters.get(argmax(probClusterIGivenX.get(iditer))).add(iditer);
    }
    Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
    // provide models within the result
    for (int i = 0; i < k; i++) {
        result.addToplevelCluster(new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster()));
    }
    if (isSoft()) {
        result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
    } else {
        probClusterIGivenX.destroy();
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)
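
The hard-clustering step above picks, for each object, the component with the largest entry in its probability vector (argmax(probClusterIGivenX.get(iditer))). The following self-contained sketch shows the same idea on a toy one-dimensional, two-component Gaussian mixture; the mixture parameters and helper names are invented for illustration and do not correspond to ELKI's EMClusterModel API.

// Illustrative sketch (not ELKI code): E-step responsibilities and the final
// hard assignment by argmax, for a toy 1-D Gaussian mixture.
import java.util.Arrays;

public final class EMHardAssignment {
  static double gaussian(double x, double mean, double var) {
    double d = x - mean;
    return Math.exp(-0.5 * d * d / var) / Math.sqrt(2 * Math.PI * var);
  }

  /** p(cluster i | x), normalized over all components. */
  static double[] responsibilities(double x, double[] w, double[] mean, double[] var) {
    double[] r = new double[w.length];
    double sum = 0;
    for (int i = 0; i < w.length; i++) {
      r[i] = w[i] * gaussian(x, mean[i], var[i]);
      sum += r[i];
    }
    for (int i = 0; i < r.length; i++) {
      r[i] /= sum;
    }
    return r;
  }

  static int argmax(double[] p) {
    int best = 0;
    for (int i = 1; i < p.length; i++) {
      if (p[i] > p[best]) {
        best = i;
      }
    }
    return best;
  }

  public static void main(String[] args) {
    double[] w = { 0.5, 0.5 }, mean = { 0.0, 5.0 }, var = { 1.0, 1.0 };
    double[] r = responsibilities(1.2, w, mean, var);
    System.out.println(Arrays.toString(r) + " -> cluster " + argmax(r));
  }
}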

Example 13 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class KMedoidsEM method run.

/**
 * Run k-medoids
 *
 * @param database Database
 * @param relation Relation to use
 * @return Clustering result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    }
    DistanceQuery<V> distQ = null;
    // Only enforce a distance matrix for PAM initialization, which is slow.
    if (initializer instanceof PAMInitialMeans) {
        distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
    } else {
        distQ = database.getDistanceQuery(relation, getDistanceFunction());
    }
    // Choose initial medoids
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
    DBIDArrayMIter miter = medoids.iter();
    double[] mdists = new double[k];
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        HashSetModifiableDBIDs set = DBIDUtil.newHashSet(relation.size() / k);
        // Add medoids.
        set.add(miter.seek(i));
        clusters.add(set);
    }
    // Initial assignment to nearest medoids
    // TODO: reuse this information, from the build phase, when possible?
    double tc = assignToNearestCluster(miter, mdists, clusters, distQ);
    if (LOG.isStatistics()) {
        LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
    }
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
    // Swap phase
    int iteration = 0;
    DBIDVar best = DBIDUtil.newVar();
    while (true) {
        boolean changed = false;
        // Try to swap the medoid with a better cluster member:
        int i = 0;
        for (miter.seek(0); miter.valid(); miter.advance(), i++) {
            best.unset();
            double bestm = mdists[i];
            for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
                if (DBIDUtil.equal(miter, iter)) {
                    continue;
                }
                double sum = 0;
                for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
                    sum += distQ.distance(iter, iter2);
                }
                if (sum < bestm) {
                    best.set(iter);
                    bestm = sum;
                }
            }
            if (best.isSet() && !DBIDUtil.equal(miter, best)) {
                changed = true;
                assert (clusters.get(i).contains(best));
                medoids.set(i, best);
                mdists[i] = bestm;
            }
        }
        // Reassign
        if (!changed) {
            break;
        }
        double nc = assignToNearestCluster(miter, mdists, clusters, distQ);
        ++iteration;
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
        }
        LOG.incrementProcessed(prog);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
        result.addToplevelCluster(new Cluster<>(clusters.get(it.getOffset()), new MedoidModel(DBIDUtil.deref(it))));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) PAMInitialMeans(de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
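
The swap phase above searches each cluster for a member whose summed distance to the other members is smaller than that of the current medoid. The sketch below shows that medoid update in isolation, on plain double[] points with Euclidean distance; it illustrates the idea only and is not ELKI's DBID-based implementation.

// Illustrative sketch (not ELKI code): pick the cluster member minimizing the
// sum of distances to all other members, i.e. the medoid update.
import java.util.List;

public final class MedoidUpdate {
  static double distance(double[] a, double[] b) {
    double sq = 0;
    for (int i = 0; i < a.length; i++) {
      double d = a[i] - b[i];
      sq += d * d;
    }
    return Math.sqrt(sq);
  }

  /** Index (within the cluster) of the member with the smallest distance sum. */
  static int bestMedoid(List<double[]> cluster) {
    int best = -1;
    double bestSum = Double.POSITIVE_INFINITY;
    for (int i = 0; i < cluster.size(); i++) {
      double sum = 0;
      for (int j = 0; j < cluster.size(); j++) {
        sum += distance(cluster.get(i), cluster.get(j));
      }
      if (sum < bestSum) {
        bestSum = sum;
        best = i;
      }
    }
    return best;
  }

  public static void main(String[] args) {
    List<double[]> cluster = List.of(new double[] { 0, 0 }, new double[] { 1, 0 }, new double[] { 5, 5 });
    System.out.println("Medoid index: " + bestMedoid(cluster)); // 1, the most central point
  }
}

As in the ELKI snippet, repeating this update and the subsequent reassignment until no medoid changes yields the k-medoids fixed point.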

Example 14 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class ChengAndChurch method biclustering.

@Override
public Clustering<BiclusterWithInversionsModel> biclustering() {
    double[][] mat = RelationUtil.relationAsMatrix(relation, rowIDs);
    BiclusterCandidate cand = new BiclusterCandidate(getRowDim(), getColDim());
    Clustering<BiclusterWithInversionsModel> result = new Clustering<>("Cheng-and-Church", "Cheng and Church Biclustering");
    ModifiableDBIDs noise = DBIDUtil.newHashSet(relation.getDBIDs());
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Extracting Cluster", n, LOG) : null;
    for (int i = 0; i < n; i++) {
        cand.reset();
        multipleNodeDeletion(mat, cand);
        if (LOG.isVeryVerbose()) {
            LOG.veryverbose("Residue after Alg 2: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
        singleNodeDeletion(mat, cand);
        if (LOG.isVeryVerbose()) {
            LOG.veryverbose("Residue after Alg 1: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
        nodeAddition(mat, cand);
        if (LOG.isVeryVerbose()) {
            LOG.veryverbose("Residue after Alg 3: " + cand.residue + " " + cand.rowcard + "x" + cand.colcard);
        }
        cand.maskMatrix(mat, dist);
        BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(cand.cols), rowsBitsetToIDs(cand.irow));
        final ArrayDBIDs cids = rowsBitsetToIDs(cand.rows);
        noise.removeDBIDs(cids);
        result.addToplevelCluster(new Cluster<>(cids, model));
        if (LOG.isVerbose()) {
            LOG.verbose("Score of bicluster " + (i + 1) + ": " + cand.residue + "\n");
            LOG.verbose("Number of rows: " + cand.rowcard + "\n");
            LOG.verbose("Number of columns: " + cand.colcard + "\n");
        // LOG.verbose("Total number of masked values: " + maskedVals.size() +
        // "\n");
        }
        LOG.incrementProcessed(prog);
    }
    // Add a noise cluster, full-dimensional.
    if (!noise.isEmpty()) {
        long[] allcols = BitsUtil.ones(getColDim());
        BiclusterWithInversionsModel model = new BiclusterWithInversionsModel(colsBitsetToIDs(allcols), DBIDUtil.EMPTYDBIDS);
        result.addToplevelCluster(new Cluster<>(noise, true, model));
    }
    LOG.ensureCompleted(prog);
    return result;
}
Also used : BiclusterWithInversionsModel(de.lmu.ifi.dbs.elki.data.model.BiclusterWithInversionsModel) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)
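
The node deletion and addition steps above are all driven by the mean squared residue of the current bicluster candidate (cand.residue). The following sketch computes that residue for a complete matrix, i.e. with all rows and columns selected; the row/column masking that the real algorithm performs is omitted, and the class name is made up for illustration.

// Illustrative sketch (not ELKI code): mean squared residue H(I, J) of a
// bicluster, here computed over the whole matrix for brevity.
public final class MeanSquaredResidue {
  static double residue(double[][] m) {
    int rows = m.length, cols = m[0].length;
    double[] rowMean = new double[rows], colMean = new double[cols];
    double allMean = 0;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        rowMean[i] += m[i][j] / cols;
        colMean[j] += m[i][j] / rows;
        allMean += m[i][j] / (rows * cols);
      }
    }
    double h = 0;
    for (int i = 0; i < rows; i++) {
      for (int j = 0; j < cols; j++) {
        double r = m[i][j] - rowMean[i] - colMean[j] + allMean;
        h += r * r;
      }
    }
    return h / (rows * cols);
  }

  public static void main(String[] args) {
    // A perfectly "shifted" bicluster has residue 0.
    double[][] coherent = { { 1, 2, 3 }, { 2, 3, 4 }, { 3, 4, 5 } };
    System.out.println(residue(coherent)); // 0.0 (up to rounding)
  }
}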

Example 15 with Clustering

use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.

the class KMeansBisecting method run.

@Override
public Clustering<M> run(Database database, Relation<V> relation) {
    ProxyDatabase proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
    // A linked list is preferable for scratch storage, as we will A) not need
    // that many clusters and B) repeatedly remove the largest cluster (which is
    // often at the head).
    LinkedList<Cluster<M>> currentClusterList = new LinkedList<>();
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Bisecting k-means", k - 1, LOG) : null;
    for (int j = 0; j < this.k - 1; j++) {
        // Choose a cluster to split and project database to cluster
        if (currentClusterList.isEmpty()) {
            proxyDB = new ProxyDatabase(relation.getDBIDs(), database);
        } else {
            Cluster<M> largestCluster = null;
            for (Cluster<M> cluster : currentClusterList) {
                if (largestCluster == null || cluster.size() > largestCluster.size()) {
                    largestCluster = cluster;
                }
            }
            currentClusterList.remove(largestCluster);
            proxyDB.setDBIDs(largestCluster.getIDs());
        }
        // Run the inner k-means algorithm:
        // FIXME: ensure we run on the correct relation in a multirelational
        // setting!
        Clustering<M> innerResult = innerkMeans.run(proxyDB);
        // Add resulting clusters to current result.
        currentClusterList.addAll(innerResult.getAllClusters());
        LOG.incrementProcessed(prog);
        if (LOG.isVerbose()) {
            LOG.verbose("Iteration " + j);
        }
    }
    LOG.ensureCompleted(prog);
    // add all current clusters to the result
    Clustering<M> result = new Clustering<>("Bisecting k-Means Result", "Bisecting-k-means");
    for (Cluster<M> cluster : currentClusterList) {
        result.addToplevelCluster(cluster);
    }
    return result;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ProxyDatabase(de.lmu.ifi.dbs.elki.database.ProxyDatabase) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) LinkedList(java.util.LinkedList)
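
The structure of the method above is a generic bisecting loop: start with one cluster, repeatedly pick the largest cluster, and split it with an inner 2-means run. The sketch below reproduces that outer loop on one-dimensional data, with a minimal Lloyd-style 2-means standing in for innerkMeans; it illustrates the control flow only and is not ELKI's ProxyDatabase-based implementation.

// Illustrative sketch (not ELKI code): bisecting k-means on 1-D data with a
// tiny inner 2-means. Repeatedly split the largest cluster until k clusters exist.
import java.util.ArrayList;
import java.util.List;

public final class BisectingSketch {
  /** Split the values into two clusters with a few Lloyd iterations (k = 2). */
  static List<List<Double>> twoMeans(List<Double> values) {
    double c0 = values.get(0), c1 = values.get(values.size() - 1);
    List<List<Double>> parts = null;
    for (int iter = 0; iter < 10; iter++) {
      parts = List.of(new ArrayList<>(), new ArrayList<>());
      for (double v : values) {
        parts.get(Math.abs(v - c0) <= Math.abs(v - c1) ? 0 : 1).add(v);
      }
      c0 = mean(parts.get(0), c0);
      c1 = mean(parts.get(1), c1);
    }
    return parts;
  }

  static double mean(List<Double> values, double fallback) {
    if (values.isEmpty()) {
      return fallback;
    }
    double s = 0;
    for (double v : values) {
      s += v;
    }
    return s / values.size();
  }

  public static void main(String[] args) {
    List<Double> data = List.of(1., 2., 3., 20., 21., 22., 50., 51.);
    List<List<Double>> clusters = new ArrayList<>();
    clusters.add(new ArrayList<>(data));
    int k = 3;
    while (clusters.size() < k) {
      // Pick the largest cluster and replace it by its two halves.
      List<Double> largest = clusters.get(0);
      for (List<Double> c : clusters) {
        if (c.size() > largest.size()) {
          largest = c;
        }
      }
      clusters.remove(largest);
      clusters.addAll(twoMeans(largest));
    }
    System.out.println(clusters);
  }
}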

Aggregations

Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 68
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 32
ArrayList (java.util.ArrayList): 27
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 23
Cluster (de.lmu.ifi.dbs.elki.data.Cluster): 21
Model (de.lmu.ifi.dbs.elki.data.model.Model): 21
IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress): 20
WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore): 16
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 16
Database (de.lmu.ifi.dbs.elki.database.Database): 14
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 14
LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic): 14
StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic): 14
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 13
ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel): 12
KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel): 12
ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs): 9
AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException): 8
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 6
MedoidModel (de.lmu.ifi.dbs.elki.data.model.MedoidModel): 5