Search in sources :

Example 1 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class OPTICSCut method makeOPTICSCut.

/**
 * Build a flat partitioning from a cluster order by cutting the reachability
 * values at a fixed threshold.
 *
 * Consecutive objects whose reachability is at most {@code epsilon} are
 * collected into one cluster; the object directly preceding such a run is
 * moved from noise into that cluster as well. Every other object is put into
 * a single noise cluster, which is always added last.
 *
 * @param co Cluster order result
 * @param epsilon Epsilon value for cut
 * @return New partitioning clustering
 */
public static <E extends ClusterOrder> Clustering<Model> makeOPTICSCut(E co, double epsilon) {
    final Clustering<Model> result = new Clustering<>("OPTICS Cut Clustering", "optics-cut");
    // Objects that are not (yet) part of any cluster.
    final ModifiableDBIDs noise = DBIDUtil.newHashSet();
    // Members of the cluster currently being assembled.
    ModifiableDBIDs members = DBIDUtil.newHashSet();
    // Object visited in the previous iteration (unset on the first one).
    final DBIDVar previous = DBIDUtil.newVar();
    // Reachability of the previously visited object.
    double previousReach = Double.MAX_VALUE;

    final DBIDIter cursor = co.iter();
    while (cursor.valid()) {
        final double reach = co.getReachability(cursor);
        if (reach <= epsilon) {
            // The plot just dropped below the cut: the predecessor still belongs
            // to this cluster, so take it back out of the noise set.
            final boolean plotJustDropped = previousReach > epsilon && previous.isSet();
            if (plotJustDropped) {
                noise.remove(previous);
                members.add(previous);
            }
            members.add(cursor);
        } else {
            // Above the cut: close the cluster under construction, if any.
            // TODO: do we want a minpts restriction?
            // (Only core points are guaranteed to be in here anyway.)
            if (!members.isEmpty()) {
                result.addToplevelCluster(new Cluster<Model>(members, ClusterModel.CLUSTER));
                members = DBIDUtil.newHashSet();
            }
            noise.add(cursor);
        }
        // Remember this object for the next iteration, then move on.
        previousReach = reach;
        previous.set(cursor);
        cursor.advance();
    }

    // A cluster still open at the end of the order is emitted as well.
    if (!members.isEmpty()) {
        result.addToplevelCluster(new Cluster<Model>(members, ClusterModel.CLUSTER));
    }
    // The noise cluster is always added, even when it is empty.
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
}
Also used : DBIDVar(de.lmu.ifi.dbs.elki.database.ids.DBIDVar) Model(de.lmu.ifi.dbs.elki.data.model.Model) ClusterModel(de.lmu.ifi.dbs.elki.data.model.ClusterModel) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 2 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class SLINKHDBSCANLinearMemory method run.

/**
 * Run the algorithm
 *
 * Processes the objects one at a time, updating the pointer store
 * ({@code pi}) and the level store ({@code lambda}) through the four step
 * methods, and returns both stores together with the core distances.
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public PointerDensityHierarchyRepresentationResult run(Database db, Relation<O> relation) {
    final DistanceQuery<O> distQ = db.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<O> knnQ = db.getKNNQuery(distQ, minPts);
    // We need array addressing later.
    final ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    // Compute the core distances
    // NOTE(review): this comment used to read "minPts + 1: ignore query point",
    // but the code passes minPts unchanged both here and to getKNNQuery above.
    // Confirm against computeCoreDists whether the query point is accounted for.
    final WritableDoubleDataStore coredists = computeCoreDists(ids, knnQ, minPts);
    // Pointer store of the hierarchy representation (filled by the step methods).
    WritableDBIDDataStore pi = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
    // Level store of the hierarchy representation; every entry starts at +infinity.
    WritableDoubleDataStore lambda = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, Double.POSITIVE_INFINITY);
    // Temporary storage for m.
    WritableDoubleDataStore m = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
    // Progress is only tracked when verbose logging is on; otherwise it stays null.
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running HDBSCAN*-SLINK", ids.size(), LOG) : null;
    // has to be an array for monotonicity reasons!
    ModifiableDBIDs processedIDs = DBIDUtil.newArray(ids.size());
    for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
        // Steps 1,3,4 are exactly as in SLINK
        step1(id, pi, lambda);
        // Step 2 is modified to use a different distance
        step2(id, processedIDs, distQ, coredists, m);
        step3(id, pi, lambda, processedIDs, m);
        step4(id, pi, lambda, processedIDs);
        // The current object only joins the processed set after all four steps,
        // so the steps above see just the previously handled objects.
        processedIDs.add(id);
        LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);
    // The result also records whether the distance function reports itself as squared.
    return new PointerDensityHierarchyRepresentationResult(ids, pi, lambda, distQ.getDistanceFunction().isSquared(), coredists);
}
Also used : WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) WritableDBIDDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDBIDDataStore) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 3 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class BIRCHLeafClustering method run.

/**
 * Run the clustering algorithm.
 *
 * Builds a CF tree over the relation, assigns every object to the leaf
 * returned by {@code findLeaf}, and reports one cluster per leaf that
 * received at least one object, using the leaf centroid as the mean model.
 *
 * @param relation Input data
 * @return Clustering
 */
public Clustering<MeanModel> run(Relation<NumberVector> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);
    final CFTree cfTree = cffactory.newTree(relation.getDBIDs(), relation);
    // The CFTree does not store points, so they have to be reassigned here.
    // (The quality is also better than with the initial assignment: centers
    // move a lot, in particular in the beginning, which produced many outliers.)
    final Map<ClusteringFeature, ModifiableDBIDs> membersByLeaf = new HashMap<ClusteringFeature, ModifiableDBIDs>(cfTree.leaves);
    final DBIDIter cursor = relation.iterDBIDs();
    while (cursor.valid()) {
        final ClusteringFeature target = cfTree.findLeaf(relation.get(cursor));
        ModifiableDBIDs assigned = membersByLeaf.get(target);
        if (assigned == null) {
            // First object for this leaf: allocate its member list, sized by the leaf's n.
            assigned = DBIDUtil.newArray(target.n);
            membersByLeaf.put(target, assigned);
        }
        assigned.add(cursor);
        cursor.advance();
    }

    final Clustering<MeanModel> clustering = new Clustering<>("BIRCH-leaves", "BIRCH leaves");
    for (Map.Entry<ClusteringFeature, ModifiableDBIDs> entry : membersByLeaf.entrySet()) {
        final ClusteringFeature feature = entry.getKey();
        // Copy the leaf centroid coordinate by coordinate.
        final double[] mean = new double[dimensionality];
        for (int d = 0; d < dimensionality; d++) {
            mean[d] = feature.centroid(d);
        }
        clustering.addToplevelCluster(new Cluster<>(entry.getValue(), new MeanModel(mean)));
    }
    return clustering;
}
Also used : HashMap(java.util.HashMap) MeanModel(de.lmu.ifi.dbs.elki.data.model.MeanModel) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Map(java.util.Map)

Example 4 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class LMCLUS method run.

/**
 * Main entry point of LMCLUS (linear manifold clustering).
 *
 * <PRE>
 * The algorithm samples random linear manifolds and tries to find clusters in them.
 * It computes a separation (goodness and threshold) for the current candidate set
 * and keeps only the points whose deviation is below the threshold.
 * This is repeated per manifold dimensionality until no good separation remains
 * or the subset becomes too small; the surviving set is registered as a cluster.
 * The process starts over on the remaining points until no more clusters are found.
 * The last cluster contains the leftover points as noise (or the whole data
 * if no clusters have been found).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
    final Clustering<Model> result = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
    final FiniteProgress objectProgress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
    final IndefiniteProgress clusterProgress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
    // Working set of objects not yet assigned to any cluster.
    final ModifiableDBIDs remaining = DBIDUtil.newHashSet(relation.getDBIDs());
    final Random random = rnd.getSingleThreadedRandom();
    final int dimLimit = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
    int found = 0;

    while (remaining.size() > minsize) {
        DBIDs candidate = remaining;
        int manifoldDim = 1;
        for (int k = 1; k <= dimLimit; k++) {
            // Keep narrowing the candidate set at dimensionality k for as long
            // as a sufficiently good separation is found.
            for (;;) {
                final Separation sep = findSeparation(relation, candidate, k, random);
                if (sep.goodness <= sensitivityThreshold) {
                    break;
                }
                // Keep only the points closer to the manifold than the threshold.
                final ModifiableDBIDs kept = DBIDUtil.newArray(candidate.size());
                for (DBIDIter it = candidate.iter(); it.valid(); it.advance()) {
                    if (deviation(minusEquals(relation.get(it).toArray(), sep.originV), sep.basis) < sep.threshold) {
                        kept.add(it);
                    }
                }
                if (kept.size() < minsize) {
                    break;
                }
                candidate = kept;
                manifoldDim = k;
            }
        }
        // No more clusters found
        if (candidate.size() < minsize || candidate == remaining) {
            break;
        }
        // New cluster found
        // TODO: annotate cluster with dimensionality
        final Cluster<Model> cluster = new Cluster<>(candidate);
        cluster.setName("Cluster_" + manifoldDim + "d_" + found);
        found++;
        result.addToplevelCluster(cluster);
        // Remove from main working set.
        remaining.removeDBIDs(candidate);
        if (objectProgress != null) {
            objectProgress.setProcessed(relation.size() - remaining.size(), LOG);
        }
        if (clusterProgress != null) {
            clusterProgress.setProcessed(found, LOG);
        }
    }

    // Remaining objects are noise
    if (remaining.size() > 0) {
        result.addToplevelCluster(new Cluster<>(remaining, true));
    }
    if (objectProgress != null) {
        objectProgress.setProcessed(relation.size(), LOG);
        objectProgress.ensureCompleted(LOG);
    }
    LOG.setCompleted(clusterProgress);
    return result;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Random(java.util.Random) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) Model(de.lmu.ifi.dbs.elki.data.model.Model)

Example 5 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class EM method run.

/**
 * Performs the EM clustering algorithm on the given database.
 *
 * The result is a hard clustering in which each object is assigned to the
 * cluster for which it has the highest probability. When soft assignments
 * are enabled, the complete probability vectors are attached to the result
 * as a child relation; otherwise their storage is destroyed.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 * @throws IllegalArgumentException if the relation contains no objects
 */
public Clustering<M> run(Database database, Relation<V> relation) {
    if (relation.size() == 0) {
        throw new IllegalArgumentException("database empty: must contain elements");
    }
    // Initial models and the first probability assignment.
    final List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
    final WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    double currentLL = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    final DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
    if (LOG.isStatistics()) {
        LOG.statistics(likestat.setDouble(currentLL));
    }

    // Best log-likelihood seen so far, used for detecting instabilities.
    double bestLL = currentLL;
    int lastImprovedAt = 0;
    // Iteration counter starts at 1; a negative maxiter means "no limit".
    int iteration = 1;
    while (iteration < maxiter || maxiter < 0) {
        final double previousLL = currentLL;
        recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
        // reassign probabilities
        currentLL = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isStatistics()) {
            LOG.statistics(likestat.setDouble(currentLL));
        }
        if (currentLL - bestLL > delta) {
            lastImprovedAt = iteration;
            bestLL = currentLL;
        }
        // Stop when the likelihood barely changed, or when the last real
        // improvement lies before half of the current iteration count.
        final boolean converged = Math.abs(currentLL - previousLL) <= delta;
        final boolean stalled = lastImprovedAt < (iteration >> 1);
        if (converged || stalled) {
            break;
        }
        iteration++;
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }

    // One (initially empty) member list per model.
    final List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
    for (int c = 0; c < k; c++) {
        hardClusters.add(DBIDUtil.newArray());
    }
    // Hard assignment: each object goes to its most probable cluster.
    for (DBIDIter obj = relation.iterDBIDs(); obj.valid(); obj.advance()) {
        final double[] probabilities = probClusterIGivenX.get(obj);
        final int best = argmax(probabilities);
        hardClusters.get(best).add(obj);
    }

    final Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
    // Pair every member list with its finalized model.
    for (int c = 0; c < k; c++) {
        result.addToplevelCluster(new Cluster<>(hardClusters.get(c), models.get(c).finalizeCluster()));
    }
    if (isSoft()) {
        result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
    } else {
        probClusterIGivenX.destroy();
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Aggregations

ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)80 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)44 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)30 ArrayList (java.util.ArrayList)30 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)28 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)18 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)15 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)14 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)12 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)12 Model (de.lmu.ifi.dbs.elki.data.model.Model)11 DBID (de.lmu.ifi.dbs.elki.database.ids.DBID)11 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)10 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)10 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)9 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)9 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)8 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)8 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)7