Search in sources :

Example 1 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

the class ParallelLloydKMeans method run.

/**
 * Run k-means: repeatedly assign objects to the current means and recompute
 * the means, until no assignment changes or the iteration limit is reached.
 *
 * @param database Database, passed on to the initialization method
 * @param relation Relation holding the vectors to cluster
 * @return Clustering result, containing one cluster per non-empty partition
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    final DBIDs ids = relation.getDBIDs();
    // Starting means, as produced by the configured initialization method.
    double[][] centers = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Cluster number of each object; -1 is the default value (nothing assigned yet).
    final WritableIntegerDataStore labels = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    // One slot per cluster; handed to the processor and read back when building the models.
    final double[] varsum = new double[k];
    final KMeansProcessor<V> processor = new KMeansProcessor<>(relation, distanceFunction, labels, varsum);

    IndefiniteProgress progress = null;
    if (LOG.isVerbose()) {
        progress = new IndefiniteProgress("K-Means iteration", LOG);
    }
    // A non-positive maxiter means "no iteration limit".
    for (int round = 0; maxiter <= 0 || round < maxiter; round++) {
        LOG.incrementProcessed(progress);
        processor.nextIteration(centers);
        ParallelExecutor.run(ids, processor);
        final boolean stable = !processor.changed();
        if (stable) {
            // No cluster assignment changed, so we are done.
            break;
        }
        centers = processor.getMeans();
    }
    LOG.setCompleted(progress);

    // Turn the integer labels into one DBID set per cluster and wrap the result.
    final ArrayModifiableDBIDs[] partitions = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, labels, k);
    final Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int c = 0; c < partitions.length; c++) {
        final DBIDs members = partitions[c];
        // Empty partitions are not reported as clusters.
        if (members.size() != 0) {
            result.addToplevelCluster(new Cluster<>(members, new KMeansModel(centers[c], varsum[c])));
        }
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)

Example 2 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

the class LMCLUS method run.

/**
 * Main loop of LMCLUS (linear manifold clustering).
 *
 * <PRE>
 * While more than minsize objects are still unclustered, the method tries to
 * narrow the working set down to a cluster: for every manifold dimensionality
 * from 1 up to the maximum, it repeatedly searches for a separation, and keeps
 * only those points whose deviation is below the separation threshold.
 * Narrowing at one dimensionality ends when the separation is not good enough,
 * or when fewer than minsize points would remain.
 * The narrowed set is registered as a cluster and removed from the working set.
 * When no further cluster can be found, all remaining objects are added as one
 * final noise cluster (this is the whole data, if no cluster was found at all).
 * For details see {@link LMCLUS}.
 * </PRE>
 *
 * @param database The database to operate on
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<NumberVector> relation) {
    final Clustering<Model> result = new Clustering<>("LMCLUS Clustering", "lmclus-clustering");
    final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Clustered objects", relation.size(), LOG) : null;
    final IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Clusters found", LOG) : null;
    final ModifiableDBIDs unclustered = DBIDUtil.newHashSet(relation.getDBIDs());
    final Random rng = rnd.getSingleThreadedRandom();
    final int maxdim = Math.min(maxLMDim, RelationUtil.dimensionality(relation));
    int found = 0;
    while (unclustered.size() > minsize) {
        // Start from everything that is still unclustered.
        DBIDs candidate = unclustered;
        // Dimensionality at which the candidate set was narrowed last.
        int bestDim = 1;
        for (int dim = 1; dim <= maxdim; dim++) {
            // Keep narrowing at this dimensionality for as long as a good
            // separation exists and enough points remain.
            for (;;) {
                final Separation sep = findSeparation(relation, candidate, dim, rng);
                if (sep.goodness <= sensitivityThreshold) {
                    // Separation not good enough: continue with the next dimensionality.
                    break;
                }
                // Keep only the points whose deviation is below the threshold.
                final ModifiableDBIDs kept = DBIDUtil.newArray(candidate.size());
                for (DBIDIter it = candidate.iter(); it.valid(); it.advance()) {
                    if (deviation(minusEquals(relation.get(it).toArray(), sep.originV), sep.basis) < sep.threshold) {
                        kept.add(it);
                    }
                }
                if (kept.size() < minsize) {
                    // Too few points would remain: keep the previous candidate set.
                    break;
                }
                candidate = kept;
                bestDim = dim;
            }
        }
        // The identity check detects that the working set was never narrowed.
        final boolean nothingFound = candidate.size() < minsize || candidate == unclustered;
        if (nothingFound) {
            break;
        }
        // New cluster found
        // TODO: annotate cluster with dimensionality
        final Cluster<Model> cluster = new Cluster<>(candidate);
        cluster.setName("Cluster_" + bestDim + "d_" + found);
        found++;
        result.addToplevelCluster(cluster);
        // Remove the members from the main working set.
        unclustered.removeDBIDs(candidate);
        if (progress != null) {
            progress.setProcessed(relation.size() - unclustered.size(), LOG);
        }
        if (cprogress != null) {
            cprogress.setProcessed(found, LOG);
        }
    }
    // Remaining objects are noise
    if (unclustered.size() > 0) {
        result.addToplevelCluster(new Cluster<>(unclustered, true));
    }
    if (progress != null) {
        progress.setProcessed(relation.size(), LOG);
        progress.ensureCompleted(LOG);
    }
    LOG.setCompleted(cprogress);
    return result;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Random(java.util.Random) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) Model(de.lmu.ifi.dbs.elki.data.model.Model) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 3 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

the class KMedoidsEM method run.

/**
 * Run k-medoids with alternating optimization: in each round, every cluster
 * replaces its medoid by the member with the smallest distance sum to the
 * cluster, then all objects are reassigned to their nearest medoid. The
 * algorithm stops as soon as a round does not change any medoid.
 *
 * @param database Database
 * @param relation relation to use
 * @return result (an empty clustering, if the relation is empty)
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    }
    // Only enforce a distance matrix for PAM initialization, which is slow.
    final DistanceQuery<V> distQ;
    if (initializer instanceof PAMInitialMeans) {
        distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
    } else {
        distQ = database.getDistanceQuery(relation, getDistanceFunction());
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    // Choose initial medoids
    final ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
    final DBIDArrayMIter miter = medoids.iter();
    // Per cluster: the cost of the current medoid, used as the bound a replacement has to beat.
    final double[] mdists = new double[k];
    // One member set per cluster, seeded with the medoid of that cluster.
    final List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int c = 0; c < k; c++) {
        final HashSetModifiableDBIDs members = DBIDUtil.newHashSet(relation.size() / k);
        members.add(miter.seek(c));
        clusters.add(members);
    }
    // Initial assignment to nearest medoids
    // TODO: reuse this information, from the build phase, when possible?
    final double initialCost = assignToNearestCluster(miter, mdists, clusters, distQ);
    if (LOG.isStatistics()) {
        LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", initialCost));
    }
    final IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
    // Swap phase
    int iteration = 0;
    final DBIDVar best = DBIDUtil.newVar();
    for (;;) {
        boolean swapped = false;
        // Try to replace each medoid with a better member of its own cluster.
        int c = 0;
        for (miter.seek(0); miter.valid(); miter.advance(), c++) {
            final ModifiableDBIDs members = clusters.get(c);
            best.unset();
            double bestCost = mdists[c];
            for (DBIDIter cand = members.iter(); cand.valid(); cand.advance()) {
                // The current medoid is not a candidate for replacing itself.
                if (DBIDUtil.equal(miter, cand)) {
                    continue;
                }
                // Cost of a candidate: sum of its distances to all cluster members.
                double cost = 0;
                for (DBIDIter other = members.iter(); other.valid(); other.advance()) {
                    cost += distQ.distance(cand, other);
                }
                if (cost < bestCost) {
                    best.set(cand);
                    bestCost = cost;
                }
            }
            if (best.isSet() && !DBIDUtil.equal(miter, best)) {
                swapped = true;
                assert (members.contains(best));
                medoids.set(c, best);
                mdists[c] = bestCost;
            }
        }
        // Converged when no medoid was replaced in this round.
        if (!swapped) {
            break;
        }
        // Reassign
        final double cost = assignToNearestCluster(miter, mdists, clusters, distQ);
        ++iteration;
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", cost));
        }
        LOG.incrementProcessed(prog);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result: one cluster per medoid, matched up by array offset.
    final Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
        final MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
        result.addToplevelCluster(new Cluster<>(clusters.get(it.getOffset()), model));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) PAMInitialMeans(de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 4 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

the class HiCS method calculateSubspaces.

/**
 * Identifies high contrast subspaces in a given full-dimensional database.
 *
 * All two-dimensional subspaces are evaluated first. Then, level by level, the
 * retained d-1 dimensional candidates are joined pairwise into d-dimensional
 * subspaces, until a level produces no new candidates. A candidate is removed
 * from the result again, if any subspace generated from its level has a higher
 * contrast.
 *
 * @param relation the relation the HiCS should be evaluated for
 * @param subspaceIndex Subspace indexes
 * @param random Random generator, passed on to the contrast computation
 * @return a set of high contrast subspaces
 */
private Set<HiCSSubspace> calculateSubspaces(Relation<? extends NumberVector> relation, ArrayList<ArrayDBIDs> subspaceIndex, Random random) {
    final int dbdim = RelationUtil.dimensionality(relation);
    FiniteProgress dprog = LOG.isVerbose() ? new FiniteProgress("Subspace dimensionality", dbdim, LOG) : null;
    if (dprog != null) {
        // Clamp: with fewer than two dimensions, 2 would exceed the progress total.
        // NOTE(review): FiniteProgress is expected to reject counts above its total - confirm.
        dprog.setProcessed(Math.min(2, dbdim), LOG);
    }
    // All subspaces kept as result so far.
    TreeSet<HiCSSubspace> subspaceList = new TreeSet<>(HiCSSubspace.SORT_BY_SUBSPACE);
    // Candidates of the current dimensionality, bounded in size by the cutoff.
    TopBoundedHeap<HiCSSubspace> dDimensionalList = new TopBoundedHeap<>(cutoff, HiCSSubspace.SORT_BY_CONTRAST_ASC);
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Generating two-element subsets", (dbdim * (dbdim - 1)) >> 1, LOG) : null;
    // compute two-element sets of subspaces
    for (int i = 0; i < dbdim; i++) {
        for (int j = i + 1; j < dbdim; j++) {
            HiCSSubspace ts = new HiCSSubspace();
            ts.set(i);
            ts.set(j);
            calculateContrast(relation, ts, subspaceIndex, random);
            dDimensionalList.add(ts);
            LOG.incrementProcessed(prog);
        }
    }
    LOG.ensureCompleted(prog);
    IndefiniteProgress qprog = LOG.isVerbose() ? new IndefiniteProgress("Testing subspace candidates", LOG) : null;
    for (int d = 3; !dDimensionalList.isEmpty(); d++) {
        if (dprog != null) {
            // d runs one past dbdim when the previous round generated
            // full-dimensional candidates (the list is then non-empty, but no
            // join can reach cardinality d anymore). Clamp, so that the
            // reported count never exceeds the progress total.
            dprog.setProcessed(Math.min(d, dbdim), LOG);
        }
        // Move the retained (d-1)-dimensional candidates into the result and
        // into a list that can be sorted and joined.
        ArrayList<HiCSSubspace> candidateList = new ArrayList<>(dDimensionalList.size());
        for (Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
            subspaceList.add(it.get());
            candidateList.add(it.get());
        }
        dDimensionalList.clear();
        // candidateList now contains the *m* best (d-1)-dimensional sets
        Collections.sort(candidateList, HiCSSubspace.SORT_BY_SUBSPACE);
        // TODO: optimize APRIORI style, by not even computing the bit set or?
        for (int i = 0; i < candidateList.size() - 1; i++) {
            for (int j = i + 1; j < candidateList.size(); j++) {
                HiCSSubspace set1 = candidateList.get(i);
                HiCSSubspace set2 = candidateList.get(j);
                HiCSSubspace joinedSet = new HiCSSubspace();
                joinedSet.or(set1);
                joinedSet.or(set2);
                // Only keep joins that add exactly one dimension.
                if (joinedSet.cardinality() != d) {
                    continue;
                }
                calculateContrast(relation, joinedSet, subspaceIndex, random);
                dDimensionalList.add(joinedSet);
                LOG.incrementProcessed(qprog);
            }
        }
        // Prune: drop a candidate from the result, if any of the newly
        // generated subspaces has a higher contrast.
        for (HiCSSubspace cand : candidateList) {
            for (Heap<HiCSSubspace>.UnorderedIter it = dDimensionalList.unorderedIter(); it.valid(); it.advance()) {
                if (it.get().contrast > cand.contrast) {
                    subspaceList.remove(cand);
                    break;
                }
            }
        }
    }
    LOG.setCompleted(qprog);
    if (dprog != null) {
        dprog.setProcessed(dbdim, LOG);
        dprog.ensureCompleted(LOG);
    }
    return subspaceList;
}
Also used : TopBoundedHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TopBoundedHeap) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) TreeSet(java.util.TreeSet) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ArrayList(java.util.ArrayList) Heap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.Heap) TopBoundedHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.TopBoundedHeap)

Example 5 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

the class DWOF method run.

/**
 * Performs the Generalized DWOF_SCORE algorithm on the given database.
 *
 * After the radii have been initialized, each cycle multiplies every radius by
 * delta, clusters the objects using the enlarged radii, updates the cluster
 * size of each object, and adds the ratio of the previous cluster size (minus
 * one) to the new cluster size to the score of the object. The cycles end
 * once the size update reports no unmerged objects anymore.
 *
 * @param database Database to query
 * @param relation Data to process
 * @return new OutlierResult instance
 */
public OutlierResult run(Database database, Relation<O> relation) {
    final DBIDs ids = relation.getDBIDs();
    final DistanceQuery<O> distance = database.getDistanceQuery(relation, getDistanceFunction());
    // Get k nearest neighbor and range query on the relation.
    final KNNQuery<O> knnQuery = database.getKNNQuery(distance, k, DatabaseQuery.HINT_HEAVY_USE);
    final RangeQuery<O> rangeQuery = database.getRangeQuery(distance, DatabaseQuery.HINT_HEAVY_USE);
    StepProgress steps = null;
    if (LOG.isVerbose()) {
        steps = new StepProgress("DWOF", 2);
    }
    // DWOF output score storage.
    final WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_DB | DataStoreFactory.HINT_HOT, 0.);
    if (steps != null) {
        steps.beginStep(1, "Initializing objects' Radii", LOG);
    }
    final WritableDoubleDataStore radius = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
    // Find an initial radius for each object:
    initializeRadii(ids, knnQuery, distance, radius);
    // Cluster sizes of the previous and of the current cycle; both default to 1.
    WritableIntegerDataStore previousSizes = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT, 1);
    WritableIntegerDataStore currentSizes = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT, 1);
    int unmerged = relation.size();
    if (steps != null) {
        steps.beginStep(2, "Clustering-Evaluating Cycles.", LOG);
    }
    IndefiniteProgress cycles = null;
    if (LOG.isVerbose()) {
        cycles = new IndefiniteProgress("Evaluating DWOFs", LOG);
    }
    while (unmerged > 0) {
        LOG.incrementProcessed(cycles);
        // Increase radii
        for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
            radius.putDouble(it, radius.doubleValue(it) * delta);
        }
        // stores the clustering label for each object
        final WritableDataStore<ModifiableDBIDs> labels = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_TEMP, ModifiableDBIDs.class);
        // Cluster objects based on the current radius
        clusterData(ids, rangeQuery, radius, labels);
        // Swap the two stores: the sizes of the last cycle become the previous
        // sizes, and the other store is reused for the new sizes.
        final WritableIntegerDataStore swap = currentSizes;
        currentSizes = previousSizes;
        previousSizes = swap;
        // Update the cluster size count for each object.
        unmerged = updateSizes(ids, labels, currentSizes);
        labels.destroy();
        // Update DWOF scores.
        for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
            final int now = currentSizes.intValue(it);
            final double increment;
            if (now > 0) {
                increment = (double) (previousSizes.intValue(it) - 1) / (double) now;
            } else {
                // Avoid a division by zero; such objects contribute nothing.
                increment = 0.0;
            }
            scores.putDouble(it, scores.doubleValue(it) + increment);
        }
    }
    LOG.setCompleted(cycles);
    LOG.setCompleted(steps);
    // Build result representation.
    final DoubleMinMax range = new DoubleMinMax();
    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        range.put(scores.doubleValue(it));
    }
    final OutlierScoreMeta meta = new InvertedOutlierScoreMeta(range.getMin(), range.getMax(), 0.0, Double.POSITIVE_INFINITY);
    final DoubleRelation scoreRelation = new MaterializedDoubleRelation("Dynamic-Window Outlier Factors", "dwof-outlier", scores, ids);
    return new OutlierResult(meta, scoreRelation);
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) InvertedOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) InvertedOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)

Aggregations

IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)28 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)20 ArrayList (java.util.ArrayList)16 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)15 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)14 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)14 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)13 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)11 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)11 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)8 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)7 Model (de.lmu.ifi.dbs.elki.data.model.Model)4 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)4 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)4 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)3 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)3 AbstractProjectedClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedClustering)2 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)2 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)2