Search in sources :

Example 36 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class CLIQUE method run.

/**
 * Performs the CLIQUE algorithm on the given database.
 *
 * @param relation Data relation to process
 * @return Clustering result
 */
public Clustering<SubspaceModel> run(Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);
    StepProgress step = new StepProgress(2);
    // 1. Identification of subspaces that contain clusters
    step.beginStep(1, "Identification of subspaces that contain clusters", LOG);
    ArrayList<List<CLIQUESubspace<V>>> dimensionToDenseSubspaces = new ArrayList<>(dimensionality);
    List<CLIQUESubspace<V>> denseSubspaces = findOneDimensionalDenseSubspaces(relation);
    dimensionToDenseSubspaces.add(denseSubspaces);
    if (LOG.isVerbose()) {
        LOG.verbose("1-dimensional dense subspaces: " + denseSubspaces.size());
    }
    if (LOG.isDebugging()) {
        for (CLIQUESubspace<V> s : denseSubspaces) {
            LOG.debug(s.toString(" "));
        }
    }
    for (int k = 2; k <= dimensionality && !denseSubspaces.isEmpty(); k++) {
        denseSubspaces = findDenseSubspaces(relation, denseSubspaces);
        assert (dimensionToDenseSubspaces.size() == k - 1);
        dimensionToDenseSubspaces.add(denseSubspaces);
        if (LOG.isVerbose()) {
            LOG.verbose(k + "-dimensional dense subspaces: " + denseSubspaces.size());
        }
        if (LOG.isDebugging()) {
            for (CLIQUESubspace<V> s : denseSubspaces) {
                LOG.debug(s.toString(" "));
            }
        }
    }
    // 2. Identification of clusters
    step.beginStep(2, "Identification of clusters", LOG);
    // build result
    Clustering<SubspaceModel> result = new Clustering<>("CLIQUE clustering", "clique-clustering");
    for (int dim = 0; dim < dimensionToDenseSubspaces.size(); dim++) {
        List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim);
        List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces);
        if (LOG.isVerbose()) {
            LOG.verbose((dim + 1) + "-dimensional clusters: " + modelsAndClusters.size());
        }
        for (Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) {
            Cluster<SubspaceModel> newCluster = new Cluster<>(modelAndCluster.second);
            newCluster.setModel(new SubspaceModel(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).getArrayRef()));
            result.addToplevelCluster(newCluster);
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) CLIQUESubspace(de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace) CLIQUESubspace(de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) ArrayList(java.util.ArrayList) List(java.util.List) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Pair(de.lmu.ifi.dbs.elki.utilities.pairs.Pair)

Example 37 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class KMeansBatchedLloyd method run.

@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initializer", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    ArrayDBIDs[] parts = DBIDUtil.randomSplit(relation.getDBIDs(), blocks, random);
    double[][] meanshift = new double[k][dim];
    int[] changesize = new int[k];
    double[] varsum = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        LOG.incrementProcessed(prog);
        boolean changed = false;
        FiniteProgress pprog = LOG.isVerbose() ? new FiniteProgress("Batch", parts.length, LOG) : null;
        for (int p = 0; p < parts.length; p++) {
            // Initialize new means scratch space.
            for (int i = 0; i < k; i++) {
                Arrays.fill(meanshift[i], 0.);
            }
            Arrays.fill(changesize, 0);
            Arrays.fill(varsum, 0.);
            changed |= assignToNearestCluster(relation, parts[p], means, meanshift, changesize, clusters, assignment, varsum);
            // Recompute means.
            updateMeans(means, meanshift, clusters, changesize);
            LOG.incrementProcessed(pprog);
        }
        LOG.ensureCompleted(pprog);
        logVarstat(varstat, varsum);
        // Stop if no cluster assignment changed.
        if (!changed) {
            break;
        }
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
        DBIDs ids = clusters.get(i);
        if (ids.size() == 0) {
            continue;
        }
        KMeansModel model = new KMeansModel(means[i], varsum[i]);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 38 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class KMeansHamerly method initialAssignToNearestCluster.

/**
 * Reassign objects, but only if their bounds indicate it is necessary to do
 * so.
 *
 * @param relation Data
 * @param means Current means
 * @param sums Running sums of the new means
 * @param clusters Current clusters
 * @param assignment Cluster assignment
 * @param upper Upper bounds
 * @param lower Lower bounds
 * @return true when the object was reassigned
 */
private int initialAssignToNearestCluster(Relation<V> relation, double[][] means, double[][] sums, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, WritableDoubleDataStore upper, WritableDoubleDataStore lower) {
    assert (k == means.length);
    boolean issquared = distanceFunction.isSquared();
    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        V fv = relation.get(it);
        // Find closest center, and distance to two closest centers
        double min1 = Double.POSITIVE_INFINITY, min2 = Double.POSITIVE_INFINITY;
        int minIndex = -1;
        for (int i = 0; i < k; i++) {
            double dist = distanceFunction.distance(fv, DoubleVector.wrap(means[i]));
            if (dist < min1) {
                minIndex = i;
                min2 = min1;
                min1 = dist;
            } else if (dist < min2) {
                min2 = dist;
            }
        }
        // make squared Euclidean a metric:
        if (issquared) {
            min1 = FastMath.sqrt(min1);
            min2 = FastMath.sqrt(min2);
        }
        ModifiableDBIDs newc = clusters.get(minIndex);
        newc.add(it);
        assignment.putInt(it, minIndex);
        double[] newmean = sums[minIndex];
        for (int d = 0; d < fv.getDimensionality(); d++) {
            newmean[d] += fv.doubleValue(d);
        }
        upper.putDouble(it, min1);
        lower.putDouble(it, min2);
    }
    return relation.size();
}
Also used : ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 39 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class KMediansLloyd method run.

@Override
public Clustering<MeanModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Medians Clustering", "kmedians-clustering");
    }
    // Choose initial medians
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] distsum = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
    int iteration = 0;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        LOG.incrementProcessed(prog);
        boolean changed = assignToNearestCluster(relation, medians, clusters, assignment, distsum);
        // Stop if no cluster assignment changed.
        if (!changed) {
            break;
        }
        // Recompute medians.
        medians = medians(clusters, medians, relation);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<MeanModel> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering");
    for (int i = 0; i < clusters.size(); i++) {
        MeanModel model = new MeanModel(medians[i]);
        result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) ArrayList(java.util.ArrayList) MeanModel(de.lmu.ifi.dbs.elki.data.model.MeanModel) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 40 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class SingleAssignmentKMeans method run.

@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Assignment", "kmeans-assignment");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];
    assignToNearestCluster(relation, means, clusters, assignment, varsum);
    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("Nearest Centroid Clustering", "nearest-center-clustering");
    for (int i = 0; i < clusters.size(); i++) {
        KMeansModel model = new KMeansModel(means[i], varsum[i]);
        result.addToplevelCluster(new Cluster<>(clusters.get(i), model));
    }
    return result;
}
Also used : StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) ArrayList(java.util.ArrayList) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)

Aggregations

ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)80 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)44 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)30 ArrayList (java.util.ArrayList)30 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)28 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)18 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)15 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)14 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)12 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)12 Model (de.lmu.ifi.dbs.elki.data.model.Model)11 DBID (de.lmu.ifi.dbs.elki.database.ids.DBID)11 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)10 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)10 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)9 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)9 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)8 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)8 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)7