Search in sources :

Example 86 with DoubleVector

use of de.lmu.ifi.dbs.elki.data.DoubleVector in project elki by elki-project.

Class KMeansElkan, method run.

/**
 * Run the clustering on the given relation.
 *
 * Chooses initial means via the configured initializer, then alternates
 * between assigning objects to clusters and recomputing the means, until no
 * assignment changes or {@code maxiter} iterations were performed (a
 * {@code maxiter} of 0 or less disables the iteration limit).
 *
 * @param database Database, passed on to the initializer
 * @param relation Relation to cluster
 * @return Clustering result; clusters without members are omitted, and an
 *         empty clustering is returned for an empty relation
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    // Nothing to cluster: return an empty result without touching the initializer.
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        // Size argument is twice the average cluster size (2 * n / k);
        // presumably a capacity hint for the hash set.
        clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    // Per-object cluster number; the -1 is presumably the default value
    // meaning "not assigned yet" - TODO confirm against DataStoreUtil.
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    // Elkan bounds: one upper bound per object, and one lower bound per
    // object and cluster (an array of length k for every object).
    WritableDoubleDataStore upper = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.POSITIVE_INFINITY);
    WritableDataStore<double[]> lower = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, double[].class);
    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        // Fresh arrays are filled with 0.
        lower.put(it, new double[k]);
    }
    // Storage for updated means. The dimensionality is read from the first
    // mean, so this requires at least one initial mean.
    final int dim = means[0].length;
    double[][] sums = new double[k][dim];
    // Cluster separation
    double[] sep = new double[k];
    // Cluster distances
    double[][] cdist = new double[k][k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    // NOTE(review): this key (and the variance-sum key below) is built from
    // this.getClass().getName(), while the initialization and iterations
    // statistics in this method use KEY - confirm whether that is intended.
    LongStatistic rstat = LOG.isStatistics() ? new LongStatistic(this.getClass().getName() + ".reassignments") : null;
    int iteration = 0;
    // Iterate until convergence, or until maxiter is reached (if maxiter > 0).
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        LOG.incrementProcessed(prog);
        int changed;
        if (iteration == 0) {
            // First pass: uses the initial assignment variant, which takes
            // neither sep nor cdist.
            changed = initialAssignToNearestCluster(relation, means, sums, clusters, assignment, upper, lower);
        } else {
            // Later passes: update sep and cdist for the current means first,
            // then reassign using them together with the stored bounds.
            recomputeSeperation(means, sep, cdist);
            changed = assignToNearestCluster(relation, means, sums, clusters, assignment, sep, cdist, upper, lower);
        }
        if (rstat != null) {
            rstat.setLong(changed);
            LOG.statistics(rstat);
        }
        // Stop if no cluster assignment changed. Note that leaving via break
        // skips the loop increment, so this final pass is not counted in
        // the iterations statistic below.
        if (changed == 0) {
            break;
        }
        // Recompute means: scale each sum by 1 / cluster size, in place.
        // For empty clusters the factor is 1, i.e. the values stay as they are.
        for (int i = 0; i < k; i++) {
            final int s = clusters.get(i).size();
            timesEquals(sums[i], s > 0 ? 1. / s : 1.);
        }
        // Overwrites sep; called with the old means and the new means (now
        // held in sums) - presumably stores how far each mean moved.
        maxMoved(means, sums, sep);
        updateBounds(relation, assignment, upper, lower, sep);
        for (int i = 0; i < k; i++) {
            final int s = clusters.get(i).size();
            // Only now replace the old means by the new ones.
            System.arraycopy(sums[i], 0, means[i], 0, dim);
            // Restore to sum for next iteration
            timesEquals(sums[i], s > 0 ? s : 1.);
        }
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // The bounds are only needed while iterating.
    upper.destroy();
    lower.destroy();
    // Wrap result
    double totalvariance = 0.;
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
        DBIDs ids = clusters.get(i);
        // Empty clusters are not reported.
        if (ids.size() == 0) {
            continue;
        }
        double[] mean = means[i];
        // Remains 0 if variance statistics are disabled.
        double varsum = 0.;
        if (varstat) {
            DoubleVector mvec = DoubleVector.wrap(mean);
            for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
                // NOTE(review): distances are summed as returned, without
                // squaring - this presumably assumes a squared distance
                // function; confirm.
                varsum += distanceFunction.distance(mvec, relation.get(it));
            }
            totalvariance += varsum;
        }
        KMeansModel model = new KMeansModel(mean, varsum);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    if (LOG.isStatistics() && varstat) {
        LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".variance-sum", totalvariance));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector)

Example 87 with DoubleVector

use of de.lmu.ifi.dbs.elki.data.DoubleVector in project elki by elki-project.

Class KMeansSort, method recomputeSeperation.

/**
 * Refresh the center-to-center distances and the neighbor order of each
 * center.
 *
 * The distance of every pair of distinct means is computed once and stored
 * symmetrically in {@code cdist}; the diagonal is not written. For each
 * center {@code i}, the first {@code k - 1} entries of {@code cnum[i]} are
 * filled with the numbers of all other centers and then sorted by
 * {@link DoubleIntegerArrayQuickSort}, using the distances to center
 * {@code i} as keys.
 *
 * @param means Means
 * @param cdist Center-to-Center distances (output)
 * @param cnum Center numbers, one row per center (output)
 * @param diststat Distance counting statistic, may be {@code null}
 */
private void recomputeSeperation(double[][] means, double[][] cdist, int[][] cnum, LongStatistic diststat) {
    final int k = means.length;
    final int others = k - 1;
    // Pairwise distances: compute below the diagonal, mirror above it.
    for (int a = 1; a < k; a++) {
        final DoubleVector meanA = DoubleVector.wrap(means[a]);
        for (int b = 0; b < a; b++) {
            final double d = distanceFunction.distance(meanA, DoubleVector.wrap(means[b]));
            cdist[b][a] = d;
            cdist[a][b] = d;
        }
    }
    // Scratch array for the sort keys, reused for every center.
    final double[] keys = new double[others];
    for (int c = 0; c < k; c++) {
        final double[] row = cdist[c];
        final int[] order = cnum[c];
        // Copy the row without its diagonal entry.
        System.arraycopy(row, 0, keys, 0, c);
        System.arraycopy(row, c + 1, keys, c, others - c);
        // Matching center numbers: all ids except c itself, ascending.
        int pos = 0;
        for (int id = 0; id < k; id++) {
            if (id != c) {
                order[pos++] = id;
            }
        }
        DoubleIntegerArrayQuickSort.sort(keys, order, others);
    }
    // One distance computation per unordered pair of centers.
    if (diststat != null) {
        diststat.increment((k * others) >> 1);
    }
}
Also used : DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector)

Example 88 with DoubleVector

use of de.lmu.ifi.dbs.elki.data.DoubleVector in project elki by elki-project.

Class KMeansHamerly, method run.

/**
 * Run the clustering on the given relation.
 *
 * Chooses initial means via the configured initializer, then alternates
 * between assigning objects to clusters and recomputing the means, until no
 * assignment changes or {@code maxiter} iterations were performed (a
 * {@code maxiter} of 0 or less disables the iteration limit).
 *
 * @param database Database, passed on to the initializer
 * @param relation Relation to cluster
 * @return Clustering result; clusters without members are omitted, and an
 *         empty clustering is returned for an empty relation
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    // Nothing to cluster: return an empty result without touching the initializer.
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        // Size argument is twice the average cluster size (2 * n / k);
        // presumably a capacity hint for the hash set.
        clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
    }
    // Per-object cluster number; the -1 is presumably the default value
    // meaning "not assigned yet" - TODO confirm against DataStoreUtil.
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    // Hamerly bounds: a single upper and a single lower bound per object.
    WritableDoubleDataStore upper = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.POSITIVE_INFINITY);
    WritableDoubleDataStore lower = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, 0.);
    // Storage for updated means. The dimensionality is read from the first
    // mean, so this requires at least one initial mean.
    final int dim = means[0].length;
    double[][] sums = new double[k][dim];
    // Separation of means / distance moved: the same array is passed to
    // recomputeSeperation and to maxMoved below.
    double[] sep = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    LongStatistic rstat = LOG.isStatistics() ? new LongStatistic(KEY + ".reassignments") : null;
    int iteration = 0;
    // Iterate until convergence, or until maxiter is reached (if maxiter > 0).
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        LOG.incrementProcessed(prog);
        int changed;
        if (iteration == 0) {
            // First pass: uses the initial assignment variant, which does
            // not take sep.
            changed = initialAssignToNearestCluster(relation, means, sums, clusters, assignment, upper, lower);
        } else {
            // Later passes: update sep for the current means first, then
            // reassign using it together with the stored bounds.
            recomputeSeperation(means, sep);
            changed = assignToNearestCluster(relation, means, sums, clusters, assignment, sep, upper, lower);
        }
        if (rstat != null) {
            rstat.setLong(changed);
            LOG.statistics(rstat);
        }
        // Stop if no cluster assignment changed. Note that leaving via break
        // skips the loop increment, so this final pass is not counted in
        // the iterations statistic below.
        if (changed == 0) {
            break;
        }
        // Recompute means: scale each sum by 1 / cluster size, in place.
        // For empty clusters the factor is 1, i.e. the values stay as they are.
        for (int i = 0; i < k; i++) {
            final int s = clusters.get(i).size();
            timesEquals(sums[i], s > 0 ? 1. / s : 1.);
        }
        // Called with the old means and the new means (now held in sums);
        // presumably writes the per-cluster movement into sep and returns
        // the largest one - confirm against maxMoved.
        double delta = maxMoved(means, sums, sep);
        updateBounds(relation, assignment, upper, lower, sep, delta);
        for (int i = 0; i < k; i++) {
            final int s = clusters.get(i).size();
            // Only now replace the old means by the new ones.
            System.arraycopy(sums[i], 0, means[i], 0, dim);
            // Restore to sum for next iteration
            timesEquals(sums[i], s > 0 ? s : 1.);
        }
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // The bounds are only needed while iterating.
    upper.destroy();
    lower.destroy();
    // Wrap result
    double totalvariance = 0.;
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < clusters.size(); i++) {
        DBIDs ids = clusters.get(i);
        // Empty clusters are not reported.
        if (ids.size() == 0) {
            continue;
        }
        double[] mean = means[i];
        // Remains 0 if variance statistics are disabled.
        double varsum = 0.;
        if (varstat) {
            DoubleVector mvec = DoubleVector.wrap(mean);
            for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
                // NOTE(review): distances are summed as returned, without
                // squaring - this presumably assumes a squared distance
                // function; confirm.
                varsum += distanceFunction.distance(mvec, relation.get(it));
            }
            totalvariance += varsum;
        }
        KMeansModel model = new KMeansModel(mean, varsum);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    if (LOG.isStatistics() && varstat) {
        // NOTE(review): this key is built from this.getClass().getName(),
        // while the other statistics in this method use KEY - confirm
        // whether that is intended.
        LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".variance-sum", totalvariance));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector)

Example 89 with DoubleVector

use of de.lmu.ifi.dbs.elki.data.DoubleVector in project elki by elki-project.

Class AbstractKMeansQualityMeasure, method varianceOfCluster.

/**
 * Compute how much a single cluster contributes to the total variance.
 *
 * When the cluster carries a {@link KMeansModel}, the value stored in the
 * model is returned as is. Otherwise, the contribution is recomputed as the
 * sum of squared distances of the cluster members to the model mean; values
 * of a distance function that reports itself as squared are added without
 * squaring them again.
 *
 * @param cluster Cluster to access
 * @param distanceFunction Distance function
 * @param relation Data relation
 * @param <V> Vector type
 * @return Cluster variance
 */
public static <V extends NumberVector> double varianceOfCluster(Cluster<? extends MeanModel> cluster, NumberVectorDistanceFunction<? super V> distanceFunction, Relation<V> relation) {
    final MeanModel meanModel = cluster.getModel();
    // Shortcut: reuse the value computed during clustering.
    if (meanModel instanceof KMeansModel) {
        KMeansModel kmeansModel = (KMeansModel) meanModel;
        return kmeansModel.getVarianceContribution();
    }
    // Fallback: recompute from the members and the cluster mean.
    final DBIDs members = cluster.getIDs();
    final DoubleVector center = DoubleVector.wrap(meanModel.getMean());
    final boolean needsSquaring = !distanceFunction.isSquared();
    double total = 0.;
    DBIDIter member = members.iter();
    while (member.valid()) {
        final double d = distanceFunction.distance(relation.get(member), center);
        if (needsSquaring) {
            total += d * d;
        } else {
            total += d;
        }
        member.advance();
    }
    return total;
}
Also used : KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) MeanModel(de.lmu.ifi.dbs.elki.data.model.MeanModel) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 90 with DoubleVector

use of de.lmu.ifi.dbs.elki.data.DoubleVector in project elki by elki-project.

Class DBSCANTest, method testDBSCANOnSingleLinkDataset.

/**
 * Regression test: DBSCAN with fixed epsilon and minpts on the
 * single-link-effect data set, checked against the known F-measure and
 * cluster sizes.
 */
@Test
public void testDBSCANOnSingleLinkDataset() {
    // Parameters under test.
    final double epsilon = 11.5;
    final int minpts = 120;
    // Expected outcome.
    final double expectedFMeasure = 0.954382;
    final int[] expectedSizes = new int[] { 11, 200, 203, 224 };

    Database db = makeSimpleDatabase(UNITTEST + "single-link-effect.ascii", 638);
    Clustering<Model> result = new ELKIBuilder<DBSCAN<DoubleVector>>(DBSCAN.class)
        .with(DBSCAN.Parameterizer.EPSILON_ID, epsilon)
        .with(DBSCAN.Parameterizer.MINPTS_ID, minpts)
        .build()
        .run(db);

    testFMeasure(db, result, expectedFMeasure);
    testClusterSizes(result, expectedSizes);
}
Also used : ELKIBuilder(de.lmu.ifi.dbs.elki.utilities.ELKIBuilder) Database(de.lmu.ifi.dbs.elki.database.Database) Model(de.lmu.ifi.dbs.elki.data.model.Model) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) Test(org.junit.Test)

Aggregations

DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)147 Test (org.junit.Test)112 Database (de.lmu.ifi.dbs.elki.database.Database)85 ELKIBuilder (de.lmu.ifi.dbs.elki.utilities.ELKIBuilder)75 AbstractClusterAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractClusterAlgorithmTest)50 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)26 AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)24 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)22 AbstractOutlierAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.outlier.AbstractOutlierAlgorithmTest)16 ArrayList (java.util.ArrayList)14 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)12 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)11 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)10 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)9 ListParameterization (de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization)9 Random (java.util.Random)9 AbstractSimpleAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.AbstractSimpleAlgorithmTest)8 Model (de.lmu.ifi.dbs.elki.data.model.Model)8 LinearScanDistanceKNNQuery (de.lmu.ifi.dbs.elki.database.query.knn.LinearScanDistanceKNNQuery)8 MedoidModel (de.lmu.ifi.dbs.elki.data.model.MedoidModel)7