Search in sources :

Example 26 with LongStatistic

use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

the class KMeansMinusMinus method run.

/**
 * Run the clustering on the given relation.
 *
 * Initial means are chosen by the configured initializer. Each iteration then
 * assigns the points via {@code assignToNearestCluster} and recomputes the
 * means via {@code meansWithTreshhold}, passing the current top of the heap
 * as threshold. The loop ends when {@code assignToNearestCluster} reports no
 * change, when the total variance sum increased, or after {@code maxiter}
 * iterations ({@code maxiter <= 0} disables the limit).
 *
 * If {@code noiseFlag} is set and the heap size is positive, points whose
 * stored distance reaches the heap's top value are moved to an additional
 * noise cluster.
 *
 * @param database Database
 * @param relation Relation to cluster
 * @return Clustering result; an empty clustering if the relation is empty
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    // Initialize the means
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Initialize the heap: a rate below 1 is a fraction of the data set size,
    // any other rate is used as an absolute count.
    final int heapsize = (int) (rate < 1. ? Math.ceil(relation.size() * rate) : rate);
    DoubleMinHeap minHeap = new DoubleMinHeap(heapsize);
    // Setup cluster assignment store
    List<ModifiableDoubleDBIDList> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
    // Otherwise, the vartotal break below will fail!
    assert (varstat != null);
    int iteration = 0;
    double prevvartotal = Double.POSITIVE_INFINITY;
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        // Start every iteration with an empty heap and empty cluster lists.
        minHeap.clear();
        for (int i = 0; i < k; i++) {
            clusters.get(i).clear();
        }
        LOG.incrementProcessed(prog);
        boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, minHeap, heapsize);
        double vartotal = logVarstat(varstat, varsum);
        // Stop if no assignment changed, or if the new variance sum is higher
        // than the previous value.
        if (!changed || vartotal > prevvartotal) {
            break;
        }
        prevvartotal = vartotal;
        // Recompute means; without a heap there is no threshold.
        means = meansWithTreshhold(clusters, means, relation, heapsize > 0 ? minHeap.peek() : Double.POSITIVE_INFINITY);
    }
    // Create the noise cluster if wanted. It stays null when the heap size is
    // zero, because then there is no threshold to compare against.
    ModifiableDoubleDBIDList noiseids = null;
    if (noiseFlag && heapsize > 0) {
        clusters.add(noiseids = DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
        double tresh = minHeap.peek();
        for (int i = 0; i < k; i++) {
            for (DoubleDBIDListMIter it = clusters.get(i).iter(); it.valid(); it.advance()) {
                final double dist = it.doubleValue();
                // Add to the noise cluster:
                if (dist >= tresh) {
                    noiseids.add(dist, it);
                    // Index k marks the noise cluster in the assignment store.
                    assignment.putInt(it, k);
                    it.remove();
                }
            }
        }
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result; empty clusters are omitted.
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < k; i++) {
        DBIDs ids = clusters.get(i);
        if (ids.size() == 0) {
            continue;
        }
        KMeansModel model = new KMeansModel(means[i], varsum[i]);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    // Noise cluster: guard on noiseids rather than on noiseFlag alone, because
    // noiseids was never created when noiseFlag is set but heapsize is 0
    // (dereferencing it would throw a NullPointerException).
    if (noiseids != null) {
        KMeansModel model = new KMeansModel(null, 0);
        DBIDs ids = noiseids;
        if (ids.size() == 0) {
            return result;
        }
        result.addToplevelCluster(new Cluster<>(ids, true, model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) DoubleDBIDListMIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 27 with LongStatistic

use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

the class LogClusterSizes method logClusterSizes.

/**
 * Write statistics about a clustering to the log: its long name, the number
 * of clusters, and for every cluster its name (when set), its size, a noise
 * marker, and the names of its children in the cluster hierarchy.
 *
 * Nothing is logged unless statistics logging is enabled.
 *
 * @param c Clustering to analyze
 */
public static <C extends Model> void logClusterSizes(Clustering<C> c) {
    if (LOG.isStatistics()) {
        final List<Cluster<C>> all = c.getAllClusters();
        final int total = all.size();
        LOG.statistics(new StringStatistic(PREFIX + "name", c.getLongName()));
        LOG.statistics(new LongStatistic(PREFIX + "clusters", total));
        final Hierarchy<Cluster<C>> hierarchy = c.getClusterHierarchy();
        // Running index used to build a distinct key prefix per cluster.
        int index = 0;
        for (final Cluster<C> cluster : all) {
            final String keyBase = PREFIX + "cluster-" + index + ".";
            index++;
            if (cluster.getName() != null) {
                LOG.statistics(new StringStatistic(keyBase + "name", cluster.getName()));
            }
            LOG.statistics(new LongStatistic(keyBase + "size", cluster.size()));
            if (cluster.isNoise()) {
                LOG.statistics(new StringStatistic(keyBase + "noise", "true"));
            }
            if (hierarchy.numChildren(cluster) <= 0) {
                // TODO: also log parents?
                continue;
            }
            // TODO: this only works if we have cluster names!
            final StringBuilder names = new StringBuilder();
            It<Cluster<C>> child = hierarchy.iterChildren(cluster);
            while (child.valid()) {
                // Separator only once something has been written already.
                if (names.length() > 0) {
                    names.append(", ");
                }
                names.append(child.get().getName());
                child.advance();
            }
            LOG.statistics(new StringStatistic(keyBase + "children", names.toString()));
        }
    }
}
Also used : StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) Cluster(de.lmu.ifi.dbs.elki.data.Cluster)

Example 28 with LongStatistic

use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

the class EvaluateConcordantPairs method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * Counts, for every between-cluster distance, how many within-cluster
 * distances are smaller (concordant) and how many are larger (discordant),
 * and derives the Gamma and Tau measures from these counts. Both measures are
 * logged (if statistics logging is enabled) and attached to the evaluation
 * result of the clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return Gamma index
 * @throws AbortException if the number of within-cluster pairs does not fit
 *         into an {@code int}
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    int ignorednoise = 0, withinPairs = 0;
    for (Cluster<?> cluster : clusters) {
        if ((cluster.size() <= 1 || cluster.isNoise())) {
            switch(noiseHandling) {
                case IGNORE_NOISE:
                    ignorednoise += cluster.size();
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // No concordant distances.
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster below.
                    break;
            }
        }
        // Count the pairs in long arithmetic: size * (size - 1) wraps around in
        // int for larger clusters, and the wrapped value may even be positive,
        // so a sign check on the int sum is not a reliable overflow test.
        final long size = cluster.size();
        final long total = withinPairs + ((size * (size - 1)) >>> 1);
        if (total > Integer.MAX_VALUE) {
            throw new AbortException("Integer overflow - clusters too large to compute pairwise distances.");
        }
        withinPairs = (int) total;
    }
    // Materialize within-cluster distances (sorted):
    double[] withinDistances = computeWithinDistances(rel, clusters, withinPairs);
    int[] withinTies = new int[withinDistances.length];
    // Count ties within
    countTies(withinDistances, withinTies);
    long concordantPairs = 0, discordantPairs = 0, betweenPairs = 0;
    // Step two, compute discordant distances:
    for (int i = 0; i < clusters.size(); i++) {
        Cluster<?> ocluster1 = clusters.get(i);
        if ((ocluster1.size() <= 1 || ocluster1.isNoise()) && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
            continue;
        }
        for (int j = i + 1; j < clusters.size(); j++) {
            Cluster<?> ocluster2 = clusters.get(j);
            if ((ocluster2.size() <= 1 || ocluster2.isNoise()) && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
                continue;
            }
            // Widen before multiplying, otherwise the product is computed in int
            // and may overflow before it is added to the long counter.
            betweenPairs += (long) ocluster1.size() * ocluster2.size();
            for (DBIDIter oit1 = ocluster1.getIDs().iter(); oit1.valid(); oit1.advance()) {
                NumberVector obj = rel.get(oit1);
                for (DBIDIter oit2 = ocluster2.getIDs().iter(); oit2.valid(); oit2.advance()) {
                    double dist = distanceFunction.distance(obj, rel.get(oit2));
                    // Position of this between-cluster distance among the
                    // within-cluster distances (binarySearch needs them sorted).
                    int p = Arrays.binarySearch(withinDistances, dist);
                    if (p >= 0) {
                        // Tied distances: move to the first entry of the tied run.
                        while (p > 0 && withinDistances[p - 1] >= dist) {
                            --p;
                        }
                        concordantPairs += p;
                        // Entries tied with dist count as neither concordant nor
                        // discordant.
                        discordantPairs += withinDistances.length - p - withinTies[p];
                        continue;
                    }
                    // Not found: binarySearch returned -(insertion point) - 1.
                    p = -p - 1;
                    concordantPairs += p;
                    discordantPairs += withinDistances.length - p;
                }
            }
        }
    }
    // Total number of pairs possible:
    final long t = ((rel.size() - ignorednoise) * (long) (rel.size() - ignorednoise - 1)) >>> 1;
    final long tt = (t * (t - 1)) >>> 1;
    double gamma = (concordantPairs - discordantPairs) / (double) (concordantPairs + discordantPairs);
    double tau = computeTau(concordantPairs, discordantPairs, tt, withinDistances.length, betweenPairs);
    // Avoid NaN when everything is in a single cluster:
    // NOTE(review): these comparisons also map negative values to 0, although
    // both measures are registered below with a range of [-1, 1] - confirm
    // that this clamping is intended.
    gamma = gamma > 0. ? gamma : 0.;
    tau = tau > 0. ? tau : 0.;
    if (LOG.isStatistics()) {
        // NOTE(review): the ".pbm." key segment looks copied from another
        // evaluation measure - confirm before renaming, log consumers may
        // depend on it.
        LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".gamma", gamma));
        LOG.statistics(new DoubleStatistic(key + ".tau", tau));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Concordance-based Evaluation");
    g.addMeasure("Gamma", gamma, -1., 1., 0., false);
    g.addMeasure("Tau", tau, -1., +1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return gamma;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 29 with LongStatistic

use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

the class EvaluateDaviesBouldin method evaluateClustering.

/**
 * Evaluate a single clustering with the Davies-Bouldin index.
 *
 * For every cluster, the largest within-to-between spread ratio against any
 * other cluster is determined; the index is the mean of these maxima. The
 * value is logged (if statistics logging is enabled) and attached to the
 * evaluation result of the clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return DB-index
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    final List<? extends Cluster<?>> clusters = c.getAllClusters();
    final int numClusters = clusters.size();
    final NumberVector[] centroids = new NumberVector[numClusters];
    final int noisecount = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseOption);
    final double[] spread = withinGroupDistances(rel, clusters, centroids);
    final Mean indexMean = new Mean();
    for (int i = 0; i < numClusters; i++) {
        final NumberVector own = centroids[i];
        final double ownSpread = spread[i];
        // Largest within-to-between spread ratio seen for cluster i.
        double worst = 0;
        for (int j = 0; j < numClusters; j++) {
            final NumberVector other = centroids[j];
            // Same centroid reference (this includes two missing centroids):
            // nothing to compare.
            if (other == own) {
                continue;
            }
            if (own != null && other != null) {
                // Both have a centroid: ratio of summed spreads to centroid distance.
                final double between = distanceFunction.distance(own, other);
                final double ratio = (ownSpread + spread[j]) / between;
                if (ratio > worst) {
                    worst = ratio;
                }
                continue;
            }
            if (noiseOption == NoiseHandling.IGNORE_NOISE) {
                continue;
            }
            if (own != null) {
                // The other cluster has no centroid: use its member closest to
                // our centroid as the between-distance.
                double nearest = Double.POSITIVE_INFINITY;
                for (DBIDIter it = clusters.get(j).getIDs().iter(); it.valid(); it.advance()) {
                    final double dist = distanceFunction.distance(own, rel.get(it));
                    if (dist < nearest) {
                        nearest = dist;
                    }
                }
                final double ratio = ownSpread / nearest;
                if (ratio > worst) {
                    worst = ratio;
                }
            } else if (other != null) {
                // This cluster has no centroid: use our member closest to the
                // other centroid as the between-distance.
                double nearest = Double.POSITIVE_INFINITY;
                for (DBIDIter it = clusters.get(i).getIDs().iter(); it.valid(); it.advance()) {
                    final double dist = distanceFunction.distance(rel.get(it), other);
                    if (dist < nearest) {
                        nearest = dist;
                    }
                }
                final double ratio = spread[j] / nearest;
                if (ratio > worst) {
                    worst = ratio;
                }
            }
            // else: (0+0) / d = 0.
        }
        indexMean.put(worst);
    }
    // For a single cluster, we return 2 (result for equidistant points)
    final double daviesBouldinMean;
    if (indexMean.getCount() > 1) {
        daviesBouldinMean = indexMean.getMean();
    } else {
        daviesBouldinMean = 2.;
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".db-index.noise-handling", noiseOption.toString()));
        if (noisecount > 0) {
            LOG.statistics(new LongStatistic(key + ".db-index.ignored", noisecount));
        }
        LOG.statistics(new DoubleStatistic(key + ".db-index", daviesBouldinMean));
    }
    final EvaluationResult evaluation = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    final MeasurementGroup group = evaluation.findOrCreateGroup("Distance-based Evaluation");
    group.addMeasure("Davies Bouldin Index", daviesBouldinMean, 0., Double.POSITIVE_INFINITY, 0., true);
    db.getHierarchy().resultChanged(evaluation);
    return daviesBouldinMean;
}
Also used : Mean(de.lmu.ifi.dbs.elki.math.Mean) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 30 with LongStatistic

use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

the class EvaluateCIndex method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * Sums the within-cluster distances and compares this sum to the sums of the
 * values collected in the two heaps, as
 * {@code (theta - min) / (max - min)}. The value is logged (if statistics
 * logging is enabled) and attached to the evaluation result of the
 * clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param dq Distance query
 * @param c Clustering
 * @return C-Index
 * @throws ArithmeticException if the number of within-cluster pairs does not
 *         fit into an {@code int}
 */
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    // Count ignored noise, and within-cluster distances
    int ignorednoise = 0, w = 0;
    for (Cluster<?> cluster : clusters) {
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    ignorednoise += cluster.size();
                    // Ignore
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // No within-cluster distances!
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster
                    break;
                default:
                    LOG.warning("Unknown noise handling option: " + noiseOption);
            }
        }
        // Count the pairs in long arithmetic: size * (size - 1) wraps around in
        // int for larger clusters, and w is used as heap capacity below, so a
        // wrapped value must not pass silently.
        final long size = cluster.size();
        final long total = w + ((size * (size - 1)) >>> 1);
        if (total > Integer.MAX_VALUE) {
            throw new ArithmeticException("Integer overflow - clusters too large to compute pairwise distances.");
        }
        w = (int) total;
    }
    // TODO: for small k=2, and balanced clusters, it may be more efficient to
    // just build a long array with all distances, and select the quantiles.
    // The heaps used below pay off in memory consumption for k > 2
    // Yes, maxDists is supposed to be a min heap, and the other way.
    // Because we want to replace the smallest of the current k-largest
    // distances.
    DoubleHeap maxDists = new DoubleMinHeap(w);
    DoubleHeap minDists = new DoubleMaxHeap(w);
    // Sum of within-cluster distances
    double theta = 0.;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null;
    for (int i = 0; i < clusters.size(); i++) {
        Cluster<?> cluster = clusters.get(i);
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    LOG.incrementProcessed(prog);
                    // Ignore
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // Singletons contribute nothing to theta; presumably they
                    // only feed the heaps - verify against processSingleton.
                    processSingleton(cluster, rel, dq, maxDists, minDists, w);
                    LOG.incrementProcessed(prog);
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster, below
                    break;
            }
        }
        theta += processCluster(cluster, clusters, i, dq, maxDists, minDists, w);
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Simulate best and worst cases:
    // Sum of largest and smallest
    double min = 0, max = 0;
    assert (minDists.size() == w);
    assert (maxDists.size() == w);
    for (DoubleHeap.UnsortedIter it = minDists.unsortedIter(); it.valid(); it.advance()) {
        min += it.get();
    }
    for (DoubleHeap.UnsortedIter it = maxDists.unsortedIter(); it.valid(); it.advance()) {
        max += it.get();
    }
    assert (max >= min);
    // Without any spread between the two sums the ratio is undefined; 1 is
    // reported in that case.
    double cIndex = (max > min) ? (theta - min) / (max - min) : 1.;
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("C-Index", cIndex, 0., 1., 0., true);
    db.getHierarchy().resultChanged(ev);
    return cIndex;
}
Also used : DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DoubleHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleHeap) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DoubleMaxHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Aggregations

LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) 44; DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) 27; ArrayList (java.util.ArrayList) 20; StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) 19; DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter) 17; Clustering (de.lmu.ifi.dbs.elki.data.Clustering) 14; DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs) 14; IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) 14; ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) 12; WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) 11; KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel) 10; NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector) 9; FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) 8; EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult) 7; MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) 7; ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) 5; MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance) 5; WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) 4; Logging (de.lmu.ifi.dbs.elki.logging.Logging) 4; Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration) 4