Example 36 with DoubleStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic in project elki by elki-project.

The class EvaluateSquaredErrors, method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return Sum of squared errors (SSQ)
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    boolean square = !distance.isSquared();
    int ignorednoise = 0;
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    double ssq = 0, sum = 0;
    for (Cluster<?> cluster : clusters) {
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    ignorednoise += cluster.size();
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    continue;
                case MERGE_NOISE:
                    // Treat as cluster below:
                    break;
            }
        }
        NumberVector center = ModelUtil.getPrototypeOrCentroid(cluster.getModel(), rel, cluster.getIDs());
        for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
            final double d = distance.distance(center, rel.get(it1));
            sum += d;
            ssq += square ? d * d : d;
        }
    }
    final int div = Math.max(1, rel.size() - ignorednoise);
    if (LOG.isStatistics()) {
        LOG.statistics(new DoubleStatistic(key + ".mean", sum / div));
        LOG.statistics(new DoubleStatistic(key + ".ssq", ssq));
        LOG.statistics(new DoubleStatistic(key + ".rmsd", FastMath.sqrt(ssq / div)));
    }
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Mean distance", sum / div, 0., Double.POSITIVE_INFINITY, true);
    g.addMeasure("Sum of Squares", ssq, 0., Double.POSITIVE_INFINITY, true);
    g.addMeasure("RMSD", FastMath.sqrt(ssq / div), 0., Double.POSITIVE_INFINITY, true);
    db.getHierarchy().add(c, ev);
    return ssq;
}
Also used : DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)
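
The pattern above guards each statistic behind LOG.isStatistics(), so the DoubleStatistic objects are only constructed when statistics logging is enabled. Below is a minimal, self-contained sketch of that idiom; the class name StatisticsDemo and the "demo." key prefix are illustrative, not part of ELKI.

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;

public class StatisticsDemo {
    // ELKI convention: one static Logging instance per class.
    private static final Logging LOG = Logging.getLogger(StatisticsDemo.class);

    public static void main(String[] args) {
        double ssq = 42.0;
        int div = 10;
        // The isStatistics() guard avoids allocating statistic objects
        // when statistics logging is disabled.
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic("demo.ssq", ssq));
            LOG.statistics(new DoubleStatistic("demo.rmsd", Math.sqrt(ssq / div)));
        }
    }
}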

Example 37 with DoubleStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic in project elki by elki-project.

The class FPGrowth, method run.

/**
 * Run the FP-Growth algorithm
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with a resizable array, so that dim is not needed.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());
    LOG.verbose("Finding item frequencies for ordering.");
    final int[] counts = countItemSupport(relation, dim);
    // Forward and backward indexes
    int[] iidx = new int[dim];
    final int[] idx = buildIndex(counts, iidx, minsupp);
    final int items = idx.length;
    LOG.statistics(new LongStatistic(STAT + "raw-items", dim));
    LOG.statistics(new LongStatistic(STAT + "raw-transactions", relation.size()));
    LOG.statistics(new DoubleStatistic(STAT + "minsupp-relative", minsupp / (double) relation.size()));
    LOG.statistics(new LongStatistic(STAT + "minsupp-absolute", minsupp));
    LOG.verbose("Building FP-Tree.");
    Duration ctime = LOG.newDuration(STAT + "fp-tree.construction.time").begin();
    FPTree tree = buildFPTree(relation, iidx, items);
    if (LOG.isStatistics()) {
        tree.logStatistics();
    }
    if (LOG.isDebuggingFinest()) {
        StringBuilder buf = new StringBuilder(10000).append("FP-tree:\n");
        tree.appendTo(buf, new FPNode.Translator() {

            @Override
            public StringBuilder appendTo(StringBuilder buf, int i) {
                String l = meta.getLabel(idx[i]);
                return (l != null) ? buf.append(l) : buf.append(i);
            }
        });
        LOG.debugFinest(buf.toString());
    }
    // Reduce memory usage:
    tree.reduceMemory();
    LOG.statistics(ctime.end());
    LOG.verbose("Extracting frequent patterns.");
    Duration etime = LOG.newDuration(STAT + "fp-growth.extraction.time").begin();
    final IndefiniteProgress itemp = LOG.isVerbose() ? new IndefiniteProgress("Frequent itemsets", LOG) : null;
    final List<Itemset> solution = new ArrayList<>();
    // Start extraction with the least frequent items
    tree.extract(minsupp, minlength, maxlength, true, new FPTree.Collector() {

        @Override
        public void collect(int support, int[] data, int start, int plen) {
            // Always translate the indexes back to the original values via 'idx'!
            if (plen - start == 1) {
                solution.add(new OneItemset(idx[data[start]], support));
                LOG.incrementProcessed(itemp);
                return;
            }
            // Copy from the shared buffer into permanent storage
            int[] indices = new int[plen - start];
            for (int i = start, j = 0; i < plen; i++) {
                // Translate to original items
                indices[j++] = idx[data[i]];
            }
            Arrays.sort(indices);
            solution.add(new SparseItemset(indices, support));
            LOG.incrementProcessed(itemp);
        }
    });
    LOG.setCompleted(itemp);
    Collections.sort(solution);
    LOG.statistics(etime.end());
    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("FP-Growth", "fp-growth", solution, meta, relation.size());
}
Also used : BitVector(de.lmu.ifi.dbs.elki.data.BitVector) ArrayList(java.util.ArrayList) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
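
Note the Duration statistics used above to time FP-tree construction and pattern extraction: LOG.newDuration(key).begin() starts the timer, and the object returned by end() is passed directly to LOG.statistics(). A minimal sketch of this timing idiom follows; TimingDemo and the "demo.work.time" key are made-up names.

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.Duration;

public class TimingDemo {
    private static final Logging LOG = Logging.getLogger(TimingDemo.class);

    public static void main(String[] args) {
        // begin() returns the Duration itself, so the call can be chained.
        Duration time = LOG.newDuration("demo.work.time").begin();
        expensiveWork();
        // end() stops the timer; the finished Duration is logged as a statistic.
        LOG.statistics(time.end());
    }

    private static void expensiveWork() {
        // Stand-in for the work being timed.
        long s = 0;
        for (long i = 0; i < 1_000_000L; i++) {
            s += i;
        }
    }
}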

Example 38 with DoubleStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic in project elki by elki-project.

The class NNDescent, method preprocess.

@Override
protected void preprocess() {
    final DBIDs ids = relation.getDBIDs();
    final long starttime = System.currentTimeMillis();
    IndefiniteProgress progress = LOG.isVerbose() ? new IndefiniteProgress("KNNGraph iteration", LOG) : null;
    // the query point itself is added at the end, so internally (k-1) is used
    final int internal_k = k - 1;
    // kNN store
    store = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNHeap.class);
    // store for new reverse neighbors
    WritableDataStore<HashSetModifiableDBIDs> newReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // store for old reverse neighbors
    WritableDataStore<HashSetModifiableDBIDs> oldReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // Sample of new forward neighbors.
    WritableDataStore<HashSetModifiableDBIDs> sampleNewNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // store for flags marking which neighbors are new
    WritableDataStore<HashSetModifiableDBIDs> flag = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // Initialize data structures:
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
        store.put(iditer, DBIDUtil.newHeap(internal_k));
        newReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
        oldReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
    }
    // sample size: rho * (k-1), rounded up
    final int items = (int) Math.ceil(rho * internal_k);
    long counter_all = 0;
    // initialize neighbors (depends on -setInitialNeighbors option)
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
        // initialize sampled NN
        ModifiableDBIDs sampleNew = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
        sampleNewNeighbors.put(iditer, DBIDUtil.newHashSet(sampleNew));
        // initialize RNN
        ModifiableDBIDs sampleRev = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
        newReverseNeighbors.put(iditer, DBIDUtil.newHashSet(sampleRev));
        // initialize new neighbors
        flag.put(iditer, DBIDUtil.newHashSet());
        // initialize store
        if (!noInitialNeighbors) {
            HashSetModifiableDBIDs flags = flag.get(iditer);
            for (DBIDIter siter = sampleNew.iter(); siter.valid(); siter.advance()) {
                if (add(iditer, siter, distanceQuery.distance(iditer, siter))) {
                    flags.add(siter);
                }
            }
            counter_all += sampleNew.size();
        }
    }
    final int size = relation.size();
    double rate = 0.0;
    int iter = 0;
    for (; iter < iterations; iter++) {
        long counter = 0;
        // iterate through dataset
        for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            // determine new and old neighbors
            HashSetModifiableDBIDs newNeighbors = flag.get(iditer);
            HashSetModifiableDBIDs oldNeighbors = DBIDUtil.newHashSet();
            KNNHeap heap = store.get(iditer);
            for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
                if (!newNeighbors.contains(heapiter)) {
                    oldNeighbors.add(heapiter);
                }
            }
            // Sampling
            HashSetModifiableDBIDs sampleNew = sampleNewNeighbors.get(iditer);
            HashSetModifiableDBIDs newRev = newReverseNeighbors.get(iditer);
            newRev.removeDBIDs(sampleNew);
            boundSize(newRev, items);
            HashSetModifiableDBIDs oldRev = oldReverseNeighbors.get(iditer);
            oldRev.removeDBIDs(oldNeighbors);
            boundSize(oldRev, items);
            counter += processNewNeighbors(flag, sampleNew, oldNeighbors, newRev, oldRev);
        }
        counter_all += counter;
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(prefix + ".scan-rate", counter_all * .5 / (size * (size - 1L))));
        }
        // t is the number of new neighbors
        int t = sampleNew(ids, sampleNewNeighbors, flag, items);
        // calculate old and new reverse neighbors
        clearAll(ids, newReverseNeighbors);
        clearAll(ids, oldReverseNeighbors);
        reverse(sampleNewNeighbors, newReverseNeighbors, oldReverseNeighbors);
        rate = (double) t / (double) (internal_k * size);
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(prefix + ".update-rate", rate));
        }
        if (counter < delta * internal_k * size) {
            LOG.verbose("KNNGraph terminated because we performaned delta*k*size distance computations.");
            break;
        }
        if (rate < delta) {
            LOG.verbose("KNNGraph terminated because update rate got smaller than delta.");
            break;
        }
        LOG.incrementProcessed(progress);
    }
    if (LOG.isVerbose() && iter == iterations) {
        LOG.verbose("KNNGraph terminated because the maximum number of iterations was reached.");
    }
    LOG.setCompleted(progress);
    // convert store to storage
    storage = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_DB, KNNList.class);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        KNNHeap tempHeap = DBIDUtil.newHeap(k);
        // Add query point and convert heap to list:
        KNNHeap heap = store.get(iditer);
        tempHeap.insert(0, iditer);
        for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
            tempHeap.insert(heapiter.doubleValue(), heapiter);
        }
        storage.put(iditer, tempHeap.toKNNList());
    }
    final long end = System.currentTimeMillis();
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(prefix + ".construction-time.ms", end - starttime));
    }
}
Also used : DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
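
The NNDescent example above combines DoubleStatistic for ratios (scan rate, update rate) with LongStatistic for counters and wall-clock times, all under a shared key prefix. A condensed sketch of that convention follows; RateDemo and its keys are illustrative only.

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;

public class RateDemo {
    private static final Logging LOG = Logging.getLogger(RateDemo.class);

    public static void main(String[] args) {
        final String prefix = RateDemo.class.getName();
        final long start = System.currentTimeMillis();
        long computed = 1234L, candidates = 100_000L;
        // ... the actual distance computations would happen here ...
        if (LOG.isStatistics()) {
            // Ratios are reported as DoubleStatistic ...
            LOG.statistics(new DoubleStatistic(prefix + ".scan-rate", computed / (double) candidates));
            // ... counters and millisecond timings as LongStatistic.
            LOG.statistics(new LongStatistic(prefix + ".distance-computations", computed));
            LOG.statistics(new LongStatistic(prefix + ".construction-time.ms", System.currentTimeMillis() - start));
        }
    }
}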

Aggregations

DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 38 usages
LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic): 27 usages
StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic): 17 usages
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 14 usages
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 14 usages
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 13 usages
IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress): 13 usages
ArrayList (java.util.ArrayList): 13 usages
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 12 usages
KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel): 10 usages
WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore): 10 usages
MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup): 10 usages
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 9 usages
MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance): 8 usages
EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult): 8 usages
NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 7 usages
Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration): 5 usages
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 3 usages
ModifiableDoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList): 3 usages
AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException): 3 usages