Example 36 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class APRIORI, method aprioriGenerate.

/**
 * Prunes a given set of candidates, keeping only those itemsets for which
 * every subset obtained by removing a single item is already frequent.
 *
 * @param supported List of frequent itemsets of the previous length
 * @param length Itemset length
 * @param dim Dimensionality
 * @return itemsets that cannot be pruned by apriori
 */
protected List<Itemset> aprioriGenerate(List<? extends Itemset> supported, int length, int dim) {
    if (supported.size() < length) {
        return Collections.emptyList();
    }
    long joined = 0L;
    final int ssize = supported.size();
    List<Itemset> candidateList = new ArrayList<>();
    Itemset ref = supported.get(0);
    if (ref instanceof SparseItemset) {
        // TODO: we currently never switch to DenseItemset. This may however be
        // beneficial when we have few dimensions and many candidates.
        // E.g. when length > 32 and dim < 100. But this needs benchmarking!
        // For length < 5 and dim > 3000, SparseItemset unsurprisingly was faster
        // Scratch item to use for searching.
        SparseItemset scratch = new SparseItemset(new int[length - 1]);
        for (int i = 0; i < ssize; i++) {
            SparseItemset ii = (SparseItemset) supported.get(i);
            prefix: for (int j = i + 1; j < ssize; j++) {
                SparseItemset ij = (SparseItemset) supported.get(j);
                if (!ii.prefixTest(ij)) {
                    // Prefix doesn't match
                    break prefix;
                }
                joined++;
                // Test subsets (re-) using scratch object
                System.arraycopy(ii.indices, 0, scratch.indices, 0, length - 2);
                scratch.indices[length - 2] = ij.indices[length - 2];
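                // scratch now equals ij (the candidate minus ii's last item).
                // Each pass below replaces scratch[k] with ii[k+1], turning it
                // into the candidate minus ii[k]; the subsets equal to ii and
                // ij themselves are skipped, as they are frequent already.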
                for (int k = length - 3; k >= 0; k--) {
                    scratch.indices[k] = ii.indices[k + 1];
                    int pos = Collections.binarySearch(supported, scratch);
                    if (pos < 0) {
                        // Prefix was okay, but one other subset was not frequent
                        continue prefix;
                    }
                }
                int[] items = new int[length];
                System.arraycopy(ii.indices, 0, items, 0, length - 1);
                items[length - 1] = ij.indices[length - 2];
                candidateList.add(new SparseItemset(items));
            }
        }
    } else if (ref instanceof DenseItemset) {
        // Scratch item to use for searching.
        DenseItemset scratch = new DenseItemset(BitsUtil.zero(dim), length - 1);
        for (int i = 0; i < ssize; i++) {
            DenseItemset ii = (DenseItemset) supported.get(i);
            prefix: for (int j = i + 1; j < ssize; j++) {
                DenseItemset ij = (DenseItemset) supported.get(j);
                // Prefix test via "|i1 ^ i2| = 2"
                System.arraycopy(ii.items, 0, scratch.items, 0, ii.items.length);
                BitsUtil.xorI(scratch.items, ij.items);
                if (BitsUtil.cardinality(scratch.items) != 2) {
                    // No prefix match; since sorted, no more can follow!
                    break prefix;
                }
                ++joined;
                // Ensure that the first difference is the last item in ii:
                int first = BitsUtil.nextSetBit(scratch.items, 0);
                if (BitsUtil.nextSetBit(ii.items, first + 1) > -1) {
                    // Different overlap by chance?
                    break prefix;
                }
                BitsUtil.orI(scratch.items, ij.items);
                // Test subsets.
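                // Clear one set bit at a time (ascending) and search; only
                // length-2 bits are tried, so the two highest bits, whose
                // removal yields ii and ij, are never tested: those two
                // subsets are frequent by construction.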
                for (int l = length, b = BitsUtil.nextSetBit(scratch.items, 0); l > 2; l--, b = BitsUtil.nextSetBit(scratch.items, b + 1)) {
                    BitsUtil.clearI(scratch.items, b);
                    int pos = Collections.binarySearch(supported, scratch);
                    if (pos < 0) {
                        continue prefix;
                    }
                    BitsUtil.setI(scratch.items, b);
                }
                candidateList.add(new DenseItemset(scratch.items.clone(), length));
            }
        }
    } else {
        throw new InconsistentDataException("Unexpected itemset type " + ref.getClass());
    }
    if (LOG.isStatistics()) {
        // Naive pairwise approach
        LOG.statistics(new LongStatistic(STAT + length + "-items.pairwise", (ssize * ((long) ssize - 1))));
        LOG.statistics(new LongStatistic(STAT + length + "-items.joined", joined));
        LOG.statistics(new LongStatistic(STAT + length + "-items.candidates", candidateList.size()));
    }
    // Candidates were generated in sorted order, so we do not need to sort here.
    return candidateList;
}
Also used: LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic), ArrayList(java.util.ArrayList), InconsistentDataException(de.lmu.ifi.dbs.elki.utilities.exceptions.InconsistentDataException)
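
The delicate part of aprioriGenerate is reusing one scratch itemset to enumerate the (length-1)-subsets of each joined candidate. A minimal standalone sketch of the same pruning rule on sorted int arrays (illustrative names, not ELKI API; unlike the ELKI version, it tests all subsets instead of skipping the two known frequent ones):

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class AprioriPruneSketch {

    /**
     * Keep a candidate only if every subset obtained by dropping one item is
     * contained in the lexicographically sorted list of frequent itemsets.
     */
    static boolean allSubsetsFrequent(int[] candidate, List<int[]> frequent) {
        int[] scratch = new int[candidate.length - 1];
        for (int drop = 0; drop < candidate.length; drop++) {
            // Copy every item except the one at position 'drop'.
            for (int i = 0, j = 0; i < candidate.length; i++) {
                if (i != drop) {
                    scratch[j++] = candidate[i];
                }
            }
            if (Collections.binarySearch(frequent, scratch, AprioriPruneSketch::compareLex) < 0) {
                return false; // One subset is not frequent: prune.
            }
        }
        return true;
    }

    /** Lexicographic comparison of sorted itemsets. */
    static int compareLex(int[] a, int[] b) {
        for (int i = 0; i < a.length && i < b.length; i++) {
            if (a[i] != b[i]) {
                return Integer.compare(a[i], b[i]);
            }
        }
        return Integer.compare(a.length, b.length);
    }

    public static void main(String[] args) {
        List<int[]> frequent2 = Arrays.asList(
            new int[] { 1, 2 }, new int[] { 1, 3 }, new int[] { 2, 3 }, new int[] { 2, 4 });
        System.out.println(allSubsetsFrequent(new int[] { 1, 2, 3 }, frequent2)); // true
        System.out.println(allSubsetsFrequent(new int[] { 1, 2, 4 }, frequent2)); // false: {1, 4} is missing
    }
}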

Example 37 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class APRIORI, method run.

/**
 * Performs the APRIORI algorithm on the given database.
 *
 * @param relation the Relation to process
 * @return the FrequentItemsetsResult learned by this APRIORI
 */
public FrequentItemsetsResult run(Relation<BitVector> relation) {
    DBIDs ids = relation.getDBIDs();
    List<Itemset> solution = new ArrayList<>();
    final int size = ids.size();
    final int needed = getMinimumSupport(size);
    // TODO: we don't strictly require a vector field.
    // We could work with knowing just the maximum dimensionality beforehand.
    VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    if (size > 0) {
        final int dim = meta.getDimensionality();
        Duration timeone = LOG.newDuration(STAT + "1-items.time").begin();
        List<OneItemset> oneitems = buildFrequentOneItemsets(relation, dim, needed);
        LOG.statistics(timeone.end());
        if (LOG.isStatistics()) {
            LOG.statistics(new LongStatistic(STAT + "1-items.frequent", oneitems.size()));
            LOG.statistics(new LongStatistic(STAT + "1-items.transactions", ids.size()));
        }
        if (LOG.isDebuggingFine()) {
            LOG.debugFine(debugDumpCandidates(new StringBuilder(), oneitems, meta));
        }
        if (minlength <= 1) {
            solution.addAll(oneitems);
        }
        if (oneitems.size() >= 2 && maxlength >= 2) {
            Duration timetwo = LOG.newDuration(STAT + "2-items.time").begin();
            ArrayModifiableDBIDs survivors = DBIDUtil.newArray(ids.size());
            List<? extends Itemset> candidates = buildFrequentTwoItemsets(oneitems, relation, dim, needed, ids, survivors);
            // Continue with reduced set of transactions.
            ids = survivors;
            LOG.statistics(timetwo.end());
            if (LOG.isStatistics()) {
                LOG.statistics(new LongStatistic(STAT + "2-items.frequent", candidates.size()));
                LOG.statistics(new LongStatistic(STAT + "2-items.transactions", ids.size()));
            }
            if (LOG.isDebuggingFine()) {
                LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
            }
            if (minlength <= 2) {
                solution.addAll(candidates);
            }
            for (int length = 3; length <= maxlength && candidates.size() >= length; length++) {
                Duration timel = LOG.newDuration(STAT + length + "-items.time").begin();
                // Join to get the new candidates
                candidates = aprioriGenerate(candidates, length, dim);
                if (LOG.isDebuggingFinest()) {
                    LOG.debugFinest(debugDumpCandidates(new StringBuilder().append("Before pruning: "), candidates, meta));
                }
                survivors = DBIDUtil.newArray(ids.size());
                candidates = frequentItemsets(candidates, relation, needed, ids, survivors, length);
                // Continue with reduced set of transactions.
                ids = survivors;
                LOG.statistics(timel.end());
                if (LOG.isStatistics()) {
                    LOG.statistics(new LongStatistic(STAT + length + "-items.frequent", candidates.size()));
                    LOG.statistics(new LongStatistic(STAT + length + "-items.transactions", ids.size()));
                }
                if (LOG.isDebuggingFine()) {
                    LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
                }
                solution.addAll(candidates);
            }
        }
    }
    return new FrequentItemsetsResult("APRIORI", "apriori", solution, meta, size);
}
Also used: BitVector(de.lmu.ifi.dbs.elki.data.BitVector), ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs), DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs), ArrayList(java.util.ArrayList), Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration), FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult), LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
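
Before the level-wise loop starts, run() converts the configured minimum support into an absolute transaction count via getMinimumSupport(size). A minimal sketch of such a conversion, assuming (as the relative/absolute statistics logged in the FP-Growth example below suggest) that values below 1 are treated as relative frequencies; the exact ELKI rule may differ:

public class MinSupportSketch {

    /** Hypothetical conversion (assumption, not the ELKI source): values
     *  below 1 are relative frequencies, values >= 1 are already absolute
     *  transaction counts. */
    static int minimumSupport(double minsupp, int transactions) {
        return (int) (minsupp < 1.0 ? Math.ceil(minsupp * transactions) : minsupp);
    }

    public static void main(String[] args) {
        System.out.println(minimumSupport(0.05, 1000)); // 50
        System.out.println(minimumSupport(42.0, 1000)); // 42
    }
}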

Example 38 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class FPGrowth, method run.

/**
 * Run the FP-Growth algorithm
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable array, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());
    LOG.verbose("Finding item frequencies for ordering.");
    final int[] counts = countItemSupport(relation, dim);
    // Forward and backward indexes
    int[] iidx = new int[dim];
    final int[] idx = buildIndex(counts, iidx, minsupp);
    final int items = idx.length;
    LOG.statistics(new LongStatistic(STAT + "raw-items", dim));
    LOG.statistics(new LongStatistic(STAT + "raw-transactions", relation.size()));
    LOG.statistics(new DoubleStatistic(STAT + "minsupp-relative", minsupp / (double) relation.size()));
    LOG.statistics(new LongStatistic(STAT + "minsupp-absolute", minsupp));
    LOG.verbose("Building FP-Tree.");
    Duration ctime = LOG.newDuration(STAT + "fp-tree.construction.time").begin();
    FPTree tree = buildFPTree(relation, iidx, items);
    if (LOG.isStatistics()) {
        tree.logStatistics();
    }
    if (LOG.isDebuggingFinest()) {
        StringBuilder buf = new StringBuilder(10000).append("FP-tree:\n");
        tree.appendTo(buf, new FPNode.Translator() {

            @Override
            public StringBuilder appendTo(StringBuilder buf, int i) {
                String l = meta.getLabel(idx[i]);
                return (l != null) ? buf.append(l) : buf.append(i);
            }
        });
        LOG.debugFinest(buf.toString());
    }
    // Reduce memory usage:
    tree.reduceMemory();
    LOG.statistics(ctime.end());
    LOG.verbose("Extracting frequent patterns.");
    Duration etime = LOG.newDuration(STAT + "fp-growth.extraction.time").begin();
    final IndefiniteProgress itemp = LOG.isVerbose() ? new IndefiniteProgress("Frequent itemsets", LOG) : null;
    final List<Itemset> solution = new ArrayList<>();
    // Start extraction with the least frequent items
    tree.extract(minsupp, minlength, maxlength, true, new FPTree.Collector() {

        @Override
        public void collect(int support, int[] data, int start, int plen) {
            // Always translate the indexes back to the original values via 'idx'!
            if (plen - start == 1) {
                solution.add(new OneItemset(idx[data[start]], support));
                LOG.incrementProcessed(itemp);
                return;
            }
            // Copy from the buffer to permanent storage
            int[] indices = new int[plen - start];
            for (int i = start, j = 0; i < plen; i++) {
                // Translate to original items
                indices[j++] = idx[data[i]];
            }
            Arrays.sort(indices);
            solution.add(new SparseItemset(indices, support));
            LOG.incrementProcessed(itemp);
        }
    });
    LOG.setCompleted(itemp);
    Collections.sort(solution);
    LOG.statistics(etime.end());
    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("FP-Growth", "fp-growth", solution, meta, relation.size());
}
Also used: BitVector(de.lmu.ifi.dbs.elki.data.BitVector), ArrayList(java.util.ArrayList), Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration), FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult), DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic), IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress), LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
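
buildFPTree compresses all transactions into a prefix tree whose nodes carry support counts, so transactions sharing a prefix (in descending item-frequency order) are stored only once. A toy insertion sketch to illustrate the idea; this is not ELKI's FPTree/FPNode API:

import java.util.HashMap;
import java.util.Map;

public class ToyFPTree {
    final Map<Integer, ToyFPTree> children = new HashMap<>();
    int count = 0;

    /** Insert one transaction (items already mapped to indexes and sorted
     *  by descending global frequency, as in buildFPTree). */
    void insert(int[] items, int start) {
        count++;
        if (start < items.length) {
            children.computeIfAbsent(items[start], k -> new ToyFPTree())
                    .insert(items, start + 1);
        }
    }

    public static void main(String[] args) {
        ToyFPTree root = new ToyFPTree();
        root.insert(new int[] { 0, 1, 2 }, 0); // shares prefix {0, 1} with the next
        root.insert(new int[] { 0, 1 }, 0);
        root.insert(new int[] { 0, 3 }, 0);
        // The node for item 0 now has count 3; its child for item 1 has count 2.
        System.out.println(root.children.get(0).count);                 // 3
        System.out.println(root.children.get(0).children.get(1).count); // 2
    }
}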

Example 39 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class Eclat, method run.

/**
 * Run the Eclat algorithm
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable arrays, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());
    LOG.verbose("Build 1-dimensional transaction lists.");
    Duration ctime = LOG.newDuration(STAT + "eclat.transposition.time").begin();
    DBIDs[] idx = buildIndex(relation, dim, minsupp);
    LOG.statistics(ctime.end());
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", idx.length, LOG) : null;
    Duration etime = LOG.newDuration(STAT + "eclat.extraction.time").begin();
    final List<Itemset> solution = new ArrayList<>();
    for (int i = 0; i < idx.length; i++) {
        LOG.incrementProcessed(prog);
        extractItemsets(idx, i, minsupp, solution);
    }
    LOG.ensureCompleted(prog);
    Collections.sort(solution);
    LOG.statistics(etime.end());
    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("Eclat", "eclat", solution, meta, relation.size());
}
Also used: BitVector(de.lmu.ifi.dbs.elki.data.BitVector), LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic), ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs), DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs), HashSetDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetDBIDs), FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), ArrayList(java.util.ArrayList), Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration), FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult)
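
Eclat operates on the transposed representation built by buildIndex: one transaction-ID list per item, where the support of an itemset is the size of the intersection of its items' lists. A BitSet-based sketch of that core step (illustrative; ELKI uses its DBIDs types instead):

import java.util.BitSet;

public class EclatStep {
    public static void main(String[] args) {
        // Transposed data: one BitSet of transaction IDs per item.
        BitSet itemA = new BitSet();
        itemA.set(0); itemA.set(2); itemA.set(3);
        BitSet itemB = new BitSet();
        itemB.set(0); itemB.set(3); itemB.set(4);

        // Support of the itemset {A, B} = |tidlist(A) ∩ tidlist(B)|.
        BitSet both = (BitSet) itemA.clone();
        both.and(itemB);
        System.out.println(both.cardinality()); // 2 (transactions 0 and 3)

        // extractItemsets extends an itemset depth-first only while the
        // intersection stays at or above minsupp.
    }
}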

Example 40 with LongStatistic

Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.

The class NNDescent, method preprocess.

@Override
protected void preprocess() {
    final DBIDs ids = relation.getDBIDs();
    final long starttime = System.currentTimeMillis();
    IndefiniteProgress progress = LOG.isVerbose() ? new IndefiniteProgress("KNNGraph iteration", LOG) : null;
    // The query point itself is added at the end, so internally (k - 1) neighbors are used
    final int internal_k = k - 1;
    // kNN store
    store = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNHeap.class);
    // store for new reverse neighbors
    WritableDataStore<HashSetModifiableDBIDs> newReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // store for old reverse neighbors
    WritableDataStore<HashSetModifiableDBIDs> oldReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // Sample of new forward neighbors.
    WritableDataStore<HashSetModifiableDBIDs> sampleNewNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // Flags: per point, the set of neighbors currently marked as "new"
    WritableDataStore<HashSetModifiableDBIDs> flag = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // Initialize data structures:
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
        store.put(iditer, DBIDUtil.newHeap(internal_k));
        newReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
        oldReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
    }
    // this variable is the sampling size
    final int items = (int) Math.ceil(rho * internal_k);
    long counter_all = 0;
    // initialize neighbors (depends on -setInitialNeighbors option)
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
        // initialize sampled NN
        ModifiableDBIDs sampleNew = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
        sampleNewNeighbors.put(iditer, DBIDUtil.newHashSet(sampleNew));
        // initialize RNN
        ModifiableDBIDs sampleRev = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
        newReverseNeighbors.put(iditer, DBIDUtil.newHashSet(sampleRev));
        // initialize new neighbors
        flag.put(iditer, DBIDUtil.newHashSet());
        // initialize store
        if (!noInitialNeighbors) {
            HashSetModifiableDBIDs flags = flag.get(iditer);
            for (DBIDIter siter = sampleNew.iter(); siter.valid(); siter.advance()) {
                if (add(iditer, siter, distanceQuery.distance(iditer, siter))) {
                    flags.add(siter);
                }
            }
            counter_all += sampleNew.size();
        }
    }
    final int size = relation.size();
    double rate = 0.0;
    int iter = 0;
    for (; iter < iterations; iter++) {
        long counter = 0;
        // iterate through dataset
        for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            // determine new and old neighbors
            HashSetModifiableDBIDs newNeighbors = flag.get(iditer);
            HashSetModifiableDBIDs oldNeighbors = DBIDUtil.newHashSet();
            KNNHeap heap = store.get(iditer);
            for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
                if (!newNeighbors.contains(heapiter)) {
                    oldNeighbors.add(heapiter);
                }
            }
            // Sampling
            HashSetModifiableDBIDs sampleNew = sampleNewNeighbors.get(iditer);
            HashSetModifiableDBIDs newRev = newReverseNeighbors.get(iditer);
            newRev.removeDBIDs(sampleNew);
            boundSize(newRev, items);
            HashSetModifiableDBIDs oldRev = oldReverseNeighbors.get(iditer);
            oldRev.removeDBIDs(oldNeighbors);
            boundSize(oldRev, items);
            counter += processNewNeighbors(flag, sampleNew, oldNeighbors, newRev, oldRev);
        }
        counter_all += counter;
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(prefix + ".scan-rate", counter_all * .5 / (size * (size - 1L))));
        }
        // t is the number of new neighbors
        int t = sampleNew(ids, sampleNewNeighbors, flag, items);
        // calculate old and new reverse neighbors
        clearAll(ids, newReverseNeighbors);
        clearAll(ids, oldReverseNeighbors);
        reverse(sampleNewNeighbors, newReverseNeighbors, oldReverseNeighbors);
        rate = (double) t / (double) (internal_k * size);
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(prefix + ".update-rate", rate));
        }
        if (counter < delta * internal_k * size) {
            LOG.verbose("KNNGraph terminated because we performaned delta*k*size distance computations.");
            break;
        }
        if (rate < delta) {
            LOG.verbose("KNNGraph terminated because update rate got smaller than delta.");
            break;
        }
        LOG.incrementProcessed(progress);
    }
    if (LOG.isVerbose() && iter == iterations) {
        LOG.verbose("KNNGraph terminated because the maximum number of iterations was reached.");
    }
    LOG.setCompleted(progress);
    // convert store to storage
    storage = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_DB, KNNList.class);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        KNNHeap tempHeap = DBIDUtil.newHeap(k);
        // Add query point and convert heap to list:
        KNNHeap heap = store.get(iditer);
        tempHeap.insert(0, iditer);
        for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
            tempHeap.insert(heapiter.doubleValue(), heapiter);
        }
        storage.put(iditer, tempHeap.toKNNList());
    }
    final long end = System.currentTimeMillis();
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(prefix + ".construction-time.ms", end - starttime));
    }
}
Also used: DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic), IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress), LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
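
The loop in preprocess() stops early on either of two signals: too few distance computations in an iteration (counter) or a kNN update rate below delta. A condensed sketch of just that stopping logic, with both signals folded into a single mock update count (illustrative names, not the ELKI implementation):

import java.util.function.IntUnaryOperator;

public class NNDescentStop {
    /** Returns the iteration at which a mock NN-descent loop would stop. */
    static int runUntilConverged(int size, int k, double delta, int maxIter,
                                 IntUnaryOperator updatesInIteration) {
        for (int iter = 0; iter < maxIter; iter++) {
            int updates = updatesInIteration.applyAsInt(iter);
            double rate = updates / (double) (k * size);
            if (updates < delta * k * size || rate < delta) {
                return iter; // converged: too few updates to continue
            }
        }
        return maxIter; // iteration limit reached
    }

    public static void main(String[] args) {
        // Mock workload: updates halve each iteration, starting from k*size.
        int size = 1000, k = 10;
        System.out.println(runUntilConverged(size, k, 0.001, 100,
                iter -> iter < 31 ? (k * size) >> iter : 0)); // stops after ~10 iterations
    }
}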

Aggregations

LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic): 44 uses
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 27 uses
ArrayList (java.util.ArrayList): 20 uses
StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic): 19 uses
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 17 uses
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 14 uses
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 14 uses
IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress): 14 uses
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 12 uses
WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore): 11 uses
KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel): 10 uses
NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 9 uses
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 8 uses
EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult): 7 uses
MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup): 7 uses
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 5 uses
MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance): 5 uses
WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore): 4 uses
Logging (de.lmu.ifi.dbs.elki.logging.Logging): 4 uses
Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration): 4 uses