Search in sources :

Example 6 with BitVector

use of de.lmu.ifi.dbs.elki.data.BitVector in project elki by elki-project.

Method run of the class FPGrowth.

/**
 * Mine frequent itemsets from a bit vector relation with FP-Growth.
 *
 * The method counts the support of each item, builds a forward and a backward
 * item index from those counts, constructs the FP-tree, and then extracts the
 * itemsets from the tree. Extracted itemsets are translated back to the
 * original item numbers, sorted, and wrapped in the result object. Timing and
 * size statistics are reported to the logger along the way.
 *
 * @param db Database to process (not referenced in this method body)
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable array, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Absolute minimum support, derived from the number of transactions.
    final int minsupp = getMinimumSupport(relation.size());

    LOG.verbose("Finding item frequencies for ordering.");
    final int[] itemSupport = countItemSupport(relation, dim);
    // 'toOriginal' is the forward index (used below to translate tree item
    // numbers back to original items); 'fromOriginal' is the backward index
    // handed to the tree construction.
    final int[] fromOriginal = new int[dim];
    final int[] toOriginal = buildIndex(itemSupport, fromOriginal, minsupp);
    final int numItems = toOriginal.length;

    LOG.statistics(new LongStatistic(STAT + "raw-items", dim));
    LOG.statistics(new LongStatistic(STAT + "raw-transactions", relation.size()));
    LOG.statistics(new DoubleStatistic(STAT + "minsupp-relative", minsupp / (double) relation.size()));
    LOG.statistics(new LongStatistic(STAT + "minsupp-absolute", minsupp));

    LOG.verbose("Building FP-Tree.");
    final Duration buildTime = LOG.newDuration(STAT + "fp-tree.construction.time").begin();
    final FPTree tree = buildFPTree(relation, fromOriginal, numItems);
    if (LOG.isStatistics()) {
        tree.logStatistics();
    }
    if (LOG.isDebuggingFinest()) {
        final StringBuilder dump = new StringBuilder(10000).append("FP-tree:\n");
        tree.appendTo(dump, new FPNode.Translator() {

            @Override
            public StringBuilder appendTo(StringBuilder out, int item) {
                // Prefer the column label; fall back to the internal number.
                final String label = meta.getLabel(toOriginal[item]);
                if (label == null) {
                    return out.append(item);
                }
                return out.append(label);
            }
        });
        LOG.debugFinest(dump.toString());
    }
    // Reduce memory usage:
    tree.reduceMemory();
    LOG.statistics(buildTime.end());

    LOG.verbose("Extracting frequent patterns.");
    final Duration extractTime = LOG.newDuration(STAT + "fp-growth.extraction.time").begin();
    final IndefiniteProgress progress = LOG.isVerbose() ? new IndefiniteProgress("Frequent itemsets", LOG) : null;
    final List<Itemset> found = new ArrayList<>();
    // Start extraction with the least frequent items
    tree.extract(minsupp, minlength, maxlength, true, new FPTree.Collector() {

        @Override
        public void collect(int support, int[] data, int start, int plen) {
            // Always translate the indexes back to the original values!
            final int length = plen - start;
            final Itemset itemset;
            if (length == 1) {
                itemset = new OneItemset(toOriginal[data[start]], support);
            } else {
                // Copy out of the shared buffer into permanent storage,
                // translating to original items on the way.
                final int[] members = new int[length];
                for (int k = 0; k < length; k++) {
                    members[k] = toOriginal[data[start + k]];
                }
                Arrays.sort(members);
                itemset = new SparseItemset(members, support);
            }
            found.add(itemset);
            LOG.incrementProcessed(progress);
        }
    });
    LOG.setCompleted(progress);
    Collections.sort(found);
    LOG.statistics(extractTime.end());
    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", found.size()));
    return new FrequentItemsetsResult("FP-Growth", "fp-growth", found, meta, relation.size());
}
Also used : BitVector(de.lmu.ifi.dbs.elki.data.BitVector) ArrayList(java.util.ArrayList) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 7 with BitVector

use of de.lmu.ifi.dbs.elki.data.BitVector in project elki by elki-project.

Method run of the class Eclat.

/**
 * Mine frequent itemsets from a bit vector relation with Eclat.
 *
 * The method first builds one transaction list per item, then extracts the
 * itemsets starting from each list in turn. The collected itemsets are sorted
 * and wrapped in the result object. Timing statistics for both phases are
 * reported to the logger.
 *
 * @param db Database to process (not referenced in this method body)
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable arrays, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Absolute minimum support, derived from the number of transactions.
    final int minsupp = getMinimumSupport(relation.size());

    // Phase 1: transpose the data into per-item transaction lists.
    LOG.verbose("Build 1-dimensional transaction lists.");
    final Duration transposeTime = LOG.newDuration(STAT + "eclat.transposition.time").begin();
    final DBIDs[] tidLists = buildIndex(relation, dim, minsupp);
    LOG.statistics(transposeTime.end());

    // Phase 2: extract itemsets, one starting list at a time.
    final int numLists = tidLists.length;
    final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", numLists, LOG) : null;
    final Duration extractTime = LOG.newDuration(STAT + "eclat.extraction.time").begin();
    final List<Itemset> found = new ArrayList<>();
    int pos = 0;
    while (pos < numLists) {
        LOG.incrementProcessed(progress);
        extractItemsets(tidLists, pos, minsupp, found);
        pos++;
    }
    LOG.ensureCompleted(progress);
    Collections.sort(found);
    LOG.statistics(extractTime.end());
    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", found.size()));
    return new FrequentItemsetsResult("Eclat", "eclat", found, meta, relation.size());
}
Also used : BitVector(de.lmu.ifi.dbs.elki.data.BitVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) HashSetDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetDBIDs) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) ArrayList(java.util.ArrayList) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult)

Example 8 with BitVector

use of de.lmu.ifi.dbs.elki.data.BitVector in project elki by elki-project.

Method nextEvent of the class SimpleTransactionParser.

/**
 * Advance the parser by one event.
 *
 * If an event was queued by a previous call, it is returned first. Otherwise
 * the next non-comment line is parsed into a {@link BitVector} (stored in
 * {@code curvec}) and {@code Event.NEXT_OBJECT} is returned. When no further
 * line is available, the final metadata with all column names is built,
 * {@code Event.END_OF_STREAM} is queued for the following call, and
 * {@code Event.META_CHANGED} is returned.
 *
 * @return the next stream event
 * @throws IllegalArgumentException if reading the input fails; the
 *         underlying {@link IOException} is attached as the cause
 */
@Override
public Event nextEvent() {
    // Deliver a queued event first.
    if (nextevent != null) {
        Event ret = nextevent;
        nextevent = null;
        return ret;
    }
    try {
        // Each call consumes at most one line, so a plain 'if' suffices.
        if (reader.nextLineExceptComments()) {
            // The buffer is reused across lines; toLongArray() below hands a
            // separate array to the BitVector.
            buf.clear();
            // The tokenizer is initialized by nextLineExceptComments().
            for (; tokenizer.valid(); tokenizer.advance()) {
                String token = tokenizer.getSubstring();
                int t = keymap.getInt(token);
                // NOTE(review): presumably keymap's default return value is
                // negative for unknown tokens - confirm where it is created.
                if (t < 0) {
                    // Unseen token: assign the next free column number.
                    t = keymap.size();
                    keymap.put(token, t);
                }
                // 64 bits per long: word index and bit offset within it.
                final int word = t >>> 6;
                final int off = t & 0x3F;
                while (word >= buf.size()) {
                    // Ensure size.
                    buf.add(0L);
                }
                buf.set(word, buf.getLong(word) | (1L << off));
            }
            curvec = new BitVector(buf.toLongArray(), keymap.size());
            return Event.NEXT_OBJECT;
        }
        nextevent = Event.END_OF_STREAM;
        // Construct final metadata:
        meta = new BundleMeta(1);
        String[] colnames = new String[keymap.size()];
        for (ObjectIterator<Object2IntMap.Entry<String>> iter = keymap.object2IntEntrySet().fastIterator(); iter.hasNext(); ) {
            Object2IntMap.Entry<String> entry = iter.next();
            colnames[entry.getIntValue()] = entry.getKey();
        }
        meta.add(new VectorFieldTypeInformation<>(BitVector.FACTORY, colnames.length, colnames));
        // Force a final meta update.
        return Event.META_CHANGED;
    } catch (IOException e) {
        // Keep the original exception as cause so the I/O failure stays
        // diagnosable.
        throw new IllegalArgumentException("Error while parsing line " + reader.getLineNumber() + ".", e);
    }
}
Also used : BitVector(de.lmu.ifi.dbs.elki.data.BitVector) BundleMeta(de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta) Object2IntMap(it.unimi.dsi.fastutil.objects.Object2IntMap) IOException(java.io.IOException)

Aggregations

BitVector (de.lmu.ifi.dbs.elki.data.BitVector)8 ArrayList (java.util.ArrayList)5 FrequentItemsetsResult (de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult)4 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)3 Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration)3 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)3 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)2 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)2 APRIORI (de.lmu.ifi.dbs.elki.algorithm.itemsetmining.APRIORI)1 Itemset (de.lmu.ifi.dbs.elki.algorithm.itemsetmining.Itemset)1 HashmapDatabase (de.lmu.ifi.dbs.elki.database.HashmapDatabase)1 UpdatableDatabase (de.lmu.ifi.dbs.elki.database.UpdatableDatabase)1 HashSetDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetDBIDs)1 BundleMeta (de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta)1 SingleObjectBundle (de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle)1 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)1 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)1 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)1 Object2IntMap (it.unimi.dsi.fastutil.objects.Object2IntMap)1 IOException (java.io.IOException)1