Search in sources :

Example 1 with FrequentItemsetsResult

use of de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult in project elki by elki-project.

The class FPGrowthTest, method testLarge().

@Test
public void testLarge() {
    // Load the 16401-transaction grocery data set used by the itemset tests.
    Database db = loadTransactions(UNITTEST + "itemsets/zutaten.txt.gz", 16401);
    FPGrowth fpgrowth = new ELKIBuilder<>(FPGrowth.class) //
        .with(FPGrowth.Parameterizer.MINSUPP_ID, 200) //
        .build();
    FrequentItemsetsResult res = fpgrowth.run(db);
    // At absolute minimum support 200, exactly 184 frequent itemsets exist.
    assertEquals("Size not as expected.", 184, res.getItemsets().size());
}
Also used : Database(de.lmu.ifi.dbs.elki.database.Database) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) Test(org.junit.Test)

Example 2 with FrequentItemsetsResult

use of de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult in project elki by elki-project.

The class DiSHPreferenceVectorIndex, method determinePreferenceVectorByApriori().

/**
 * Determines the preference vector with the apriori strategy.
 *
 * Each object is encoded as a bit-vector "transaction" over the dimensions
 * in which it appears as a neighbor; APRIORI is then run on these
 * transactions, and the preference vector is taken from a frequent itemset
 * of maximal cardinality.
 *
 * @param relation the database storing the objects
 * @param neighborIDs the list of ids of the neighbors in each dimension
 * @param msg a string buffer for debug messages (may be null to disable)
 * @return the preference vector
 */
private long[] determinePreferenceVectorByApriori(Relation<V> relation, ModifiableDBIDs[] neighborIDs, StringBuilder msg) {
    int dimensionality = neighborIDs.length;
    // Temporary database holding one bit-vector transaction per object.
    UpdatableDatabase apriori_db = new HashmapDatabase();
    SimpleTypeInformation<?> bitmeta = VectorFieldTypeInformation.typeRequest(BitVector.class, dimensionality, dimensionality);
    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        // Bit d is set iff this object is a neighbor in dimension d.
        long[] bits = BitsUtil.zero(dimensionality);
        boolean allFalse = true;
        for (int d = 0; d < dimensionality; d++) {
            if (neighborIDs[d].contains(it)) {
                BitsUtil.setI(bits, d);
                allFalse = false;
            }
        }
        // Skip all-zero transactions: they cannot contribute to any itemset.
        if (!allFalse) {
            SingleObjectBundle oaa = new SingleObjectBundle();
            oaa.append(bitmeta, new BitVector(bits, dimensionality));
            apriori_db.insert(oaa);
        }
    }
    // Mine frequent itemsets with minimum support = minpts.
    APRIORI apriori = new APRIORI(minpts);
    FrequentItemsetsResult aprioriResult = apriori.run(apriori_db);
    // result of apriori
    List<Itemset> frequentItemsets = aprioriResult.getItemsets();
    if (msg != null) {
        msg.append("\n Frequent itemsets: ").append(frequentItemsets);
    }
    // Pick a frequent itemset of maximum cardinality as the preference vector.
    int maxSupport = 0;
    int maxCardinality = 0;
    long[] preferenceVector = BitsUtil.zero(dimensionality);
    for (Itemset itemset : frequentItemsets) {
        // NOTE(review): on equal cardinality this replaces the winner only
        // when the support is *exactly equal* to the current maximum, never
        // when it is greater. Possibly "maxSupport < itemset.getSupport()"
        // was intended -- verify against the DiSH publication before changing.
        if ((maxCardinality < itemset.length()) || (maxCardinality == itemset.length() && maxSupport == itemset.getSupport())) {
            preferenceVector = Itemset.toBitset(itemset, BitsUtil.zero(dimensionality));
            maxCardinality = itemset.length();
            maxSupport = itemset.getSupport();
        }
    }
    if (msg != null) {
        // 
        msg.append("\n preference ").append(// 
        BitsUtil.toStringLow(preferenceVector, dimensionality)).append('\n');
        LOG.debugFine(msg.toString());
    }
    return preferenceVector;
}
Also used : UpdatableDatabase(de.lmu.ifi.dbs.elki.database.UpdatableDatabase) BitVector(de.lmu.ifi.dbs.elki.data.BitVector) HashmapDatabase(de.lmu.ifi.dbs.elki.database.HashmapDatabase) SingleObjectBundle(de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) Itemset(de.lmu.ifi.dbs.elki.algorithm.itemsetmining.Itemset) APRIORI(de.lmu.ifi.dbs.elki.algorithm.itemsetmining.APRIORI)

Example 3 with FrequentItemsetsResult

use of de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult in project elki by elki-project.

The class APRIORI, method run().

/**
 * Performs the APRIORI algorithm on the given database.
 *
 * Level-wise mining: frequent 1-itemsets, then 2-itemsets, then candidate
 * generation + support counting for each longer length until no further
 * candidates survive. Transactions that no longer support any candidate are
 * dropped between levels to shrink the scan set.
 *
 * @param relation the Relation to process
 * @return the AprioriResult learned by this APRIORI
 */
public FrequentItemsetsResult run(Relation<BitVector> relation) {
    DBIDs ids = relation.getDBIDs();
    List<Itemset> solution = new ArrayList<>();
    final int size = ids.size();
    // Absolute minimum support derived from the configured threshold.
    final int needed = getMinimumSupport(size);
    // TODO: we don't strictly require a vector field.
    // We could work with knowing just the maximum dimensionality beforehand.
    VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    if (size > 0) {
        final int dim = meta.getDimensionality();
        // Level 1: frequent single items.
        Duration timeone = LOG.newDuration(STAT + "1-items.time").begin();
        List<OneItemset> oneitems = buildFrequentOneItemsets(relation, dim, needed);
        LOG.statistics(timeone.end());
        if (LOG.isStatistics()) {
            LOG.statistics(new LongStatistic(STAT + "1-items.frequent", oneitems.size()));
            LOG.statistics(new LongStatistic(STAT + "1-items.transactions", ids.size()));
        }
        if (LOG.isDebuggingFine()) {
            LOG.debugFine(debugDumpCandidates(new StringBuilder(), oneitems, meta));
        }
        // Only report lengths within the configured [minlength, maxlength].
        if (minlength <= 1) {
            solution.addAll(oneitems);
        }
        // Level 2 requires at least two frequent items.
        if (oneitems.size() >= 2 && maxlength >= 2) {
            Duration timetwo = LOG.newDuration(STAT + "2-items.time").begin();
            // Collects the transactions that still support some candidate.
            ArrayModifiableDBIDs survivors = DBIDUtil.newArray(ids.size());
            List<? extends Itemset> candidates = buildFrequentTwoItemsets(oneitems, relation, dim, needed, ids, survivors);
            // Continue with reduced set of transactions.
            ids = survivors;
            LOG.statistics(timetwo.end());
            if (LOG.isStatistics()) {
                LOG.statistics(new LongStatistic(STAT + "2-items.frequent", candidates.size()));
                LOG.statistics(new LongStatistic(STAT + "2-items.transactions", ids.size()));
            }
            if (LOG.isDebuggingFine()) {
                LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
            }
            if (minlength <= 2) {
                solution.addAll(candidates);
            }
            // Levels 3..maxlength: stop once fewer candidates remain than are
            // needed to join a candidate of the next length.
            for (int length = 3; length <= maxlength && candidates.size() >= length; length++) {
                Duration timel = LOG.newDuration(STAT + length + "-items.time").begin();
                // Join to get the new candidates
                candidates = aprioriGenerate(candidates, length, dim);
                if (LOG.isDebuggingFinest()) {
                    LOG.debugFinest(debugDumpCandidates(new StringBuilder().append("Before pruning: "), candidates, meta));
                }
                // Count support and prune infrequent candidates.
                survivors = DBIDUtil.newArray(ids.size());
                candidates = frequentItemsets(candidates, relation, needed, ids, survivors, length);
                // Continue with reduced set of transactions.
                ids = survivors;
                LOG.statistics(timel.end());
                if (LOG.isStatistics()) {
                    LOG.statistics(new LongStatistic(STAT + length + "-items.frequent", candidates.size()));
                    LOG.statistics(new LongStatistic(STAT + length + "-items.transactions", ids.size()));
                }
                if (LOG.isDebuggingFine()) {
                    LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
                }
                solution.addAll(candidates);
            }
        }
    }
    return new FrequentItemsetsResult("APRIORI", "apriori", solution, meta, size);
}
Also used : BitVector(de.lmu.ifi.dbs.elki.data.BitVector) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ArrayList(java.util.ArrayList) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 4 with FrequentItemsetsResult

use of de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult in project elki by elki-project.

The class FPGrowth, method run().

/**
 * Run the FP-Growth algorithm.
 *
 * Counts item frequencies, builds an FP-tree over the frequency-reordered
 * items, then recursively extracts frequent patterns from the tree.
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable array, to not need dim.
    final int dim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());
    LOG.verbose("Finding item frequencies for ordering.");
    final int[] counts = countItemSupport(relation, dim);
    // Forward and backward indexes mapping between original item numbers and
    // the frequency-ordered item numbers used inside the FP-tree.
    int[] iidx = new int[dim];
    final int[] idx = buildIndex(counts, iidx, minsupp);
    final int items = idx.length;
    LOG.statistics(new LongStatistic(STAT + "raw-items", dim));
    LOG.statistics(new LongStatistic(STAT + "raw-transactions", relation.size()));
    LOG.statistics(new DoubleStatistic(STAT + "minsupp-relative", minsupp / (double) relation.size()));
    LOG.statistics(new LongStatistic(STAT + "minsupp-absolute", minsupp));
    LOG.verbose("Building FP-Tree.");
    Duration ctime = LOG.newDuration(STAT + "fp-tree.construction.time").begin();
    FPTree tree = buildFPTree(relation, iidx, items);
    if (LOG.isStatistics()) {
        tree.logStatistics();
    }
    if (LOG.isDebuggingFinest()) {
        // Dump the full tree, translating node indexes back to item labels.
        StringBuilder buf = new StringBuilder(10000).append("FP-tree:\n");
        tree.appendTo(buf, new FPNode.Translator() {

            @Override
            public StringBuilder appendTo(StringBuilder buf, int i) {
                String l = meta.getLabel(idx[i]);
                // Fall back to the numeric index when no label is available.
                return (l != null) ? buf.append(l) : buf.append(i);
            }
        });
        LOG.debugFinest(buf.toString());
    }
    // Reduce memory usage:
    tree.reduceMemory();
    LOG.statistics(ctime.end());
    LOG.verbose("Extracting frequent patterns.");
    Duration etime = LOG.newDuration(STAT + "fp-growth.extraction.time").begin();
    final IndefiniteProgress itemp = LOG.isVerbose() ? new IndefiniteProgress("Frequent itemsets", LOG) : null;
    final List<Itemset> solution = new ArrayList<>();
    // Start extraction with the least frequent items
    tree.extract(minsupp, minlength, maxlength, true, new FPTree.Collector() {

        @Override
        public void collect(int support, int[] data, int start, int plen) {
            // Always translate the indexes back to the original values via 'idx'!
            if (plen - start == 1) {
                // Single-item pattern: cheaper dedicated representation.
                solution.add(new OneItemset(idx[data[start]], support));
                LOG.incrementProcessed(itemp);
                return;
            }
            // Copy from buffer to a permanent storage
            int[] indices = new int[plen - start];
            for (int i = start, j = 0; i < plen; i++) {
                // Translate to original items
                indices[j++] = idx[data[i]];
            }
            // Itemset representations expect sorted item indexes.
            Arrays.sort(indices);
            solution.add(new SparseItemset(indices, support));
            LOG.incrementProcessed(itemp);
        }
    });
    LOG.setCompleted(itemp);
    Collections.sort(solution);
    LOG.statistics(etime.end());
    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("FP-Growth", "fp-growth", solution, meta, relation.size());
}
Also used : BitVector(de.lmu.ifi.dbs.elki.data.BitVector) ArrayList(java.util.ArrayList) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 5 with FrequentItemsetsResult

use of de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult in project elki by elki-project.

The class APRIORITest, method testLarge().

@Test
public void testLarge() {
    // Load the 16401-transaction grocery data set used by the itemset tests.
    Database db = loadTransactions(UNITTEST + "itemsets/zutaten.txt.gz", 16401);
    APRIORI apriori = new ELKIBuilder<>(APRIORI.class) //
        .with(APRIORI.Parameterizer.MINSUPP_ID, 200) //
        .build();
    FrequentItemsetsResult res = apriori.run(db);
    // At absolute minimum support 200, exactly 184 frequent itemsets exist.
    assertEquals("Size not as expected.", 184, res.getItemsets().size());
}
Also used : Database(de.lmu.ifi.dbs.elki.database.Database) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) Test(org.junit.Test)

Aggregations

FrequentItemsetsResult (de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult)7 BitVector (de.lmu.ifi.dbs.elki.data.BitVector)4 Database (de.lmu.ifi.dbs.elki.database.Database)3 Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration)3 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)3 ArrayList (java.util.ArrayList)3 Test (org.junit.Test)3 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)2 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)2 APRIORI (de.lmu.ifi.dbs.elki.algorithm.itemsetmining.APRIORI)1 Itemset (de.lmu.ifi.dbs.elki.algorithm.itemsetmining.Itemset)1 HashmapDatabase (de.lmu.ifi.dbs.elki.database.HashmapDatabase)1 UpdatableDatabase (de.lmu.ifi.dbs.elki.database.UpdatableDatabase)1 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)1 HashSetDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetDBIDs)1 SingleObjectBundle (de.lmu.ifi.dbs.elki.datasource.bundle.SingleObjectBundle)1 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)1 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)1 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)1