Search in sources :

Example 1 with InconsistentDataException

Use of de.lmu.ifi.dbs.elki.utilities.exceptions.InconsistentDataException in project elki by elki-project.

The method aprioriGenerate of the class APRIORI.

/**
 * Generates candidate itemsets of size {@code length} by joining pairs of
 * itemsets from {@code supported} that share a common prefix, and prunes every
 * candidate that has a subset of size {@code length - 1} which is not contained
 * in {@code supported} (apriori principle).
 *
 * The list is searched with {@link Collections#binarySearch}, so it must be
 * sorted according to the natural ordering of the itemsets. The itemset type
 * (sparse or dense) is decided by looking at the first element only; all
 * elements are then cast to that type.
 *
 * @param supported Sorted list of the supported (frequent) itemsets; the code
 *        indexes them as having {@code length - 1} items each
 * @param length Itemset length of the candidates to generate
 * @param dim Dimensionality (size of the scratch bitset for dense itemsets)
 * @return itemsets that cannot be pruned by apriori; an empty list if
 *         {@code supported} has fewer than {@code length} entries
 * @throws InconsistentDataException if the first itemset is neither a
 *         {@code SparseItemset} nor a {@code DenseItemset}
 */
protected List<Itemset> aprioriGenerate(List<? extends Itemset> supported, int length, int dim) {
    if (supported.size() < length) {
        return Collections.emptyList();
    }
    // Number of pairs that passed the prefix test (statistics only).
    long joined = 0L;
    final int ssize = supported.size();
    List<Itemset> candidateList = new ArrayList<>();
    Itemset ref = supported.get(0);
    if (ref instanceof SparseItemset) {
        // TODO: we currently never switch to DenseItemSet. This may however be
        // beneficial when we have few dimensions and many candidates.
        // E.g. when length > 32 and dim < 100. But this needs benchmarking!
        // For length < 5 and dim > 3000, SparseItemset unsurprisingly was faster
        // Scratch item to use for searching.
        SparseItemset scratch = new SparseItemset(new int[length - 1]);
        for (int i = 0; i < ssize; i++) {
            SparseItemset ii = (SparseItemset) supported.get(i);
            prefix: for (int j = i + 1; j < ssize; j++) {
                SparseItemset ij = (SparseItemset) supported.get(j);
                if (!ii.prefixTest(ij)) {
                    // Prefix doesn't match
                    break prefix;
                }
                joined++;
                // The candidate is ii plus the last item of ij. Dropping the last
                // item of ii or of ij yields ij resp. ii, which are known to be
                // supported, so only the subsets that omit one of the first
                // (length - 2) items of ii remain to be tested.
                //
                // Seed the scratch object with the common prefix followed by the
                // last item of ij. The copy must start at position 0: starting at
                // position 1 would make the assignments in the loop below no-ops
                // and test the same subset over and over.
                System.arraycopy(ii.indices, 0, scratch.indices, 0, length - 2);
                scratch.indices[length - 2] = ij.indices[length - 2];
                for (int k = length - 3; k >= 0; k--) {
                    // Move the omitted item one slot to the left: scratch now holds
                    // the candidate without ii.indices[k].
                    scratch.indices[k] = ii.indices[k + 1];
                    int pos = Collections.binarySearch(supported, scratch);
                    if (pos < 0) {
                        // Prefix was okay, but one other subset was not frequent
                        continue prefix;
                    }
                }
                // All subsets are supported: materialize the candidate.
                int[] items = new int[length];
                System.arraycopy(ii.indices, 0, items, 0, length - 1);
                items[length - 1] = ij.indices[length - 2];
                candidateList.add(new SparseItemset(items));
            }
        }
    } else if (ref instanceof DenseItemset) {
        // Scratch item to use for searching.
        DenseItemset scratch = new DenseItemset(BitsUtil.zero(dim), length - 1);
        for (int i = 0; i < ssize; i++) {
            DenseItemset ii = (DenseItemset) supported.get(i);
            prefix: for (int j = i + 1; j < ssize; j++) {
                DenseItemset ij = (DenseItemset) supported.get(j);
                // Prefix test via "|i1 ^ i2| = 2"
                System.arraycopy(ii.items, 0, scratch.items, 0, ii.items.length);
                BitsUtil.xorI(scratch.items, ij.items);
                if (BitsUtil.cardinality(scratch.items) != 2) {
                    // No prefix match; since sorted, no more can follow!
                    break prefix;
                }
                ++joined;
                // Ensure that the first difference is the last item in ii:
                int first = BitsUtil.nextSetBit(scratch.items, 0);
                if (BitsUtil.nextSetBit(ii.items, first + 1) > -1) {
                    // Different overlap by chance?
                    break prefix;
                }
                // (ii ^ ij) | ij == ii | ij: scratch now holds the candidate.
                BitsUtil.orI(scratch.items, ij.items);
                // Test subsets: temporarily clear each of the first (length - 2)
                // set bits and look the remaining itemset up. If a lookup fails,
                // the bit stays cleared; scratch is overwritten on the next join.
                for (int l = length, b = BitsUtil.nextSetBit(scratch.items, 0); l > 2; l--, b = BitsUtil.nextSetBit(scratch.items, b + 1)) {
                    BitsUtil.clearI(scratch.items, b);
                    int pos = Collections.binarySearch(supported, scratch);
                    if (pos < 0) {
                        continue prefix;
                    }
                    BitsUtil.setI(scratch.items, b);
                }
                // Clone, because the scratch bitset is reused for the next pair.
                candidateList.add(new DenseItemset(scratch.items.clone(), length));
            }
        }
    } else {
        throw new InconsistentDataException("Unexpected itemset type " + ref.getClass());
    }
    if (LOG.isStatistics()) {
        // Naive pairwise approach
        LOG.statistics(new LongStatistic(STAT + length + "-items.pairwise", (ssize * ((long) ssize - 1))));
        LOG.statistics(new LongStatistic(STAT + length + "-items.joined", joined));
        LOG.statistics(new LongStatistic(STAT + length + "-items.candidates", candidateList.size()));
    }
    // Candidates are appended in the iteration order of the sorted input, so
    // no sorting is performed here.
    return candidateList;
}
Also used : LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayList(java.util.ArrayList) InconsistentDataException(de.lmu.ifi.dbs.elki.utilities.exceptions.InconsistentDataException)

Aggregations

LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)1 InconsistentDataException (de.lmu.ifi.dbs.elki.utilities.exceptions.InconsistentDataException)1 ArrayList (java.util.ArrayList)1