Use of de.lmu.ifi.dbs.elki.utilities.exceptions.InconsistentDataException in the project elki by elki-project.
The method aprioriGenerate of the class APRIORI.
/**
 * Generates candidate itemsets of the given length by joining pairs of
 * supported itemsets that share a common prefix, and prunes every candidate
 * that has a subset of {@code length - 1} items which is not contained in
 * {@code supported} (apriori pruning).
 *
 * @param supported Supported itemsets of the previous level (expected to have
 *        {@code length - 1} items each). The list is searched with
 *        {@link Collections#binarySearch}, so it must be sorted in the
 *        natural order of the itemsets.
 * @param length Itemset length of the candidates to generate
 * @param dim Dimensionality (size of the bit sets used by dense itemsets)
 * @return itemsets that cannot be pruned by apriori
 * @throws InconsistentDataException if the itemsets are neither sparse nor
 *         dense itemsets
 */
protected List<Itemset> aprioriGenerate(List<? extends Itemset> supported, int length, int dim) {
  // Too few supported itemsets to build any candidate of this length.
  if (supported.size() < length) {
    return Collections.emptyList();
  }
  long joined = 0L;
  final int ssize = supported.size();
  List<Itemset> candidateList = new ArrayList<>();

  // The representation of the first itemset decides which branch is used; all
  // other itemsets are cast to the same type.
  Itemset ref = supported.get(0);
  if (ref instanceof SparseItemset) {
    // TODO: we currently never switch to DenseItemSet. This may however be
    // beneficial when we have few dimensions and many candidates.
    // E.g. when length > 32 and dim < 100. But this needs benchmarking!
    // For length < 5 and dim > 3000, SparseItemset unsurprisingly was faster

    // Scratch item to use for searching.
    SparseItemset scratch = new SparseItemset(new int[length - 1]);

    for (int i = 0; i < ssize; i++) {
      SparseItemset ii = (SparseItemset) supported.get(i);
      prefix: for (int j = i + 1; j < ssize; j++) {
        SparseItemset ij = (SparseItemset) supported.get(j);
        if (!ii.prefixTest(ij)) {
          // Prefix doesn't match
          break prefix;
        }
        joined++;
        // Test subsets (re-) using scratch object. The candidate consists of
        // all items of ii plus the last item of ij. Dropping either of the
        // last two items yields ij or ii, which are known to be supported.
        // Start with the shared prefix plus the last item of ij; then,
        // walking backwards, moving ii.indices[k + 1] into slot k produces
        // the candidate without ii.indices[k], so that every remaining
        // subset is searched exactly once.
        System.arraycopy(ii.indices, 0, scratch.indices, 0, length - 2);
        scratch.indices[length - 2] = ij.indices[length - 2];
        for (int k = length - 3; k >= 0; k--) {
          scratch.indices[k] = ii.indices[k + 1];
          int pos = Collections.binarySearch(supported, scratch);
          if (pos < 0) {
            // Prefix was okay, but one other subset was not frequent
            continue prefix;
          }
        }
        // All subsets were found: emit ii extended by the last item of ij.
        int[] items = new int[length];
        System.arraycopy(ii.indices, 0, items, 0, length - 1);
        items[length - 1] = ij.indices[length - 2];
        candidateList.add(new SparseItemset(items));
      }
    }
  } else if (ref instanceof DenseItemset) {
    // Scratch item to use for searching.
    DenseItemset scratch = new DenseItemset(BitsUtil.zero(dim), length - 1);

    for (int i = 0; i < ssize; i++) {
      DenseItemset ii = (DenseItemset) supported.get(i);
      prefix: for (int j = i + 1; j < ssize; j++) {
        DenseItemset ij = (DenseItemset) supported.get(j);
        // Prefix test via "|i1 ^ i2| = 2"
        System.arraycopy(ii.items, 0, scratch.items, 0, ii.items.length);
        BitsUtil.xorI(scratch.items, ij.items);
        if (BitsUtil.cardinality(scratch.items) != 2) {
          // No prefix match; since sorted, no more can follow!
          break prefix;
        }
        ++joined;
        // Ensure that the first difference is the last item in ii:
        int first = BitsUtil.nextSetBit(scratch.items, 0);
        if (BitsUtil.nextSetBit(ii.items, first + 1) > -1) {
          // Different overlap by chance?
          break prefix;
        }
        // (ii ^ ij) | ij is the union of both itemsets, i.e. the candidate.
        BitsUtil.orI(scratch.items, ij.items);

        // Test subsets: temporarily clear each of the first length - 2 bits.
        // The last two bits are skipped, as removing them yields ii or ij.
        for (int l = length, b = BitsUtil.nextSetBit(scratch.items, 0); l > 2; l--, b = BitsUtil.nextSetBit(scratch.items, b + 1)) {
          BitsUtil.clearI(scratch.items, b);
          int pos = Collections.binarySearch(supported, scratch);
          if (pos < 0) {
            // Scratch is overwritten at the start of the next iteration, so
            // the cleared bit does not need to be restored here.
            continue prefix;
          }
          BitsUtil.setI(scratch.items, b);
        }
        // Clone the bits, because scratch is reused for the next pair.
        candidateList.add(new DenseItemset(scratch.items.clone(), length));
      }
    }
  } else {
    throw new InconsistentDataException("Unexpected itemset type " + ref.getClass());
  }
  if (LOG.isStatistics()) {
    // Naive pairwise approach
    LOG.statistics(new LongStatistic(STAT + length + "-items.pairwise", (ssize * ((long) ssize - 1))));
    LOG.statistics(new LongStatistic(STAT + length + "-items.joined", joined));
    LOG.statistics(new LongStatistic(STAT + length + "-items.candidates", candidateList.size()));
  }
  // Candidates are appended while iterating over the sorted input.
  // So we do not need to sort here.
  return candidateList;
}
Aggregations