Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
Class APRIORI, method aprioriGenerate.
/**
* Generates candidate itemsets of the given length by joining the supported
* itemsets of the previous length, and keeps only those candidates for which
* every subset obtained by removing one item is frequent already.
*
* @param supported List of supported itemsets of length - 1
* @param length Target itemset length
* @param dim Dimensionality
* @return candidate itemsets that cannot be pruned by the apriori property
*/
protected List<Itemset> aprioriGenerate(List<? extends Itemset> supported, int length, int dim) {
if (supported.size() < length) {
return Collections.emptyList();
}
long joined = 0L;
final int ssize = supported.size();
List<Itemset> candidateList = new ArrayList<>();
Itemset ref = supported.get(0);
if (ref instanceof SparseItemset) {
// TODO: we currently never switch to DenseItemSet. This may however be
// beneficial when we have few dimensions and many candidates.
// E.g. when length > 32 and dim < 100. But this needs benchmarking!
// For length < 5 and dim > 3000, SparseItemset unsurprisingly was faster
// Scratch item to use for searching.
SparseItemset scratch = new SparseItemset(new int[length - 1]);
for (int i = 0; i < ssize; i++) {
SparseItemset ii = (SparseItemset) supported.get(i);
prefix: for (int j = i + 1; j < ssize; j++) {
SparseItemset ij = (SparseItemset) supported.get(j);
if (!ii.prefixTest(ij)) {
// Prefix doesn't match
break prefix;
}
joined++;
// Test subsets by (re-)using the scratch object: initialize it to ij, then
// successively swap in one item of ii to obtain each remaining subset.
System.arraycopy(ii.indices, 0, scratch.indices, 0, length - 2);
scratch.indices[length - 2] = ij.indices[length - 2];
for (int k = length - 3; k >= 0; k--) {
scratch.indices[k] = ii.indices[k + 1];
int pos = Collections.binarySearch(supported, scratch);
if (pos < 0) {
// Prefix was okay, but one other subset was not frequent
continue prefix;
}
}
int[] items = new int[length];
System.arraycopy(ii.indices, 0, items, 0, length - 1);
items[length - 1] = ij.indices[length - 2];
candidateList.add(new SparseItemset(items));
}
}
} else if (ref instanceof DenseItemset) {
// Scratch item to use for searching.
DenseItemset scratch = new DenseItemset(BitsUtil.zero(dim), length - 1);
for (int i = 0; i < ssize; i++) {
DenseItemset ii = (DenseItemset) supported.get(i);
prefix: for (int j = i + 1; j < ssize; j++) {
DenseItemset ij = (DenseItemset) supported.get(j);
// Prefix test via "|i1 ^ i2| = 2"
System.arraycopy(ii.items, 0, scratch.items, 0, ii.items.length);
BitsUtil.xorI(scratch.items, ij.items);
if (BitsUtil.cardinality(scratch.items) != 2) {
// No prefix match; since sorted, no more can follow!
break prefix;
}
++joined;
// Ensure that the first difference is the last item in ii:
int first = BitsUtil.nextSetBit(scratch.items, 0);
if (BitsUtil.nextSetBit(ii.items, first + 1) > -1) {
// Different overlap by chance?
break prefix;
}
BitsUtil.orI(scratch.items, ij.items);
// Test subsets.
for (int l = length, b = BitsUtil.nextSetBit(scratch.items, 0); l > 2; l--, b = BitsUtil.nextSetBit(scratch.items, b + 1)) {
BitsUtil.clearI(scratch.items, b);
int pos = Collections.binarySearch(supported, scratch);
if (pos < 0) {
continue prefix;
}
BitsUtil.setI(scratch.items, b);
}
candidateList.add(new DenseItemset(scratch.items.clone(), length));
}
}
} else {
throw new InconsistentDataException("Unexpected itemset type " + ref.getClass());
}
if (LOG.isStatistics()) {
// Naive pairwise approach
LOG.statistics(new LongStatistic(STAT + length + "-items.pairwise", (ssize * ((long) ssize - 1))));
LOG.statistics(new LongStatistic(STAT + length + "-items.joined", joined));
LOG.statistics(new LongStatistic(STAT + length + "-items.candidates", candidateList.size()));
}
// Candidates are generated in sorted order, so we do not need to sort here.
return candidateList;
}
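The join and prune steps above are easier to follow on plain sorted int[] itemsets. The following self-contained sketch (a hypothetical class, not the ELKI implementation, which works on SparseItemset/DenseItemset and reuses a scratch object instead of allocating subsets) shows the same idea: join two frequent (k-1)-itemsets that agree on their (k-2)-prefix, then keep the candidate only if every (k-1)-subset is frequent.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

public class AprioriJoinSketch {
  /** Generate length-k candidates from lexicographically sorted frequent (k-1)-itemsets. */
  static List<int[]> aprioriGenerate(List<int[]> frequent, int k) {
    final Comparator<int[]> lex = Arrays::compare;
    List<int[]> candidates = new ArrayList<>();
    for (int i = 0; i < frequent.size(); i++) {
      int[] a = frequent.get(i);
      prefix: for (int j = i + 1; j < frequent.size(); j++) {
        int[] b = frequent.get(j);
        // Join step: a and b must agree on the first k-2 items.
        if (!Arrays.equals(a, 0, k - 2, b, 0, k - 2)) {
          break prefix; // input is sorted, so no later b can match either
        }
        int[] c = Arrays.copyOf(a, k);
        c[k - 1] = b[k - 2];
        // Prune step: every (k-1)-subset of c must itself be frequent.
        // Dropping one of the last two items yields a or b, which are frequent by construction.
        for (int drop = 0; drop < k - 2; drop++) {
          int[] sub = new int[k - 1];
          for (int s = 0, t = 0; s < k; s++) {
            if (s != drop) {
              sub[t++] = c[s];
            }
          }
          if (Collections.binarySearch(frequent, sub, lex) < 0) {
            continue prefix; // one subset is not frequent: prune this candidate
          }
        }
        candidates.add(c);
      }
    }
    return candidates;
  }
}

As in aprioriGenerate above, the lexicographic ordering of the input is what allows the inner loop to stop at the first prefix mismatch.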
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
Class APRIORI, method run.
/**
* Performs the APRIORI algorithm on the given relation.
*
* @param relation the Relation to process
* @return the frequent itemsets found by APRIORI
*/
public FrequentItemsetsResult run(Relation<BitVector> relation) {
DBIDs ids = relation.getDBIDs();
List<Itemset> solution = new ArrayList<>();
final int size = ids.size();
final int needed = getMinimumSupport(size);
// TODO: we don't strictly require a vector field.
// We could work with knowing just the maximum dimensionality beforehand.
VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
if (size > 0) {
final int dim = meta.getDimensionality();
Duration timeone = LOG.newDuration(STAT + "1-items.time").begin();
List<OneItemset> oneitems = buildFrequentOneItemsets(relation, dim, needed);
LOG.statistics(timeone.end());
if (LOG.isStatistics()) {
LOG.statistics(new LongStatistic(STAT + "1-items.frequent", oneitems.size()));
LOG.statistics(new LongStatistic(STAT + "1-items.transactions", ids.size()));
}
if (LOG.isDebuggingFine()) {
LOG.debugFine(debugDumpCandidates(new StringBuilder(), oneitems, meta));
}
if (minlength <= 1) {
solution.addAll(oneitems);
}
if (oneitems.size() >= 2 && maxlength >= 2) {
Duration timetwo = LOG.newDuration(STAT + "2-items.time").begin();
ArrayModifiableDBIDs survivors = DBIDUtil.newArray(ids.size());
List<? extends Itemset> candidates = buildFrequentTwoItemsets(oneitems, relation, dim, needed, ids, survivors);
// Continue with reduced set of transactions.
ids = survivors;
LOG.statistics(timetwo.end());
if (LOG.isStatistics()) {
LOG.statistics(new LongStatistic(STAT + "2-items.frequent", candidates.size()));
LOG.statistics(new LongStatistic(STAT + "2-items.transactions", ids.size()));
}
if (LOG.isDebuggingFine()) {
LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
}
if (minlength <= 2) {
solution.addAll(candidates);
}
for (int length = 3; length <= maxlength && candidates.size() >= length; length++) {
Duration timel = LOG.newDuration(STAT + length + "-items.time").begin();
// Join to get the new candidates
candidates = aprioriGenerate(candidates, length, dim);
if (LOG.isDebuggingFinest()) {
LOG.debugFinest(debugDumpCandidates(new StringBuilder().append("Before pruning: "), candidates, meta));
}
survivors = DBIDUtil.newArray(ids.size());
candidates = frequentItemsets(candidates, relation, needed, ids, survivors, length);
// Continue with reduced set of transactions.
ids = survivors;
LOG.statistics(timel.end());
if (LOG.isStatistics()) {
LOG.statistics(new LongStatistic(STAT + length + "-items.frequent", candidates.size()));
LOG.statistics(new LongStatistic(STAT + length + "-items.transactions", ids.size()));
}
if (LOG.isDebuggingFine()) {
LOG.debugFine(debugDumpCandidates(new StringBuilder(), candidates, meta));
}
solution.addAll(candidates);
}
}
}
return new FrequentItemsetsResult("APRIORI", "apriori", solution, meta, size);
}
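The per-level instrumentation in run() follows a single pattern: time the level with a Duration, then report the frequent-itemset and transaction counts as LongStatistic values behind an isStatistics() guard. A minimal sketch of that pattern follows; the class name and STAT prefix are hypothetical, only the logging calls mirror the code above, and the Duration import path is assumed to sit in the same statistics package as LongStatistic.

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.Duration;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;

public class LevelStatisticsExample {
  // Hypothetical logger and key prefix, following the pattern of APRIORI above.
  private static final Logging LOG = Logging.getLogger(LevelStatisticsExample.class);

  private static final String STAT = "example.apriori.";

  /** Run one level of work and report its runtime and result sizes. */
  void timedLevel(int length, Runnable levelWork, long frequent, long transactions) {
    Duration time = LOG.newDuration(STAT + length + "-items.time").begin();
    levelWork.run(); // e.g. candidate generation and support counting for this length
    LOG.statistics(time.end());
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(STAT + length + "-items.frequent", frequent));
      LOG.statistics(new LongStatistic(STAT + length + "-items.transactions", transactions));
    }
  }
}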
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
Class FPGrowth, method run.
/**
* Run the FP-Growth algorithm
*
* @param db Database to process
* @param relation Bit vector relation
* @return Frequent patterns found
*/
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
// TODO: implement with resizable array, to not need dim.
final int dim = RelationUtil.dimensionality(relation);
final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
// Compute absolute minsupport
final int minsupp = getMinimumSupport(relation.size());
LOG.verbose("Finding item frequencies for ordering.");
final int[] counts = countItemSupport(relation, dim);
// Forward and backward indexes
int[] iidx = new int[dim];
final int[] idx = buildIndex(counts, iidx, minsupp);
final int items = idx.length;
LOG.statistics(new LongStatistic(STAT + "raw-items", dim));
LOG.statistics(new LongStatistic(STAT + "raw-transactions", relation.size()));
LOG.statistics(new DoubleStatistic(STAT + "minsupp-relative", minsupp / (double) relation.size()));
LOG.statistics(new LongStatistic(STAT + "minsupp-absolute", minsupp));
LOG.verbose("Building FP-Tree.");
Duration ctime = LOG.newDuration(STAT + "fp-tree.construction.time").begin();
FPTree tree = buildFPTree(relation, iidx, items);
if (LOG.isStatistics()) {
tree.logStatistics();
}
if (LOG.isDebuggingFinest()) {
StringBuilder buf = new StringBuilder(10000).append("FP-tree:\n");
tree.appendTo(buf, new FPNode.Translator() {
@Override
public StringBuilder appendTo(StringBuilder buf, int i) {
String l = meta.getLabel(idx[i]);
return (l != null) ? buf.append(l) : buf.append(i);
}
});
LOG.debugFinest(buf.toString());
}
// Reduce memory usage:
tree.reduceMemory();
LOG.statistics(ctime.end());
LOG.verbose("Extracting frequent patterns.");
Duration etime = LOG.newDuration(STAT + "fp-growth.extraction.time").begin();
final IndefiniteProgress itemp = LOG.isVerbose() ? new IndefiniteProgress("Frequent itemsets", LOG) : null;
final List<Itemset> solution = new ArrayList<>();
// Start extraction with the least frequent items
tree.extract(minsupp, minlength, maxlength, true, new FPTree.Collector() {
@Override
public void collect(int support, int[] data, int start, int plen) {
// Always translate the indexes back to the original values via 'idx'!
if (plen - start == 1) {
solution.add(new OneItemset(idx[data[start]], support));
LOG.incrementProcessed(itemp);
return;
}
// Copy from buffer to a permanent storage
int[] indices = new int[plen - start];
for (int i = start, j = 0; i < plen; i++) {
// Translate to original items
indices[j++] = idx[data[i]];
}
Arrays.sort(indices);
solution.add(new SparseItemset(indices, support));
LOG.incrementProcessed(itemp);
}
});
LOG.setCompleted(itemp);
Collections.sort(solution);
LOG.statistics(etime.end());
LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
return new FrequentItemsetsResult("FP-Growth", "fp-growth", solution, meta, relation.size());
}
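The forward and backward indexes (idx and iidx) produced by buildIndex encode a frequency-based renumbering of the items: infrequent items are dropped and the remaining ones are mapped to compact indexes. Below is a plain-Java sketch of one way to build such an index; this is an illustrative assumption, and the actual ELKI buildIndex may order or store things differently.

import java.util.Arrays;

public class ItemIndexSketch {
  /**
   * @param counts per-dimension item counts
   * @param iidx output: original dimension to compact index, or -1 if infrequent
   * @param minsupp absolute minimum support
   * @return idx: compact index to original dimension, most frequent first
   */
  static int[] buildIndex(int[] counts, int[] iidx, int minsupp) {
    Integer[] order = new Integer[counts.length];
    for (int i = 0; i < counts.length; i++) {
      order[i] = i;
    }
    // Sort dimensions by descending support, so frequent items come first.
    Arrays.sort(order, (a, b) -> Integer.compare(counts[b], counts[a]));
    int items = 0;
    while (items < counts.length && counts[order[items]] >= minsupp) {
      items++;
    }
    int[] idx = new int[items];
    Arrays.fill(iidx, -1);
    for (int i = 0; i < items; i++) {
      idx[i] = order[i];
      iidx[order[i]] = i;
    }
    return idx;
  }
}

Ordering items by descending support keeps frequent items near the root of the FP-tree, which is also why the extraction above starts from the least frequent items.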
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
Class Eclat, method run.
/**
* Run the Eclat algorithm
*
* @param db Database to process
* @param relation Bit vector relation
* @return Frequent patterns found
*/
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
// TODO: implement with resizable arrays, to not need dim.
final int dim = RelationUtil.dimensionality(relation);
final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
// Compute absolute minsupport
final int minsupp = getMinimumSupport(relation.size());
LOG.verbose("Build 1-dimensional transaction lists.");
Duration ctime = LOG.newDuration(STAT + "eclat.transposition.time").begin();
DBIDs[] idx = buildIndex(relation, dim, minsupp);
LOG.statistics(ctime.end());
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Building frequent itemsets", idx.length, LOG) : null;
Duration etime = LOG.newDuration(STAT + "eclat.extraction.time").begin();
final List<Itemset> solution = new ArrayList<>();
for (int i = 0; i < idx.length; i++) {
LOG.incrementProcessed(prog);
extractItemsets(idx, i, minsupp, solution);
}
LOG.ensureCompleted(prog);
Collections.sort(solution);
LOG.statistics(etime.end());
LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
return new FrequentItemsetsResult("Eclat", "eclat", solution, meta, relation.size());
}
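extractItemsets performs the depth-first Eclat recursion: each frequent itemset is extended by a later item, and the transaction list of the extension is the intersection of the two transaction lists. The following is a compact sketch of that recursion with plain integer transaction ids (a hypothetical class; it assumes every input list already meets minsupp, which the minsupp argument to buildIndex above is meant to ensure).

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class EclatSketch {
  /** tidlists.get(i) is the set of transaction ids containing item i; all lists meet minsupp. */
  static List<int[]> extract(List<Set<Integer>> tidlists, int minsupp) {
    List<int[]> solution = new ArrayList<>();
    int[] prefix = new int[tidlists.size()];
    for (int i = 0; i < tidlists.size(); i++) {
      prefix[0] = i;
      extend(tidlists, tidlists.get(i), prefix, 1, i, minsupp, solution);
    }
    return solution;
  }

  private static void extend(List<Set<Integer>> tidlists, Set<Integer> tids, int[] prefix,
      int depth, int last, int minsupp, List<int[]> solution) {
    solution.add(Arrays.copyOf(prefix, depth)); // the current prefix is a frequent itemset
    for (int j = last + 1; j < tidlists.size(); j++) {
      // Transactions containing the extended itemset = intersection of the tid lists.
      Set<Integer> inter = new HashSet<>(tids);
      inter.retainAll(tidlists.get(j));
      if (inter.size() >= minsupp) {
        prefix[depth] = j;
        extend(tidlists, inter, prefix, depth + 1, j, minsupp, solution);
      }
    }
  }
}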
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
Class NNDescent, method preprocess.
@Override
protected void preprocess() {
final DBIDs ids = relation.getDBIDs();
final long starttime = System.currentTimeMillis();
IndefiniteProgress progress = LOG.isVerbose() ? new IndefiniteProgress("KNNGraph iteration", LOG) : null;
// the query point itself is added at the end, so internally k-1 neighbors are maintained
final int internal_k = k - 1;
// kNN store
store = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNHeap.class);
// store for new reverse neighbors
WritableDataStore<HashSetModifiableDBIDs> newReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
// store for old reverse neighbors
WritableDataStore<HashSetModifiableDBIDs> oldReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
// Sample of new forward neighbors.
WritableDataStore<HashSetModifiableDBIDs> sampleNewNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
// store for flagging which of a point's neighbors are new
WritableDataStore<HashSetModifiableDBIDs> flag = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
// Initialize data structures:
for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
store.put(iditer, DBIDUtil.newHeap(internal_k));
newReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
oldReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
}
// this variable is the sampling size
final int items = (int) Math.ceil(rho * internal_k);
long counter_all = 0;
// initialize neighbors (depends on -setInitialNeighbors option)
for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
// initialize sampled NN
ModifiableDBIDs sampleNew = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
sampleNewNeighbors.put(iditer, DBIDUtil.newHashSet(sampleNew));
// initialize RNN
ModifiableDBIDs sampleRev = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
newReverseNeighbors.put(iditer, DBIDUtil.newHashSet(sampleRev));
// initialize new neighbors
flag.put(iditer, DBIDUtil.newHashSet());
// initialize store
if (!noInitialNeighbors) {
HashSetModifiableDBIDs flags = flag.get(iditer);
for (DBIDIter siter = sampleNew.iter(); siter.valid(); siter.advance()) {
if (add(iditer, siter, distanceQuery.distance(iditer, siter))) {
flags.add(siter);
}
}
counter_all += sampleNew.size();
}
}
final int size = relation.size();
double rate = 0.0;
int iter = 0;
for (; iter < iterations; iter++) {
long counter = 0;
// iterate through dataset
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
// determine new and old neighbors
HashSetModifiableDBIDs newNeighbors = flag.get(iditer);
HashSetModifiableDBIDs oldNeighbors = DBIDUtil.newHashSet();
KNNHeap heap = store.get(iditer);
for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
if (!newNeighbors.contains(heapiter)) {
oldNeighbors.add(heapiter);
}
}
// Sampling
HashSetModifiableDBIDs sampleNew = sampleNewNeighbors.get(iditer);
HashSetModifiableDBIDs newRev = newReverseNeighbors.get(iditer);
newRev.removeDBIDs(sampleNew);
boundSize(newRev, items);
HashSetModifiableDBIDs oldRev = oldReverseNeighbors.get(iditer);
oldRev.removeDBIDs(oldNeighbors);
boundSize(oldRev, items);
counter += processNewNeighbors(flag, sampleNew, oldNeighbors, newRev, oldRev);
}
counter_all += counter;
if (LOG.isStatistics()) {
LOG.statistics(new DoubleStatistic(prefix + ".scan-rate", counter_all * .5 / (size * (size - 1L))));
}
// t is the number of new neighbors
int t = sampleNew(ids, sampleNewNeighbors, flag, items);
// calculate old and new reverse neighbors
clearAll(ids, newReverseNeighbors);
clearAll(ids, oldReverseNeighbors);
reverse(sampleNewNeighbors, newReverseNeighbors, oldReverseNeighbors);
rate = (double) t / (double) (internal_k * size);
if (LOG.isStatistics()) {
LOG.statistics(new DoubleStatistic(prefix + ".update-rate", rate));
}
if (counter < delta * internal_k * size) {
LOG.verbose("KNNGraph terminated because we performaned delta*k*size distance computations.");
break;
}
if (rate < delta) {
LOG.verbose("KNNGraph terminated because update rate got smaller than delta.");
break;
}
LOG.incrementProcessed(progress);
}
if (LOG.isVerbose() && iter == iterations) {
LOG.verbose("KNNGraph terminated because the maximum number of iterations was reached.");
}
LOG.setCompleted(progress);
// convert store to storage
storage = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_DB, KNNList.class);
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
KNNHeap tempHeap = DBIDUtil.newHeap(k);
// Add query point and convert heap to list:
KNNHeap heap = store.get(iditer);
tempHeap.insert(0, iditer);
for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
tempHeap.insert(heapiter.doubleValue(), heapiter);
}
storage.put(iditer, tempHeap.toKNNList());
}
final long end = System.currentTimeMillis();
if (LOG.isStatistics()) {
LOG.statistics(new LongStatistic(prefix + ".construction-time.ms", end - starttime));
}
}
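The reverse(...) step above recomputes, for every point, which other points currently list it among their sampled forward neighbors; these reverse lists are what lets NN-Descent compare a point against its neighbors' neighbors. A minimal sketch of that inversion with plain integer ids (a hypothetical helper, not the DataStore-based ELKI version):

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class ReverseNeighborsSketch {
  /** For every point p with forward neighbor q, record p as a reverse neighbor of q. */
  static Map<Integer, Set<Integer>> reverse(Map<Integer, Set<Integer>> forward) {
    Map<Integer, Set<Integer>> reverse = new HashMap<>();
    for (Map.Entry<Integer, Set<Integer>> e : forward.entrySet()) {
      for (int q : e.getValue()) {
        reverse.computeIfAbsent(q, key -> new HashSet<>()).add(e.getKey());
      }
    }
    return reverse;
  }
}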