Search in sources :

Example 26 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

The following example is from the class AsciiDistanceParser, method parse.

/**
 * Parse a whitespace-separated distance matrix from the given input stream.
 *
 * Each non-comment line must contain exactly three tokens: two integer object
 * ids followed by a distance value, which is stored via the cache writer.
 *
 * @param in Input stream to read from
 * @param cache Distance cache writer receiving (id1, id2, distance) triples
 * @throws IllegalArgumentException on malformed lines or I/O failure
 */
@Override
public void parse(InputStream in, DistanceCacheWriter cache) {
    reader.reset(in);
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Parsing distance matrix", LOG) : null;
    try {
        int id1, id2;
        while (reader.nextLineExceptComments()) {
            if (!tokenizer.valid()) {
                throw new IllegalArgumentException("Less than three values in line " + reader.getLineNumber());
            }
            try {
                id1 = tokenizer.getIntBase10();
                tokenizer.advance();
            } catch (NumberFormatException e) {
                // Preserve the original exception as cause, so the offending token is not lost.
                throw new IllegalArgumentException("Error in line " + reader.getLineNumber() + ": id1 is not an integer!", e);
            }
            if (!tokenizer.valid()) {
                throw new IllegalArgumentException("Less than three values in line " + reader.getLineNumber());
            }
            try {
                id2 = tokenizer.getIntBase10();
                tokenizer.advance();
            } catch (NumberFormatException e) {
                // Preserve the original exception as cause.
                throw new IllegalArgumentException("Error in line " + reader.getLineNumber() + ": id2 is not an integer!", e);
            }
            if (!tokenizer.valid()) {
                throw new IllegalArgumentException("Less than three values in line " + reader.getLineNumber());
            }
            try {
                cache.put(id1, id2, tokenizer.getDouble());
            } catch (IllegalArgumentException e) {
                throw new IllegalArgumentException("Error in line " + reader.getLineNumber() + ":" + e.getMessage(), e);
            }
            tokenizer.advance();
            // Strict format: reject trailing tokens beyond the third value.
            if (tokenizer.valid()) {
                throw new IllegalArgumentException("More than three values in line " + reader.getLineNumber());
            }
            LOG.incrementProcessed(prog);
        }
    } catch (IOException e) {
        // Bug fix: chain the underlying IOException instead of discarding it.
        throw new IllegalArgumentException("Error while parsing line " + reader.getLineNumber() + ".", e);
    }
    LOG.setCompleted(prog);
}
Also used : IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) IOException(java.io.IOException)

Example 27 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

The following example is from the class FPGrowth, method run.

/**
 * Run the FP-Growth algorithm
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
/**
 * Run the FP-Growth algorithm
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
    // TODO: implement with resizable array, to not need dim.
    final int rawdim = RelationUtil.dimensionality(relation);
    final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
    // Compute absolute minsupport
    final int minsupp = getMinimumSupport(relation.size());
    LOG.verbose("Finding item frequencies for ordering.");
    final int[] frequencies = countItemSupport(relation, rawdim);
    // invIndex maps original item positions to compacted positions; index is the inverse.
    int[] invIndex = new int[rawdim];
    final int[] index = buildIndex(frequencies, invIndex, minsupp);
    final int items = index.length;
    LOG.statistics(new LongStatistic(STAT + "raw-items", rawdim));
    LOG.statistics(new LongStatistic(STAT + "raw-transactions", relation.size()));
    LOG.statistics(new DoubleStatistic(STAT + "minsupp-relative", minsupp / (double) relation.size()));
    LOG.statistics(new LongStatistic(STAT + "minsupp-absolute", minsupp));
    LOG.verbose("Building FP-Tree.");
    Duration ctime = LOG.newDuration(STAT + "fp-tree.construction.time").begin();
    FPTree tree = buildFPTree(relation, invIndex, items);
    if (LOG.isStatistics()) {
        tree.logStatistics();
    }
    // Dump the full tree at finest debug level, translating items back to labels.
    if (LOG.isDebuggingFinest()) {
        StringBuilder buf = new StringBuilder(10000).append("FP-tree:\n");
        tree.appendTo(buf, new FPNode.Translator() {

            @Override
            public StringBuilder appendTo(StringBuilder buf, int i) {
                String label = meta.getLabel(index[i]);
                if (label == null) {
                    return buf.append(i);
                }
                return buf.append(label);
            }
        });
        LOG.debugFinest(buf.toString());
    }
    // Reduce memory usage:
    tree.reduceMemory();
    LOG.statistics(ctime.end());
    LOG.verbose("Extracting frequent patterns.");
    Duration etime = LOG.newDuration(STAT + "fp-growth.extraction.time").begin();
    final IndefiniteProgress itemp = LOG.isVerbose() ? new IndefiniteProgress("Frequent itemsets", LOG) : null;
    final List<Itemset> solution = new ArrayList<>();
    // Start extraction with the least frequent items
    tree.extract(minsupp, minlength, maxlength, true, new FPTree.Collector() {

        @Override
        public void collect(int support, int[] data, int start, int plen) {
            // Always translate the indexes back to the original values via 'index'!
            final int len = plen - start;
            if (len == 1) {
                // Single-item pattern: compact representation.
                solution.add(new OneItemset(index[data[start]], support));
            } else {
                // Copy from the shared buffer into permanent, sorted storage.
                int[] translated = new int[len];
                for (int j = 0; j < len; j++) {
                    translated[j] = index[data[start + j]];
                }
                Arrays.sort(translated);
                solution.add(new SparseItemset(translated, support));
            }
            LOG.incrementProcessed(itemp);
        }
    });
    LOG.setCompleted(itemp);
    Collections.sort(solution);
    LOG.statistics(etime.end());
    LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
    return new FrequentItemsetsResult("FP-Growth", "fp-growth", solution, meta, relation.size());
}
Also used : BitVector(de.lmu.ifi.dbs.elki.data.BitVector) ArrayList(java.util.ArrayList) Duration(de.lmu.ifi.dbs.elki.logging.statistics.Duration) FrequentItemsetsResult(de.lmu.ifi.dbs.elki.result.FrequentItemsetsResult) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 28 with IndefiniteProgress

use of de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress in project elki by elki-project.

The following example is from the class NNDescent, method preprocess.

@Override
/**
 * Build an approximate kNN graph using the NN-Descent iteration.
 *
 * Iteratively refines per-object kNN heaps by comparing sampled new/old
 * forward and reverse neighbors, until the update rate drops below delta,
 * too few distance computations occur, or the iteration limit is reached.
 * The result is materialized into {@code storage} with each query point
 * added to its own neighbor list at distance 0.
 */
@Override
protected void preprocess() {
    final DBIDs ids = relation.getDBIDs();
    final long starttime = System.currentTimeMillis();
    IndefiniteProgress progress = LOG.isVerbose() ? new IndefiniteProgress("KNNGraph iteration", LOG) : null;
    // to add query point itself in the end, internally (k-1) is used
    final int internal_k = k - 1;
    // kNN store
    store = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNHeap.class);
    // store for new reverse neighbors
    WritableDataStore<HashSetModifiableDBIDs> newReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // store for old reverse neighbors (fixed copy-paste comment: this holds the OLD set)
    WritableDataStore<HashSetModifiableDBIDs> oldReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // Sample of new forward neighbors.
    WritableDataStore<HashSetModifiableDBIDs> sampleNewNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // data structures for new and sampled new neighbors
    WritableDataStore<HashSetModifiableDBIDs> flag = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
    // Initialize data structures:
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
        store.put(iditer, DBIDUtil.newHeap(internal_k));
        newReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
        oldReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
    }
    // this variable is the sampling size
    final int items = (int) Math.ceil(rho * internal_k);
    long counter_all = 0;
    // initialize neighbors (depends on -setInitialNeighbors option)
    for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
        // initialize sampled NN
        ModifiableDBIDs sampleNew = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
        sampleNewNeighbors.put(iditer, DBIDUtil.newHashSet(sampleNew));
        // initialize RNN
        ModifiableDBIDs sampleRev = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
        newReverseNeighbors.put(iditer, DBIDUtil.newHashSet(sampleRev));
        // initialize new neighbors
        flag.put(iditer, DBIDUtil.newHashSet());
        // initialize store
        if (!noInitialNeighbors) {
            HashSetModifiableDBIDs flags = flag.get(iditer);
            for (DBIDIter siter = sampleNew.iter(); siter.valid(); siter.advance()) {
                // add() returns true when the candidate entered the heap; mark it "new".
                if (add(iditer, siter, distanceQuery.distance(iditer, siter))) {
                    flags.add(siter);
                }
            }
            counter_all += sampleNew.size();
        }
    }
    final int size = relation.size();
    double rate = 0.0;
    int iter = 0;
    for (; iter < iterations; iter++) {
        long counter = 0;
        // iterate through dataset
        for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            // determine new and old neighbors
            HashSetModifiableDBIDs newNeighbors = flag.get(iditer);
            HashSetModifiableDBIDs oldNeighbors = DBIDUtil.newHashSet();
            KNNHeap heap = store.get(iditer);
            for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
                if (!newNeighbors.contains(heapiter)) {
                    oldNeighbors.add(heapiter);
                }
            }
            // Sampling: restrict reverse neighbor sets to at most 'items' entries.
            HashSetModifiableDBIDs sampleNew = sampleNewNeighbors.get(iditer);
            HashSetModifiableDBIDs newRev = newReverseNeighbors.get(iditer);
            newRev.removeDBIDs(sampleNew);
            boundSize(newRev, items);
            HashSetModifiableDBIDs oldRev = oldReverseNeighbors.get(iditer);
            oldRev.removeDBIDs(oldNeighbors);
            boundSize(oldRev, items);
            counter += processNewNeighbors(flag, sampleNew, oldNeighbors, newRev, oldRev);
        }
        counter_all += counter;
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(prefix + ".scan-rate", counter_all * .5 / (size * (size - 1L))));
        }
        // t is the number of new neighbors
        int t = sampleNew(ids, sampleNewNeighbors, flag, items);
        // calculate old and new reverse neighbors
        clearAll(ids, newReverseNeighbors);
        clearAll(ids, oldReverseNeighbors);
        reverse(sampleNewNeighbors, newReverseNeighbors, oldReverseNeighbors);
        rate = (double) t / (double) (internal_k * size);
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(prefix + ".update-rate", rate));
        }
        // Convergence check 1: too few distance computations in this iteration.
        if (counter < delta * internal_k * size) {
            // Fixed typo in log message ("performaned"), and clarified the condition.
            LOG.verbose("KNNGraph terminated because we performed less than delta*k*size distance computations.");
            break;
        }
        // Convergence check 2: update rate dropped below delta.
        if (rate < delta) {
            LOG.verbose("KNNGraph terminated because update rate got smaller than delta.");
            break;
        }
        LOG.incrementProcessed(progress);
    }
    if (LOG.isVerbose() && iter == iterations) {
        LOG.verbose("KNNGraph terminated because the maximum number of iterations was reached.");
    }
    LOG.setCompleted(progress);
    // convert store to storage
    storage = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_DB, KNNList.class);
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        KNNHeap tempHeap = DBIDUtil.newHeap(k);
        // Add query point and convert heap to list:
        KNNHeap heap = store.get(iditer);
        tempHeap.insert(0, iditer);
        for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
            tempHeap.insert(heapiter.doubleValue(), heapiter);
        }
        storage.put(iditer, tempHeap.toKNNList());
    }
    final long end = System.currentTimeMillis();
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(prefix + ".construction-time.ms", end - starttime));
    }
}
Also used : DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Aggregations

IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)28 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)20 ArrayList (java.util.ArrayList)16 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)15 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)14 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)14 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)13 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)11 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)11 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)8 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)7 Model (de.lmu.ifi.dbs.elki.data.model.Model)4 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)4 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)4 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)3 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)3 AbstractProjectedClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedClustering)2 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)2 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)2