Use of de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic in project elki by elki-project.
The class EvaluateSquaredErrors, method evaluateClustering.
/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return ssq
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
  boolean square = !distance.isSquared();
  int ignorednoise = 0;
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  double ssq = 0, sum = 0;
  for (Cluster<?> cluster : clusters) {
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseOption) {
        case IGNORE_NOISE:
          ignorednoise += cluster.size();
          continue;
        case TREAT_NOISE_AS_SINGLETONS:
          continue;
        case MERGE_NOISE:
          // Treat as cluster below:
          break;
      }
    }
    NumberVector center = ModelUtil.getPrototypeOrCentroid(cluster.getModel(), rel, cluster.getIDs());
    for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
      final double d = distance.distance(center, rel.get(it1));
      sum += d;
      ssq += square ? d * d : d;
    }
  }
  final int div = Math.max(1, rel.size() - ignorednoise);
  if (LOG.isStatistics()) {
    LOG.statistics(new DoubleStatistic(key + ".mean", sum / div));
    LOG.statistics(new DoubleStatistic(key + ".ssq", ssq));
    LOG.statistics(new DoubleStatistic(key + ".rmsd", FastMath.sqrt(ssq / div)));
  }
  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  g.addMeasure("Mean distance", sum / div, 0., Double.POSITIVE_INFINITY, true);
  g.addMeasure("Sum of Squares", ssq, 0., Double.POSITIVE_INFINITY, true);
  g.addMeasure("RMSD", FastMath.sqrt(ssq / div), 0., Double.POSITIVE_INFINITY, true);
  db.getHierarchy().add(c, ev);
  return ssq;
}
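The guarded logging pattern above (check LOG.isStatistics() before constructing DoubleStatistic objects) is the idiom this page illustrates. As a minimal sketch, assuming the usual per-class Logging.getLogger factory, it can be reused like this; the class name StatisticsDemo and the statistic key are hypothetical:

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;

public class StatisticsDemo {
  // Per-class logger, as used throughout ELKI.
  private static final Logging LOG = Logging.getLogger(StatisticsDemo.class);

  /**
   * Log the root mean squared deviation, guarded so that no statistic
   * objects are allocated when statistics logging is disabled.
   */
  public static void logRMSD(double ssq, int count) {
    if (LOG.isStatistics()) {
      LOG.statistics(new DoubleStatistic(StatisticsDemo.class.getName() + ".rmsd", Math.sqrt(ssq / Math.max(1, count))));
    }
  }
}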
Use of de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic in project elki by elki-project.
The class FPGrowth, method run.
/**
 * Run the FP-Growth algorithm
 *
 * @param db Database to process
 * @param relation Bit vector relation
 * @return Frequent patterns found
 */
public FrequentItemsetsResult run(Database db, final Relation<BitVector> relation) {
  // TODO: implement with resizable array, to not need dim.
  final int dim = RelationUtil.dimensionality(relation);
  final VectorFieldTypeInformation<BitVector> meta = RelationUtil.assumeVectorField(relation);
  // Compute absolute minsupport
  final int minsupp = getMinimumSupport(relation.size());
  LOG.verbose("Finding item frequencies for ordering.");
  final int[] counts = countItemSupport(relation, dim);
  // Forward and backward indexes
  int[] iidx = new int[dim];
  final int[] idx = buildIndex(counts, iidx, minsupp);
  final int items = idx.length;
  LOG.statistics(new LongStatistic(STAT + "raw-items", dim));
  LOG.statistics(new LongStatistic(STAT + "raw-transactions", relation.size()));
  LOG.statistics(new DoubleStatistic(STAT + "minsupp-relative", minsupp / (double) relation.size()));
  LOG.statistics(new LongStatistic(STAT + "minsupp-absolute", minsupp));
  LOG.verbose("Building FP-Tree.");
  Duration ctime = LOG.newDuration(STAT + "fp-tree.construction.time").begin();
  FPTree tree = buildFPTree(relation, iidx, items);
  if (LOG.isStatistics()) {
    tree.logStatistics();
  }
  if (LOG.isDebuggingFinest()) {
    StringBuilder buf = new StringBuilder(10000).append("FP-tree:\n");
    tree.appendTo(buf, new FPNode.Translator() {
      @Override
      public StringBuilder appendTo(StringBuilder buf, int i) {
        String l = meta.getLabel(idx[i]);
        return (l != null) ? buf.append(l) : buf.append(i);
      }
    });
    LOG.debugFinest(buf.toString());
  }
  // Reduce memory usage:
  tree.reduceMemory();
  LOG.statistics(ctime.end());
  LOG.verbose("Extracting frequent patterns.");
  Duration etime = LOG.newDuration(STAT + "fp-growth.extraction.time").begin();
  final IndefiniteProgress itemp = LOG.isVerbose() ? new IndefiniteProgress("Frequent itemsets", LOG) : null;
  final List<Itemset> solution = new ArrayList<>();
  // Start extraction with the least frequent items
  tree.extract(minsupp, minlength, maxlength, true, new FPTree.Collector() {
    @Override
    public void collect(int support, int[] data, int start, int plen) {
      // Always translate the indexes back to the original values via 'idx'!
      if (plen - start == 1) {
        solution.add(new OneItemset(idx[data[start]], support));
        LOG.incrementProcessed(itemp);
        return;
      }
      // Copy from buffer to a permanent storage
      int[] indices = new int[plen - start];
      for (int i = start, j = 0; i < plen; i++) {
        // Translate to original items
        indices[j++] = idx[data[i]];
      }
      Arrays.sort(indices);
      solution.add(new SparseItemset(indices, support));
      LOG.incrementProcessed(itemp);
    }
  });
  LOG.setCompleted(itemp);
  Collections.sort(solution);
  LOG.statistics(etime.end());
  LOG.statistics(new LongStatistic(STAT + "frequent-itemsets", solution.size()));
  return new FrequentItemsetsResult("FP-Growth", "fp-growth", solution, meta, relation.size());
}
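The run method above logs the minimum support twice, once as a relative frequency (DoubleStatistic) and once as an absolute transaction count (LongStatistic). As a rough sketch of that relationship, and assuming the common convention that a relative threshold is rounded up to the next integer count (the actual getMinimumSupport is not shown above), the conversion could look like this; the class name MinSupportDemo and the statistic keys are hypothetical:

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;

public class MinSupportDemo {
  private static final Logging LOG = Logging.getLogger(MinSupportDemo.class);

  /**
   * Convert a relative minimum support into an absolute count and log both,
   * mirroring the "minsupp-relative" / "minsupp-absolute" statistics above.
   */
  public static int absoluteSupport(double relative, int transactions) {
    // Round up, so a pattern must occur in at least this many transactions.
    int minsupp = (int) Math.ceil(relative * transactions);
    LOG.statistics(new DoubleStatistic("demo.minsupp-relative", minsupp / (double) transactions));
    LOG.statistics(new LongStatistic("demo.minsupp-absolute", minsupp));
    return minsupp;
  }
}

For example, with relative = 0.05 and 1000 transactions, this yields an absolute support of 50.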
Use of de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic in project elki by elki-project.
The class NNDescent, method preprocess.
@Override
protected void preprocess() {
  final DBIDs ids = relation.getDBIDs();
  final long starttime = System.currentTimeMillis();
  IndefiniteProgress progress = LOG.isVerbose() ? new IndefiniteProgress("KNNGraph iteration", LOG) : null;
  // to add the query point itself at the end, internally (k-1) is used
  final int internal_k = k - 1;
  // kNN store
  store = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNHeap.class);
  // store for new reverse neighbors
  WritableDataStore<HashSetModifiableDBIDs> newReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
  // store for old reverse neighbors
  WritableDataStore<HashSetModifiableDBIDs> oldReverseNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
  // Sample of new forward neighbors.
  WritableDataStore<HashSetModifiableDBIDs> sampleNewNeighbors = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
  // Flags marking the new (not yet expanded) neighbors of each point
  WritableDataStore<HashSetModifiableDBIDs> flag = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, HashSetModifiableDBIDs.class);
  // Initialize data structures:
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    store.put(iditer, DBIDUtil.newHeap(internal_k));
    newReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
    oldReverseNeighbors.put(iditer, DBIDUtil.newHashSet());
  }
  // this variable is the sampling size
  final int items = (int) Math.ceil(rho * internal_k);
  long counter_all = 0;
  // initialize neighbors (depends on -setInitialNeighbors option)
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    // initialize sampled NN
    ModifiableDBIDs sampleNew = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
    sampleNewNeighbors.put(iditer, DBIDUtil.newHashSet(sampleNew));
    // initialize RNN
    ModifiableDBIDs sampleRev = DBIDUtil.randomSampleExcept(ids, iditer, items, rnd);
    newReverseNeighbors.put(iditer, DBIDUtil.newHashSet(sampleRev));
    // initialize new neighbors
    flag.put(iditer, DBIDUtil.newHashSet());
    // initialize store
    if (!noInitialNeighbors) {
      HashSetModifiableDBIDs flags = flag.get(iditer);
      for (DBIDIter siter = sampleNew.iter(); siter.valid(); siter.advance()) {
        if (add(iditer, siter, distanceQuery.distance(iditer, siter))) {
          flags.add(siter);
        }
      }
      counter_all += sampleNew.size();
    }
  }
  final int size = relation.size();
  double rate = 0.0;
  int iter = 0;
  for (; iter < iterations; iter++) {
    long counter = 0;
    // iterate through dataset
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      // determine new and old neighbors
      HashSetModifiableDBIDs newNeighbors = flag.get(iditer);
      HashSetModifiableDBIDs oldNeighbors = DBIDUtil.newHashSet();
      KNNHeap heap = store.get(iditer);
      for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
        if (!newNeighbors.contains(heapiter)) {
          oldNeighbors.add(heapiter);
        }
      }
      // Sampling
      HashSetModifiableDBIDs sampleNew = sampleNewNeighbors.get(iditer);
      HashSetModifiableDBIDs newRev = newReverseNeighbors.get(iditer);
      newRev.removeDBIDs(sampleNew);
      boundSize(newRev, items);
      HashSetModifiableDBIDs oldRev = oldReverseNeighbors.get(iditer);
      oldRev.removeDBIDs(oldNeighbors);
      boundSize(oldRev, items);
      counter += processNewNeighbors(flag, sampleNew, oldNeighbors, newRev, oldRev);
    }
    counter_all += counter;
    if (LOG.isStatistics()) {
      LOG.statistics(new DoubleStatistic(prefix + ".scan-rate", counter_all * .5 / (size * (size - 1L))));
    }
    // t is the number of new neighbors
    int t = sampleNew(ids, sampleNewNeighbors, flag, items);
    // calculate old and new reverse neighbors
    clearAll(ids, newReverseNeighbors);
    clearAll(ids, oldReverseNeighbors);
    reverse(sampleNewNeighbors, newReverseNeighbors, oldReverseNeighbors);
    rate = (double) t / (double) (internal_k * size);
    if (LOG.isStatistics()) {
      LOG.statistics(new DoubleStatistic(prefix + ".update-rate", rate));
    }
    if (counter < delta * internal_k * size) {
      LOG.verbose("KNNGraph terminated because fewer than delta*k*size distance computations were performed.");
      break;
    }
    if (rate < delta) {
      LOG.verbose("KNNGraph terminated because the update rate got smaller than delta.");
      break;
    }
    LOG.incrementProcessed(progress);
  }
  if (LOG.isVerbose() && iter == iterations) {
    LOG.verbose("KNNGraph terminated because the maximum number of iterations was reached.");
  }
  LOG.setCompleted(progress);
  // convert store to storage
  storage = DataStoreFactory.FACTORY.makeStorage(ids, DataStoreFactory.HINT_DB, KNNList.class);
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    KNNHeap tempHeap = DBIDUtil.newHeap(k);
    // Add query point and convert heap to list:
    KNNHeap heap = store.get(iditer);
    tempHeap.insert(0, iditer);
    for (DoubleDBIDListIter heapiter = heap.unorderedIterator(); heapiter.valid(); heapiter.advance()) {
      tempHeap.insert(heapiter.doubleValue(), heapiter);
    }
    storage.put(iditer, tempHeap.toKNNList());
  }
  final long end = System.currentTimeMillis();
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(prefix + ".construction-time.ms", end - starttime));
  }
}
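preprocess measures the construction time with System.currentTimeMillis() and reports it as a LongStatistic, while the FP-Growth example above uses the Duration API (LOG.newDuration(...).begin() / end()) for the same purpose. A minimal sketch of the Duration variant, with the class name TimingDemo and the statistic key made up for illustration:

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.Duration;

public class TimingDemo {
  private static final Logging LOG = Logging.getLogger(TimingDemo.class);

  /**
   * Run some work and log its wall-clock duration, using the same
   * begin()/end() pattern as the FP-Growth example above.
   */
  public static void timed(Runnable work) {
    Duration timer = LOG.newDuration("demo.construction.time").begin();
    work.run();
    LOG.statistics(timer.end());
  }
}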