Use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap in project elki by elki-project.
Class DistanceQuantileSampler, method run.
/**
 * Run the distance quantile sampler.
 *
 * Draws random pairs of distinct objects from the relation, computes their
 * distance, and reports the configured quantile of the sampled distances.
 *
 * @param database Database (not read by this method)
 * @param rel Relation to draw object pairs from
 * @return Distances sample: a single row containing the quantile distance
 * @throws AbortException if the sample size exceeds {@link Integer#MAX_VALUE},
 *         or if samples are requested from a relation with fewer than two
 *         objects
 */
public CollectionResult<double[]> run(Database database, Relation<O> rel) {
  DistanceQuery<O> dq = rel.getDistanceQuery(getDistanceFunction());
  int size = rel.size();
  // size^2 / 2, computed in long arithmetic to avoid int overflow.
  long pairs = (size * (long) size) >> 1;
  // A sampling value <= 1 is a fraction of the pairs, larger values are an
  // absolute number of samples.
  final long ssize = sampling <= 1 ? (long) Math.ceil(sampling * pairs) : (long) sampling;
  if (ssize > Integer.MAX_VALUE) {
    throw new AbortException("Sampling size too large.");
  }
  // Drawing a pair of distinct objects needs at least two objects; without
  // this guard, Random.nextInt below fails with a non-positive bound.
  if (ssize > 0 && size < 2) {
    throw new AbortException("Cannot sample distances from a relation with fewer than two objects.");
  }
  final int qsize = quantile <= 0 ? 1 : (int) Math.ceil(quantile * ssize);
  // NOTE(review): add(dist, qsize) presumably bounds the heap to the qsize
  // smallest distances, so that peek() yields the quantile - confirm against
  // DoubleMaxHeap.
  DoubleMaxHeap heap = new DoubleMaxHeap(qsize);
  ArrayDBIDs ids = DBIDUtil.ensureArray(rel.getDBIDs());
  DBIDArrayIter i1 = ids.iter(), i2 = ids.iter();
  Random r = rand.getSingleThreadedRandom();
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Sampling", (int) ssize, LOG) : null;
  for (long i = 0; i < ssize; i++) {
    // x in [1, size - 1], y in [0, x - 1]: always two different positions.
    int x = r.nextInt(size - 1) + 1, y = r.nextInt(x);
    double dist = dq.distance(i1.seek(x), i2.seek(y));
    // Skip NaN, and/or zeros - but still count the draw as processed, so the
    // progress reflects the number of samples drawn.
    if (!Double.isNaN(dist) && !(nozeros && dist < Double.MIN_NORMAL)) {
      heap.add(dist, qsize);
    }
    LOG.incrementProcessed(prog);
  }
  // NOTE(review): if every draw was skipped (or ssize is 0) the heap is empty;
  // what peek() returns in that case is not visible here - confirm.
  final double quantileDistance = heap.peek();
  LOG.statistics(new DoubleStatistic(PREFIX + ".quantile", quantile));
  LOG.statistics(new LongStatistic(PREFIX + ".samplesize", ssize));
  LOG.statistics(new DoubleStatistic(PREFIX + ".distance", quantileDistance));
  LOG.ensureCompleted(prog);
  Collection<String> header = Arrays.asList("Distance");
  Collection<double[]> data = Arrays.asList(new double[][] { new double[] { quantileDistance } });
  return new CollectionResult<double[]>("Distances sample", "distance-sample", data, header);
}
Use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMaxHeap in project elki by elki-project.
Class EvaluateCIndex, method evaluateClustering.
/**
 * Compute the C-Index of one clustering and attach it to the result
 * hierarchy of the database.
 *
 * @param db Database whose result hierarchy receives the evaluation result
 * @param rel Data relation
 * @param dq Distance query, handed on to the cluster processing helpers
 * @param c Clustering to evaluate
 * @return C-Index
 */
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
  final List<? extends Cluster<?>> clusters = c.getAllClusters();

  // First pass: count the ignored noise objects and the number of
  // within-cluster pairs.
  int skippedNoise = 0;
  int numWithin = 0;
  for (Cluster<?> current : clusters) {
    final int csize = current.size();
    boolean countPairs = true;
    if (csize <= 1 || current.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        // Dropped entirely, only remembered for the statistics.
        skippedNoise += csize;
        countPairs = false;
        break;
      case TREAT_NOISE_AS_SINGLETONS:
        // Singletons contribute no within-cluster distances.
        countPairs = false;
        break;
      case MERGE_NOISE:
        // Counted like any other cluster.
        break;
      default:
        LOG.warning("Unknown noise handling option: " + noiseOption);
      }
    }
    if (countPairs) {
      numWithin += (csize * (csize - 1)) >>> 1;
    }
  }

  // TODO: for small k=2, and balanced clusters, it may be more efficient to
  // just build a long array with all distances, and select the quantiles.
  // The heaps used below pay off in memory consumption for k > 2

  // The heap types are swapped on purpose: the collection of the largest
  // distances is a min heap, because the element to replace is the smallest
  // of the current largest distances (and the other way round).
  final DoubleHeap largest = new DoubleMinHeap(numWithin);
  final DoubleHeap smallest = new DoubleMaxHeap(numWithin);

  // Second pass: accumulate the sum of within-cluster distances.
  double withinSum = 0.;
  final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null;
  for (int idx = 0; idx < clusters.size(); idx++) {
    final Cluster<?> current = clusters.get(idx);
    boolean asCluster = true;
    if (current.size() <= 1 || current.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        // Nothing to compute for ignored noise.
        asCluster = false;
        break;
      case TREAT_NOISE_AS_SINGLETONS:
        processSingleton(current, rel, dq, largest, smallest, numWithin);
        asCluster = false;
        break;
      case MERGE_NOISE:
        // Handled like a regular cluster below.
        break;
      }
    }
    if (asCluster) {
      withinSum += processCluster(current, clusters, idx, dq, largest, smallest, numWithin);
    }
    LOG.incrementProcessed(progress);
  }
  LOG.ensureCompleted(progress);

  // Best and worst case: sums over the collected smallest and largest
  // distances.
  assert (smallest.size() == numWithin);
  assert (largest.size() == numWithin);
  double lower = 0;
  DoubleHeap.UnsortedIter lowIter = smallest.unsortedIter();
  while (lowIter.valid()) {
    lower += lowIter.get();
    lowIter.advance();
  }
  double upper = 0;
  DoubleHeap.UnsortedIter highIter = largest.unsortedIter();
  while (highIter.valid()) {
    upper += highIter.get();
    highIter.advance();
  }
  assert (upper >= lower);

  // Normalize the within-cluster sum; a degenerate range yields 1.
  final double cIndex;
  if (upper > lower) {
    cIndex = (withinSum - lower) / (upper - lower);
  }
  else {
    cIndex = 1.;
  }

  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
    if (skippedNoise > 0) {
      LOG.statistics(new LongStatistic(key + ".c-index.ignored", skippedNoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
  }

  // Publish the measure and notify the hierarchy of the change.
  final EvaluationResult evaluation = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  final MeasurementGroup group = evaluation.findOrCreateGroup("Distance-based Evaluation");
  group.addMeasure("C-Index", cIndex, 0., 1., 0., true);
  db.getHierarchy().resultChanged(evaluation);
  return cIndex;
}
Aggregations