use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.
the class LBABOD method run.
/**
* Run LB-ABOD on the data set.
*
* <p>
* The method works in two phases. First, for every object a lower-bound
* score ("LB-ABOF") is computed from its {@code k} nearest neighbors plus a
* remainder term, and stored as a candidate. Second, the candidates are
* visited in sorted order and their scores are replaced by the value of
* {@code computeABOF}, until the stop condition on the heap of refined scores
* is met. Objects that are never refined keep their lower-bound score.
*
* @param db Database, used to obtain the similarity query for the kernel
* @param relation Relation to process
* @return Outlier detection result
*/
@Override
public OutlierResult run(Database db, Relation<V> relation) {
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
// Two reusable array iterators; pB drives the inner loop of phase one, and
// both are handed to computeABOF in phase two.
DBIDArrayIter pB = ids.iter(), pC = ids.iter();
SimilarityQuery<V> sq = db.getSimilarityQuery(relation, kernelFunction);
KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
// Output storage.
WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
DoubleMinMax minmaxabod = new DoubleMinMax();
// Largest lower-bound score seen in phase one (starts at 0, so negative
// lower bounds never become the maximum).
double max = 0.;
// Storage for squared distances (will be reused!)
// NOTE(review): written below but never read within this method.
WritableDoubleDataStore sqDists = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT);
// Nearest neighbor heap (will be reused!)
KNNHeap nn = DBIDUtil.newHeap(k);
// Priority queue for candidates
ModifiableDoubleDBIDList candidates = DBIDUtil.newDistanceDBIDList(relation.size());
// get Candidate Ranking
for (DBIDIter pA = relation.iterDBIDs(); pA.valid(); pA.advance()) {
// Compute nearest neighbors and distances.
nn.clear();
double simAA = kernelMatrix.getSimilarity(pA, pA);
// Sum of 1./(|AB|) and 1./(|AB|^2); for computing R2.
double sumid = 0., sumisqd = 0.;
for (pB.seek(0); pB.valid(); pB.advance()) {
// Skip the query object itself.
if (DBIDUtil.equal(pB, pA)) {
continue;
}
double simBB = kernelMatrix.getSimilarity(pB, pB);
double simAB = kernelMatrix.getSimilarity(pA, pB);
// Squared distance expressed via similarities: <A,A> + <B,B> - 2 <A,B>
double sqdAB = simAA + simBB - simAB - simAB;
sqDists.putDouble(pB, sqdAB);
// NOTE(review): no guard against sqdAB == 0 here (the neighbor loops below
// do skip non-positive distances) - confirm that duplicates cannot make
// the sums infinite.
final double isqdAB = 1. / sqdAB;
sumid += FastMath.sqrt(isqdAB);
sumisqd += isqdAB;
// Update heap
nn.insert(sqdAB, pB);
}
// Compute FastABOD approximation, adjust for lower bound.
// LB-ABOF is defined via a numerically unstable formula.
// Variance as E(X^2)-E(X)^2 suffers from catastrophic cancellation!
// TODO: ensure numerical precision!
double nnsum = 0., nnsumsq = 0., nnsumisqd = 0.;
KNNList nl = nn.toKNNList();
// Iterate over all unordered pairs (B, C) of neighbors: iC always starts one
// position after iB.
DoubleDBIDListIter iB = nl.iter(), iC = nl.iter();
for (; iB.valid(); iB.advance()) {
double sqdAB = iB.doubleValue();
double simAB = kernelMatrix.getSimilarity(pA, iB);
// Skip zero, negative and NaN squared distances.
if (!(sqdAB > 0.)) {
continue;
}
for (iC.seek(iB.getOffset() + 1); iC.valid(); iC.advance()) {
double sqdAC = iC.doubleValue();
double simAC = kernelMatrix.getSimilarity(pA, iC);
// Skip zero, negative and NaN squared distances.
if (!(sqdAC > 0.)) {
continue;
}
// Exploit bilinearity of scalar product:
// <B-A, C-A> = <B, C-A> - <A,C-A>
// = <B,C> - <B,A> - <A,C> + <A,A>
double simBC = kernelMatrix.getSimilarity(iB, iC);
double numerator = simBC - simAB - simAC + simAA;
// sqweight = 1 / (|AB|^2 |AC|^2), weight = 1 / (|AB| |AC|)
double sqweight = 1. / (sqdAB * sqdAC);
double weight = FastMath.sqrt(sqweight);
double val = numerator * sqweight;
// Weighted first and second moment over the neighbor pairs.
nnsum += val * weight;
nnsumsq += val * val * weight;
nnsumisqd += sqweight;
}
}
// Remaining weight, term R2:
double r2 = sumisqd * sumisqd - 2. * nnsumisqd;
double tmp = (2. * nnsum + r2) / (sumid * sumid);
double lbabof = 2. * nnsumsq / (sumid * sumid) - tmp * tmp;
// Track the maximum lower-bound score.
if (lbabof > max) {
max = lbabof;
}
// Preliminary score; overwritten in phase two if this object is refined.
abodvalues.putDouble(pA, lbabof);
candidates.add(lbabof, pA);
}
// Put maximum from approximate values.
minmaxabod.put(max);
// presumably sorts ascending by lower-bound score - verify against the list
// implementation.
candidates.sort();
// refine Candidates
int refinements = 0;
// NOTE(review): the heap is created with capacity l, but its size is
// compared against k below - confirm which of the two parameters is meant.
DoubleMinHeap topscores = new DoubleMinHeap(l);
MeanVariance s = new MeanVariance();
for (DoubleDBIDListIter pA = candidates.iter(); pA.valid(); pA.advance()) {
// Stop refining
if (topscores.size() >= k && pA.doubleValue() > topscores.peek()) {
break;
}
final double abof = computeABOF(kernelMatrix, pA, pB, pC, s);
// Store refined score:
abodvalues.putDouble(pA, abof);
minmaxabod.put(abof);
// Update the heap tracking the top scores.
if (topscores.size() < k) {
topscores.add(abof);
} else {
if (topscores.peek() > abof) {
topscores.replaceTopElement(abof);
}
}
refinements += 1;
}
if (LOG.isStatistics()) {
// NOTE(review): this changes the logging configuration as a side effect of
// running the algorithm - looks like a debugging leftover, confirm.
LoggingConfiguration.setVerbose(Level.VERYVERBOSE);
LOG.statistics(new LongStatistic("lb-abod.refinements", refinements));
}
// Build result representation.
DoubleRelation scoreResult = new MaterializedDoubleRelation("Angle-based Outlier Detection", "abod-outlier", abodvalues, ids);
OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
return new OutlierResult(scoreMeta, scoreResult);
}
use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.
the class SURFINGDependenceMeasure method dependence.
/**
 * Compute the SURFING dependence score of two attributes.
 *
 * <p>
 * For every point the Euclidean distance to its k-th nearest other point in
 * the two-dimensional projection is determined, with
 * {@code k = max(1, len / 10)}. The score is the summed absolute deviation
 * of these k-distances from their mean, divided by
 * {@code 2 * mean * below}, where {@code below} is the number of k-distances
 * smaller than the mean.
 *
 * @param adapter1 Array adapter for the first attribute
 * @param data1 Data of the first attribute
 * @param adapter2 Array adapter for the second attribute
 * @param data2 Data of the second attribute
 * @param <A> Array type of the first attribute
 * @param <B> Array type of the second attribute
 * @return Dependence score; 0 if there are fewer than two points or no
 *         k-distance lies below the mean
 */
@Reference(//
    authors = "Christian Baumgartner, Claudia Plant, Karin Kailing, Hans-Peter Kriegel, and Peer Kröger", //
    title = "Subspace Selection for Clustering High-Dimensional Data", //
    booktitle = "IEEE International Conference on Data Mining, 2004", url = "http://dx.doi.org/10.1109/ICDM.2004.10112")
@Override
public <A, B> double dependence(NumberArrayAdapter<?, A> adapter1, A data1, NumberArrayAdapter<?, B> adapter2, B data2) {
  final int len = size(adapter1, data1, adapter2, data2);
  // With fewer than two points there is no neighbor to measure a distance to.
  if (len < 2) {
    return 0.;
  }
  final int k = Math.max(1, len / 10);
  double[] knns = new double[len];
  // Squared distances from the current point to all other points (reused).
  double[] sqdists = new double[len - 1];
  double kdistmean = 0.;
  for (int i = 0; i < len; ++i) {
    final double ix = adapter1.getDouble(data1, i), iy = adapter2.getDouble(data2, i);
    int n = 0;
    for (int j = 0; j < len; ++j) {
      // Skip the point itself: its distance is always 0 and would otherwise
      // be reported as the nearest "neighbor".
      if (j == i) {
        continue;
      }
      final double jx = adapter1.getDouble(data1, j), jy = adapter2.getDouble(data2, j);
      final double dx = ix - jx, dy = iy - jy;
      // Squared Euclidean.
      sqdists[n++] = dx * dx + dy * dy;
    }
    // Select the k-th smallest squared distance. k <= len - 1 holds for all
    // len >= 2, so the index is always valid.
    java.util.Arrays.sort(sqdists);
    // Euclidean
    final double kdist = FastMath.sqrt(sqdists[k - 1]);
    knns[i] = kdist;
    kdistmean += kdist;
  }
  kdistmean /= len;
  // Deviation from mean:
  double diff = 0.;
  int below = 0;
  for (int i = 0; i < knns.length; i++) {
    diff += Math.abs(kdistmean - knns[i]);
    if (knns[i] < kdistmean) {
      below++;
    }
  }
  // below > 0 implies kdistmean > 0, so the division is safe.
  return (below > 0) ? diff / (2. * kdistmean * below) : 0;
}
use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.
the class KMeansMinusMinus method run.
/**
 * Run the clustering on the given relation.
 *
 * <p>
 * Iterates assignment and mean recomputation until no assignment changes,
 * the total variance sum increases, or {@code maxiter} iterations were done
 * ({@code maxiter <= 0} means no iteration limit). A heap of size
 * {@code heapsize} is filled during assignment; its top element is used as
 * distance threshold when recomputing the means and, if {@code noiseFlag} is
 * set, for moving points into a separate noise cluster.
 *
 * @param database Database, passed to the initialization method
 * @param relation Relation to cluster
 * @return Clustering result; empty if the relation is empty
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Choose initial means
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  // Initialize the means
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // Initialize the heap: a rate below 1 is a fraction of the data set size,
  // otherwise it is used as an absolute number.
  final int heapsize = (int) (rate < 1. ? Math.ceil(relation.size() * rate) : rate);
  DoubleMinHeap minHeap = new DoubleMinHeap(heapsize);
  // Setup cluster assignment store
  List<ModifiableDoubleDBIDList> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
  // Otherwise, the vartotal break below will fail!
  assert (varstat != null);
  int iteration = 0;
  double prevvartotal = Double.POSITIVE_INFINITY;
  for (; maxiter <= 0 || iteration < maxiter; iteration++) {
    // Reset the heap and the per-cluster lists for this iteration.
    minHeap.clear();
    for (int i = 0; i < k; i++) {
      clusters.get(i).clear();
    }
    LOG.incrementProcessed(prog);
    boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, minHeap, heapsize);
    double vartotal = logVarstat(varstat, varsum);
    // Stop if no assignment changed, or the variance sum is larger
    // than the previous value.
    if (!changed || vartotal > prevvartotal) {
      break;
    }
    prevvartotal = vartotal;
    // Recompute means.
    means = meansWithTreshhold(clusters, means, relation, heapsize > 0 ? minHeap.peek() : Double.POSITIVE_INFINITY);
  }
  // Create the noise cluster if requested
  ModifiableDoubleDBIDList noiseids = null;
  if (noiseFlag && heapsize > 0) {
    clusters.add(noiseids = DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
    double tresh = minHeap.peek();
    for (int i = 0; i < k; i++) {
      for (DoubleDBIDListMIter it = clusters.get(i).iter(); it.valid(); it.advance()) {
        final double dist = it.doubleValue();
        // Add to the noise cluster:
        if (dist >= tresh) {
          noiseids.add(dist, it);
          assignment.putInt(it, k);
          it.remove();
        }
      }
    }
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int i = 0; i < k; i++) {
    DBIDs ids = clusters.get(i);
    if (ids.size() == 0) {
      continue;
    }
    KMeansModel model = new KMeansModel(means[i], varsum[i]);
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  // Noise Cluster. noiseids is only allocated when heapsize > 0, so it must
  // be null-checked here: noiseFlag alone does not guarantee it exists.
  if (noiseFlag && noiseids != null && noiseids.size() > 0) {
    KMeansModel model = new KMeansModel(null, 0);
    DBIDs ids = noiseids;
    result.addToplevelCluster(new Cluster<>(ids, true, model));
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap in project elki by elki-project.
the class EvaluateCIndex method evaluateClustering.
/**
 * Compute the C-Index of one clustering, log it, and attach it to the
 * result hierarchy of the database.
 *
 * @param db Database
 * @param rel Data relation
 * @param dq Distance query
 * @param c Clustering
 * @return C-Index
 */
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
  final List<? extends Cluster<?>> clusters = c.getAllClusters();
  // First pass: count the ignored noise objects and the number w of
  // within-cluster distances.
  int ignorednoise = 0;
  int w = 0;
  for (Cluster<?> cluster : clusters) {
    final int size = cluster.size();
    if (size <= 1 || cluster.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        // Does not contribute any distances.
        ignorednoise += size;
        continue;
      case TREAT_NOISE_AS_SINGLETONS:
        // Singletons have no within-cluster distances.
        continue;
      case MERGE_NOISE:
        // Counted like a regular cluster.
        break;
      default:
        LOG.warning("Unknown noise handling option: " + noiseOption);
      }
    }
    w += (size * (size - 1)) >>> 1;
  }
  // TODO: for small k=2, and balanced clusters, it may be more efficient to
  // just build a long array with all distances, and select the quantiles.
  // The heaps used below pay off in memory consumption for k > 2

  // The heap types look swapped on purpose: to keep the largest distances,
  // the smallest of them must be on top so that it can be replaced (and vice
  // versa for the smallest distances).
  final DoubleHeap maxDists = new DoubleMinHeap(w);
  final DoubleHeap minDists = new DoubleMaxHeap(w);
  // Second pass: accumulate the sum of within-cluster distances.
  double withinSum = 0.;
  final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null;
  for (int i = 0; i < clusters.size(); i++) {
    final Cluster<?> cluster = clusters.get(i);
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        // Nothing to process for ignored objects.
        LOG.incrementProcessed(progress);
        continue;
      case TREAT_NOISE_AS_SINGLETONS:
        processSingleton(cluster, rel, dq, maxDists, minDists, w);
        LOG.incrementProcessed(progress);
        continue;
      case MERGE_NOISE:
        // Handled like a regular cluster below.
        break;
      }
    }
    withinSum += processCluster(cluster, clusters, i, dq, maxDists, minDists, w);
    LOG.incrementProcessed(progress);
  }
  LOG.ensureCompleted(progress);
  // Best and worst case: sums of the collected smallest and largest distances.
  assert (minDists.size() == w);
  assert (maxDists.size() == w);
  double smallestSum = 0;
  for (DoubleHeap.UnsortedIter iter = minDists.unsortedIter(); iter.valid(); iter.advance()) {
    smallestSum += iter.get();
  }
  double largestSum = 0;
  for (DoubleHeap.UnsortedIter iter = maxDists.unsortedIter(); iter.valid(); iter.advance()) {
    largestSum += iter.get();
  }
  assert (largestSum >= smallestSum);
  // Normalize the within-cluster sum to the range spanned by both extremes;
  // a degenerate range yields 1.
  final double cIndex;
  if (largestSum > smallestSum) {
    cIndex = (withinSum - smallestSum) / (largestSum - smallestSum);
  } else {
    cIndex = 1.;
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
    if (ignorednoise > 0) {
      LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
  }
  // Publish the measure in the result hierarchy.
  final EvaluationResult evaluation = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  final MeasurementGroup group = evaluation.findOrCreateGroup("Distance-based Evaluation");
  group.addMeasure("C-Index", cIndex, 0., 1., 0., true);
  db.getHierarchy().resultChanged(evaluation);
  return cIndex;
}
Aggregations