use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
the class CTLuZTestOutlier method run.
/**
 * Main method.
 *
 * @param database Database
 * @param nrel Neighborhood relation
 * @param relation Data relation (1d!)
 * @return Outlier detection result
 */
public OutlierResult run(Database database, Relation<N> nrel, Relation<? extends NumberVector> relation) {
  final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, nrel);
  WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
  MeanVariance zmv = new MeanVariance();
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    DBIDs neighbors = npred.getNeighborDBIDs(iditer);
    // Compute the mean of the neighborhood, excluding the point itself.
    Mean localmean = new Mean();
    for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
      if (DBIDUtil.equal(iditer, iter)) {
        continue;
      }
      localmean.put(relation.get(iter).doubleValue(0));
    }
    final double localdiff;
    if (localmean.getCount() > 0) {
      localdiff = relation.get(iditer).doubleValue(0) - localmean.getMean();
    } else {
      localdiff = 0.0;
    }
    scores.putDouble(iditer, localdiff);
    zmv.put(localdiff);
  }
  // Normalize scores using mean and variance
  DoubleMinMax minmax = new DoubleMinMax();
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    double score = Math.abs(scores.doubleValue(iditer) - zmv.getMean()) / zmv.getSampleStddev();
    minmax.put(score);
    scores.putDouble(iditer, score);
  }
  // Wrap result
  DoubleRelation scoreResult = new MaterializedDoubleRelation("ZTest", "Z Test score", scores, relation.getDBIDs());
  OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0);
  OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
  or.addChildResult(npred);
  return or;
}
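Below is a minimal standalone sketch of the MeanVariance z-score pattern used above. The class name ZScoreSketch and the diffs parameter are hypothetical; only the MeanVariance API calls (put, getMean, getSampleStddev) come from ELKI.

import de.lmu.ifi.dbs.elki.math.MeanVariance;

public class ZScoreSketch {
  public static double[] zscores(double[] diffs) {
    // Accumulate mean and variance in a single pass.
    MeanVariance mv = new MeanVariance();
    for (double d : diffs) {
      mv.put(d);
    }
    final double mean = mv.getMean(), stddev = mv.getSampleStddev();
    // Absolute z-score of each value, as in the normalization loop above
    // (assumes a non-degenerate variance, i.e. stddev > 0).
    double[] scores = new double[diffs.length];
    for (int i = 0; i < diffs.length; i++) {
      scores[i] = Math.abs(diffs[i] - mean) / stddev;
    }
    return scores;
  }
}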
use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
the class SpatialApproximationMaterializeKNNPreprocessor method preprocess.
@Override
protected void preprocess() {
  DistanceQuery<O> distanceQuery = relation.getDistanceQuery(distanceFunction);
  SpatialIndexTree<N, E> index = getSpatialIndex(relation);
  storage = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC, KNNList.class);
  MeanVariance pagesize = new MeanVariance();
  MeanVariance ksize = new MeanVariance();
  final Logging log = getLogger();
  if (log.isVerbose()) {
    log.verbose("Approximating nearest neighbor lists to database objects");
  }
  List<E> leaves = index.getLeaves();
  FiniteProgress progress = log.isVerbose() ? new FiniteProgress("Processing leaf nodes", leaves.size(), log) : null;
  for (E leaf : leaves) {
    N node = index.getNode(leaf);
    int size = node.getNumEntries();
    pagesize.put(size);
    if (log.isDebuggingFinest()) {
      log.debugFinest("NumEntries = " + size);
    }
    // Collect the ids in this node.
    ArrayModifiableDBIDs ids = DBIDUtil.newArray(size);
    for (int i = 0; i < size; i++) {
      ids.add(((LeafEntry) node.getEntry(i)).getDBID());
    }
    Object2DoubleOpenHashMap<DBIDPair> cache = new Object2DoubleOpenHashMap<>((size * size * 3) >> 3);
    // Sentinel: removeDouble() must return NaN for absent keys, so that the
    // "d == d" test below can distinguish cache hits from misses.
    cache.defaultReturnValue(Double.NaN);
    for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
      KNNHeap kNN = DBIDUtil.newHeap(k);
      for (DBIDIter id2 = ids.iter(); id2.valid(); id2.advance()) {
        DBIDPair key = DBIDUtil.newPair(id, id2);
        double d = cache.removeDouble(key);
        if (d == d) {
          // Not NaN: consume the previously computed distance.
          kNN.insert(d, id2);
        } else {
          // Cache miss: compute the distance, and store it under the
          // reversed pair, which will be queried later.
          d = distanceQuery.distance(id, id2);
          kNN.insert(d, id2);
          key = DBIDUtil.newPair(id2, id);
          cache.put(key, d);
        }
      }
      ksize.put(kNN.size());
      storage.put(id, kNN.toKNNList());
    }
    if (log.isDebugging() && cache.size() > 0) {
      log.warning("Cache should be empty after each run, but still has " + cache.size() + " elements.");
    }
    log.incrementProcessed(progress);
  }
  log.ensureCompleted(progress);
  if (log.isVerbose()) {
    log.verbose("Average page size = " + pagesize.getMean() + " +- " + pagesize.getSampleStddev());
    log.verbose("On average, " + ksize.getMean() + " +- " + ksize.getSampleStddev() + " neighbors returned.");
  }
}
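A small self-contained sketch of the NaN-sentinel idiom above; the class name and string keys are hypothetical, while Object2DoubleOpenHashMap, defaultReturnValue, and removeDouble are the real fastutil API. removeDouble() returns the map's default value when the key is absent, so setting that default to NaN lets a single "d == d" test distinguish hits from misses in one lookup.

import it.unimi.dsi.fastutil.objects.Object2DoubleOpenHashMap;

public class NaNSentinelSketch {
  public static void main(String[] args) {
    Object2DoubleOpenHashMap<String> cache = new Object2DoubleOpenHashMap<>();
    cache.defaultReturnValue(Double.NaN); // sentinel for "not cached"
    cache.put("a-b", 1.5);
    double d = cache.removeDouble("a-b"); // hit: returns 1.5 and removes the entry
    System.out.println(d == d);           // true: a valid distance was cached
    d = cache.removeDouble("a-b");        // miss: returns the NaN sentinel
    System.out.println(d == d);           // false: the distance must be computed
  }
}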
use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
the class PerplexityAffinityMatrixBuilder method computePij.
/**
 * Compute the pij from the distance matrix.
 *
 * @param dist Distance matrix.
 * @param perplexity Desired perplexity
 * @param initialScale Initial scale
 * @return Affinity matrix pij
 */
protected static double[][] computePij(double[][] dist, double perplexity, double initialScale) {
  final int size = dist.length;
  final double logPerp = FastMath.log(perplexity);
  double[][] pij = new double[size][size];
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Optimizing perplexities", size, LOG) : null;
  Duration timer = LOG.isStatistics() ? LOG.newDuration(PerplexityAffinityMatrixBuilder.class.getName() + ".runtime.pijmatrix").begin() : null;
  MeanVariance mv = LOG.isStatistics() ? new MeanVariance() : null;
  for (int i = 0; i < size; i++) {
    double beta = computePi(i, dist[i], pij[i], perplexity, logPerp);
    if (mv != null) {
      // Track sigma = sqrt(1 / (2 beta)), the Gaussian kernel bandwidth.
      mv.put(beta > 0 ? FastMath.sqrt(.5 / beta) : 0.);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  if (LOG.isStatistics()) {
    // timer != null, mv != null
    LOG.statistics(timer.end());
    LOG.statistics(new DoubleStatistic(PerplexityAffinityMatrixBuilder.class.getName() + ".sigma.average", mv.getMean()));
    LOG.statistics(new DoubleStatistic(PerplexityAffinityMatrixBuilder.class.getName() + ".sigma.stddev", mv.getSampleStddev()));
  }
  // Scale pij to have the desired sum EARLY_EXAGGERATION
  double sum = 0.;
  for (int i = 1; i < size; i++) {
    final double[] pij_i = pij[i];
    for (int j = 0; j < i; j++) {
      // Only iterate over half the matrix, establishing symmetry:
      // fold the upper triangle into the lower triangle.
      sum += (pij_i[j] += pij[j][i]);
    }
  }
  // Scaling taken from the original tSNE code:
  final double scale = initialScale / (2. * sum);
  for (int i = 1; i < size; i++) {
    final double[] pij_i = pij[i];
    for (int j = 0; j < i; j++) {
      pij_i[j] = pij[j][i] = MathUtil.max(pij_i[j] * scale, MIN_PIJ);
    }
  }
  return pij;
}
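The following sketch isolates the symmetrization and scaling step from the two triangular loops above; SymmetrizeSketch and its parameter names are hypothetical, and Math.max stands in for ELKI's MathUtil.max.

public class SymmetrizeSketch {
  public static double[][] symmetrize(double[][] p, double initialScale, double minPij) {
    final int n = p.length;
    // Fold the upper triangle into the lower triangle; the running sum
    // covers each unordered pair exactly once.
    double sum = 0.;
    for (int i = 1; i < n; i++) {
      for (int j = 0; j < i; j++) {
        sum += (p[i][j] += p[j][i]);
      }
    }
    // Both triangles contribute to the final total, hence the factor 2.
    final double scale = initialScale / (2. * sum);
    for (int i = 1; i < n; i++) {
      for (int j = 0; j < i; j++) {
        // Mirror the scaled value and clamp tiny affinities from below.
        p[i][j] = p[j][i] = Math.max(p[i][j] * scale, minPij);
      }
    }
    return p;
  }
}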
use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
the class DistanceStatisticsWithClasses method run.
@Override
public HistogramResult run(Database database) {
  final Relation<O> relation = database.getRelation(getInputTypeRestriction()[0]);
  final DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
  final StepProgress stepprog = LOG.isVerbose() ? new StepProgress("Distance statistics", 2) : null;
  // Determine binning ranges.
  DoubleMinMax gminmax = new DoubleMinMax();
  // Cluster by labels
  Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
  // global in-cluster min/max
  DoubleMinMax giminmax = new DoubleMinMax();
  // global other-cluster min/max
  DoubleMinMax gominmax = new DoubleMinMax();
  // in-cluster distances
  MeanVariance mimin = new MeanVariance();
  MeanVariance mimax = new MeanVariance();
  MeanVariance midif = new MeanVariance();
  // other-cluster distances
  MeanVariance momin = new MeanVariance();
  MeanVariance momax = new MeanVariance();
  MeanVariance modif = new MeanVariance();
  // Histogram
  final ObjHistogram<long[]> histogram;
  LOG.beginStep(stepprog, 1, "Prepare histogram.");
  if (exact) {
    gminmax = exactMinMax(relation, distFunc);
    histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
  } else if (sampling) {
    gminmax = sampleMinMax(relation, distFunc);
    histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
  } else {
    histogram = new AbstractObjDynamicHistogram<long[]>(numbin) {
      @Override
      protected long[] downsample(Object[] data, int start, int end, int size) {
        long[] ret = new long[2];
        for (int i = start; i < end; i++) {
          long[] existing = (long[]) data[i];
          if (existing != null) {
            for (int c = 0; c < 2; c++) {
              ret[c] += existing[c];
            }
          }
        }
        return ret;
      }

      @Override
      protected long[] aggregate(long[] first, long[] second) {
        for (int c = 0; c < 2; c++) {
          first[c] += second[c];
        }
        return first;
      }

      @Override
      protected long[] cloneForCache(long[] data) {
        return data.clone();
      }

      @Override
      protected long[] makeObject() {
        return new long[2];
      }
    };
  }
  LOG.beginStep(stepprog, 2, "Build histogram.");
  final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), LOG) : null;
  // Iterate per cluster; the two-element increments select the histogram
  // column: [1, 0] counts in-cluster, [0, 1] counts other-cluster distances.
  final long[] incFirst = new long[] { 1L, 0L };
  final long[] incSecond = new long[] { 0L, 1L };
  for (Cluster<?> c1 : split) {
    for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) {
      // in-cluster distances
      DoubleMinMax iminmax = new DoubleMinMax();
      for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) {
        // skip the point itself.
        if (DBIDUtil.equal(id1, iter2)) {
          continue;
        }
        double d = distFunc.distance(id1, iter2);
        histogram.putData(d, incFirst);
        iminmax.put(d);
      }
      // aggregate
      mimin.put(iminmax.getMin());
      mimax.put(iminmax.getMax());
      midif.put(iminmax.getDiff());
      // min/max
      giminmax.put(iminmax.getMin());
      giminmax.put(iminmax.getMax());
      // other-cluster distances
      DoubleMinMax ominmax = new DoubleMinMax();
      for (Cluster<?> c2 : split) {
        if (c2 == c1) {
          continue;
        }
        for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) {
          // skip the point itself (shouldn't happen though)
          if (DBIDUtil.equal(id1, iter2)) {
            continue;
          }
          double d = distFunc.distance(id1, iter2);
          histogram.putData(d, incSecond);
          ominmax.put(d);
        }
      }
      // aggregate
      momin.put(ominmax.getMin());
      momax.put(ominmax.getMax());
      modif.put(ominmax.getDiff());
      // min/max
      gominmax.put(ominmax.getMin());
      gominmax.put(ominmax.getMax());
      LOG.incrementProcessed(progress);
    }
  }
  LOG.ensureCompleted(progress);
  // Update values (only needed for the sampling case).
  gminmax.put(gominmax);
  LOG.setCompleted(stepprog);
  // Count the number of samples we have in the data.
  long inum = 0;
  long onum = 0;
  for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
    inum += iter.getValue()[0];
    onum += iter.getValue()[1];
  }
  long bnum = inum + onum;
  Collection<double[]> binstat = new ArrayList<>(numbin);
  for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
    final long[] value = iter.getValue();
    final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / histogram.getBinsize();
    final double icaf = ((double) value[0]) / bnum / histogram.getBinsize();
    final double ocof = (onum == 0) ? 0 : ((double) value[1]) / onum / histogram.getBinsize();
    final double ocaf = ((double) value[1]) / bnum / histogram.getBinsize();
    binstat.add(new double[] { iter.getCenter(), icof, icaf, ocof, ocaf });
  }
  HistogramResult result = new HistogramResult("Distance Histogram", "distance-histogram", binstat);
  result.addHeader("Absolute minimum distance (abs): " + gminmax.getMin());
  result.addHeader("Absolute maximum distance (abs): " + gminmax.getMax());
  result.addHeader("In-Cluster minimum distance (abs, avg, stddev): " + giminmax.getMin() + " " + mimin.getMean() + " " + mimin.getSampleStddev());
  result.addHeader("In-Cluster maximum distance (abs, avg, stddev): " + giminmax.getMax() + " " + mimax.getMean() + " " + mimax.getSampleStddev());
  result.addHeader("Other-Cluster minimum distance (abs, avg, stddev): " + gominmax.getMin() + " " + momin.getMean() + " " + momin.getSampleStddev());
  result.addHeader("Other-Cluster maximum distance (abs, avg, stddev): " + gominmax.getMax() + " " + momax.getMean() + " " + momax.getSampleStddev());
  result.addHeader("Column description: bin center, in-cluster only frequency, in-cluster all frequency, other-cluster only frequency, other-cluster all frequency");
  result.addHeader("In-cluster value count: " + inum + " other-cluster value count: " + onum);
  return result;
}
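The bin statistics above divide each count by a total and by the bin width, turning raw counts into density-like frequencies. A minimal standalone sketch of that normalization (HistogramDensitySketch and its parameters are hypothetical):

public class HistogramDensitySketch {
  public static double[] toDensity(long[] counts, double binsize) {
    long total = 0;
    for (long c : counts) {
      total += c;
    }
    double[] density = new double[counts.length];
    for (int i = 0; i < counts.length; i++) {
      // Guard against an empty histogram, as the icof/ocof terms above do.
      density[i] = (total == 0) ? 0 : ((double) counts[i]) / total / binsize;
    }
    return density;
  }
}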
use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
the class KNNBenchmarkAlgorithm method run.
/**
 * Run the algorithm.
 *
 * @param database Database
 * @param relation Relation
 * @return Null result
 */
public Result run(Database database, Relation<O> relation) {
  // Get a distance and kNN query instance.
  DistanceQuery<O> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
  KNNQuery<O> knnQuery = database.getKNNQuery(distQuery, k);
  // No query set - use the original database.
  if (queries == null) {
    final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), sampling, random);
    FiniteProgress prog = LOG.isVeryVerbose() ? new FiniteProgress("kNN queries", sample.size(), LOG) : null;
    int hash = 0;
    MeanVariance mv = new MeanVariance(), mvdist = new MeanVariance();
    for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
      KNNList knns = knnQuery.getKNNForDBID(iditer, k);
      int ichecksum = 0;
      for (DBIDIter it = knns.iter(); it.valid(); it.advance()) {
        ichecksum += DBIDUtil.asInteger(it);
      }
      hash = Util.mixHashCodes(hash, ichecksum);
      mv.put(knns.size());
      mvdist.put(knns.getKNNDistance());
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics("Result hashcode: " + hash);
      LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev());
      if (mvdist.getCount() > 0) {
        LOG.statistics("Mean k-distance: " + mvdist.getMean() + " +- " + mvdist.getNaiveStddev());
      }
    }
  } else {
    // Separate query set.
    TypeInformation res = getDistanceFunction().getInputTypeRestriction();
    MultipleObjectsBundle bundle = queries.loadData();
    int col = -1;
    for (int i = 0; i < bundle.metaLength(); i++) {
      if (res.isAssignableFromType(bundle.meta(i))) {
        col = i;
        break;
      }
    }
    if (col < 0) {
      throw new IncompatibleDataException("No compatible data type in query input was found. Expected: " + res.toString());
    }
    // Random sampling is a bit of a hack, sorry.
    // But currently, we don't (yet) have an "integer random sample" function.
    DBIDRange sids = DBIDUtil.generateStaticDBIDRange(bundle.dataLength());
    final DBIDs sample = DBIDUtil.randomSample(sids, sampling, random);
    FiniteProgress prog = LOG.isVeryVerbose() ? new FiniteProgress("kNN queries", sample.size(), LOG) : null;
    int hash = 0;
    MeanVariance mv = new MeanVariance(), mvdist = new MeanVariance();
    for (DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
      int off = sids.binarySearch(iditer);
      assert (off >= 0);
      @SuppressWarnings("unchecked")
      O o = (O) bundle.data(off, col);
      KNNList knns = knnQuery.getKNNForObject(o, k);
      int ichecksum = 0;
      for (DBIDIter it = knns.iter(); it.valid(); it.advance()) {
        ichecksum += DBIDUtil.asInteger(it);
      }
      hash = Util.mixHashCodes(hash, ichecksum);
      mv.put(knns.size());
      mvdist.put(knns.getKNNDistance());
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    if (LOG.isStatistics()) {
      LOG.statistics("Result hashcode: " + hash);
      LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev());
      if (mvdist.getCount() > 0) {
        LOG.statistics("Mean k-distance: " + mvdist.getMean() + " +- " + mvdist.getNaiveStddev());
      }
    }
  }
  return null;
}
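The per-query checksum above gives a cheap fingerprint of the kNN results for regression testing: the sum of neighbor IDs is order-independent within one result list, while the mixing step keeps the final hash sensitive to which query produced which neighbors. A sketch of the same idea on plain int arrays (the class name and the 31-multiplier are hypothetical stand-ins for Util.mixHashCodes):

public class ChecksumSketch {
  public static int checksum(int[][] neighborIds) {
    int hash = 0;
    for (int[] knn : neighborIds) {
      // Sum is insensitive to the order of ties within one result list.
      int ichecksum = 0;
      for (int id : knn) {
        ichecksum += id;
      }
      // Mix per-query sums so swapped results change the final hash.
      hash = 31 * hash + ichecksum;
    }
    return hash;
  }
}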