Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
The class CTLuMoranScatterplotOutlier, method run.
/**
* Main method.
*
* @param database Database
* @param nrel Neighborhood relation
* @param relation Data relation (1d!)
* @return Outlier detection result
*/
public OutlierResult run(Database database, Relation<N> nrel, Relation<? extends NumberVector> relation) {
final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, nrel);
// Compute the global mean and variance
MeanVariance globalmv = new MeanVariance();
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
globalmv.put(relation.get(iditer).doubleValue(0));
}
DoubleMinMax minmax = new DoubleMinMax();
WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
// Calculate the neighborhood average of normalized attribute values.
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
// Compute global z score
final double globalZ = (relation.get(iditer).doubleValue(0) - globalmv.getMean()) / globalmv.getNaiveStddev();
// Compute local average z score
Mean localm = new Mean();
for (DBIDIter iter = npred.getNeighborDBIDs(iditer).iter(); iter.valid(); iter.advance()) {
if (DBIDUtil.equal(iditer, iter)) {
continue;
}
localm.put((relation.get(iter).doubleValue(0) - globalmv.getMean()) / globalmv.getNaiveStddev());
}
// if neighbors.size == 0
final double localZ;
if (localm.getCount() > 0) {
localZ = localm.getMean();
} else {
// if the object has no neighbors => Wz_i = z_i
localZ = globalZ;
}
// compute score
// Note: in the original Moran scatterplot, any object with a score < 0 would be an outlier.
final double score = Math.max(-globalZ * localZ, 0);
minmax.put(score);
scores.putDouble(iditer, score);
}
DoubleRelation scoreResult = new MaterializedDoubleRelation("MoranOutlier", "Moran Scatterplot Outlier", scores, relation.getDBIDs());
OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, 0);
OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
or.addChildResult(npred);
return or;
}
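The MeanVariance usage above reduces to a single pass that accumulates the global mean and naive standard deviation, followed by z-standardization of every value. A minimal sketch of that pattern on a plain double[] (the helper name zScores and the input array are illustrative only; the MeanVariance calls are the same ones used in run above):
// Sketch only: assumes import de.lmu.ifi.dbs.elki.math.MeanVariance;
static double[] zScores(double[] values) {
  MeanVariance mv = new MeanVariance();
  for(double v : values) {
    mv.put(v); // single-pass accumulation of mean and variance
  }
  final double mean = mv.getMean(), stddev = mv.getNaiveStddev();
  double[] z = new double[values.length];
  for(int i = 0; i < values.length; i++) {
    z[i] = (values[i] - mean) / stddev; // global z-score, as used for globalZ above
  }
  return z;
}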
Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
The class FastABOD, method run.
/**
* Run Fast-ABOD on the data set.
*
* @param db Database
* @param relation Relation to process
* @return Outlier detection result
*/
@Override
public OutlierResult run(Database db, Relation<V> relation) {
DBIDs ids = relation.getDBIDs();
// Build a kernel matrix, to make O(n^3) slightly less bad.
SimilarityQuery<V> sq = db.getSimilarityQuery(relation, kernelFunction);
KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
DoubleMinMax minmaxabod = new DoubleMinMax();
MeanVariance s = new MeanVariance();
KNNHeap nn = DBIDUtil.newHeap(k);
for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) {
final double simAA = kernelMatrix.getSimilarity(pA, pA);
// Collect the k nearest neighbors (smallest squared distances)
nn.clear();
for (DBIDIter nB = relation.iterDBIDs(); nB.valid(); nB.advance()) {
if (DBIDUtil.equal(nB, pA)) {
continue;
}
double simBB = kernelMatrix.getSimilarity(nB, nB);
double simAB = kernelMatrix.getSimilarity(pA, nB);
double sqdAB = simAA + simBB - simAB - simAB;
if (!(sqdAB > 0.)) {
continue;
}
nn.insert(sqdAB, nB);
}
KNNList nl = nn.toKNNList();
s.reset();
DoubleDBIDListIter iB = nl.iter(), iC = nl.iter();
for (; iB.valid(); iB.advance()) {
double sqdAB = iB.doubleValue();
double simAB = kernelMatrix.getSimilarity(pA, iB);
if (!(sqdAB > 0.)) {
continue;
}
for (iC.seek(iB.getOffset() + 1); iC.valid(); iC.advance()) {
double sqdAC = iC.doubleValue();
double simAC = kernelMatrix.getSimilarity(pA, iC);
if (!(sqdAC > 0.)) {
continue;
}
// Exploit bilinearity of scalar product:
// <B-A, C-A> = <B, C-A> - <A,C-A>
// = <B,C> - <B,A> - <A,C> + <A,A>
double simBC = kernelMatrix.getSimilarity(iB, iC);
double numerator = simBC - simAB - simAC + simAA;
double div = 1. / (sqdAB * sqdAC);
s.put(numerator * div, FastMath.sqrt(div));
}
}
// Sample variance would probably be better, but the ABOD
// publication uses the naive variance.
final double abof = s.getNaiveVariance();
minmaxabod.put(abof);
abodvalues.putDouble(pA, abof);
}
// Build result representation.
DoubleRelation scoreResult = new MaterializedDoubleRelation("Angle-Based Outlier Degree", "abod-outlier", abodvalues, relation.getDBIDs());
OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
return new OutlierResult(scoreMeta, scoreResult);
}
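FastABOD relies on the weighted form of MeanVariance.put: each angle term is added with weight 1/(dAB * dAC), and the angle-based outlier factor (ABOF) is the weighted naive variance of those terms. A small sketch of that weighted accumulation (the helper name and input arrays are hypothetical; only the MeanVariance calls mirror the inner loop above):
// Sketch only: angles[i] and weights[i] stand in for numerator * div and sqrt(div) above.
static double abofFromAngles(double[] angles, double[] weights) {
  MeanVariance s = new MeanVariance();
  for(int i = 0; i < angles.length; i++) {
    s.put(angles[i], weights[i]); // weighted accumulation
  }
  return s.getNaiveVariance(); // ABOF: a small variance of the weighted angles indicates an outlier
}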
Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
The class P3C, method chiSquaredUniformTest.
/**
* Performs a ChiSquared test to determine whether an attribute has a uniform
* distribution.
*
* @param parts Data partitions.
* @param marked the marked bins that should be ignored.
* @param card Cardinality
* @return Position of maximum, or -1 when uniform.
*/
private int chiSquaredUniformTest(SetDBIDs[] parts, long[] marked, int card) {
// Get global mean over all unmarked bins.
int max = 0, maxpos = -1;
MeanVariance mv = new MeanVariance();
for (int i = 0; i < parts.length; i++) {
// Ignore already marked bins.
if (BitsUtil.get(marked, i)) {
continue;
}
final int binSupport = parts[i].size();
mv.put(binSupport);
if (binSupport > max) {
max = binSupport;
maxpos = i;
}
}
if (mv.getCount() < 1. || !(mv.getNaiveVariance() > 0.)) {
return -1;
}
// ChiSquare statistic: naive variance of the bin sizes, normalized by their mean.
final double chiSquare = mv.getNaiveVariance() / mv.getMean();
final int binCount = parts.length - card;
final double test = ChiSquaredDistribution.cdf(chiSquare, Math.max(1, binCount - card - 1));
return ((1. - alpha) < test) ? maxpos : -1;
}
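MeanVariance supplies both the expected bin support under uniformity (the mean) and its naive variance in one pass over the unmarked bins; their ratio is the test statistic. A simplified sketch of the same test on plain bin counts (the helper name, inputs, and the reduced degrees of freedom are assumptions; MeanVariance and ChiSquaredDistribution.cdf are the same ELKI calls used above):
// Sketch only: binSizes and alpha are hypothetical inputs.
static boolean looksUniform(int[] binSizes, double alpha) {
  MeanVariance mv = new MeanVariance();
  for(int size : binSizes) {
    mv.put(size);
  }
  // Variance-to-mean ratio of the bin supports, as in the method above.
  double chiSquare = mv.getNaiveVariance() / mv.getMean();
  double p = ChiSquaredDistribution.cdf(chiSquare, Math.max(1, binSizes.length - 1));
  return p <= (1. - alpha); // otherwise the largest bin would be flagged
}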
Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
The class EvaluateSilhouette, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param dq Distance query
* @param c Clustering
* @return Average silhouette
*/
public double evaluateClustering(Database db, Relation<O> rel, DistanceQuery<O> dq, Clustering<?> c) {
List<? extends Cluster<?>> clusters = c.getAllClusters();
MeanVariance msil = new MeanVariance();
int ignorednoise = 0;
for (Cluster<?> cluster : clusters) {
// Note: we treat 1-element clusters the same as noise.
if (cluster.size() <= 1 || cluster.isNoise()) {
switch(noiseOption) {
case IGNORE_NOISE:
ignorednoise += cluster.size();
// Ignore noise elements
continue;
case TREAT_NOISE_AS_SINGLETONS:
// As suggested in Rousseeuw, we use 0 for singletons.
msil.put(0., cluster.size());
continue;
case MERGE_NOISE:
// Treat as cluster below
break;
}
}
ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs());
// temporary storage.
double[] as = new double[ids.size()];
DBIDArrayIter it1 = ids.iter(), it2 = ids.iter();
for (it1.seek(0); it1.valid(); it1.advance()) {
// a: In-cluster distances
// Already computed distances
double a = as[it1.getOffset()];
for (it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) {
final double dist = dq.distance(it1, it2);
a += dist;
as[it2.getOffset()] += dist;
}
a /= (ids.size() - 1);
// b: minimum average distance to other clusters:
double b = Double.POSITIVE_INFINITY;
for (Cluster<?> ocluster : clusters) {
if (ocluster == /* yes, reference identity */ cluster) {
// Same cluster
continue;
}
if (ocluster.size() <= 1 || ocluster.isNoise()) {
switch(noiseOption) {
case IGNORE_NOISE:
// Ignore noise elements
continue;
case TREAT_NOISE_AS_SINGLETONS:
// Treat noise cluster as singletons:
for (DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) {
final double dist = dq.distance(it1, it3);
// Minimum average
b = dist < b ? dist : b;
}
continue;
case MERGE_NOISE:
// Treat as cluster below
break;
}
}
final DBIDs oids = ocluster.getIDs();
double btmp = 0.;
for (DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) {
btmp += dq.distance(it1, it3);
}
// Average
btmp /= oids.size();
// Minimum average
b = btmp < b ? btmp : b;
}
// One cluster only?
b = b < Double.POSITIVE_INFINITY ? b : a;
msil.put((b - a) / (b > a ? b : a));
}
}
double penalty = 1.;
// Only if {@link NoiseHandling#IGNORE_NOISE}:
if (penalize && ignorednoise > 0) {
penalty = (rel.size() - ignorednoise) / (double) rel.size();
}
final double meansil = penalty * msil.getMean();
final double stdsil = penalty * msil.getSampleStddev();
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(key + ".silhouette.noise-handling", noiseOption.toString()));
if (ignorednoise > 0) {
LOG.statistics(new LongStatistic(key + ".silhouette.noise", ignorednoise));
}
LOG.statistics(new DoubleStatistic(key + ".silhouette.mean", meansil));
LOG.statistics(new DoubleStatistic(key + ".silhouette.stddev", stdsil));
}
EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
g.addMeasure("Silhouette +-" + FormatUtil.NF2.format(stdsil), meansil, -1., 1., 0., false);
db.getHierarchy().resultChanged(ev);
return meansil;
}
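The method ultimately collects one silhouette width per point in a MeanVariance and reports its mean and sample standard deviation. A stripped-down sketch of that accumulation, assuming the per-point mean distances a (own cluster) and b (nearest other cluster) have already been computed:
// Sketch only: a[] and b[] are hypothetical precomputed distances.
static double[] silhouetteSummary(double[] a, double[] b) {
  MeanVariance msil = new MeanVariance();
  for(int i = 0; i < a.length; i++) {
    msil.put((b[i] - a[i]) / Math.max(a[i], b[i])); // silhouette width of point i
  }
  return new double[] { msil.getMean(), msil.getSampleStddev() };
}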
Use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.
The class EvaluateSimplifiedSilhouette, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param c Clustering
* @return Mean simplified silhouette
*/
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
List<? extends Cluster<?>> clusters = c.getAllClusters();
NumberVector[] centroids = new NumberVector[clusters.size()];
int ignorednoise = centroids(rel, clusters, centroids, noiseOption);
MeanVariance mssil = new MeanVariance();
Iterator<? extends Cluster<?>> ci = clusters.iterator();
for (int i = 0; ci.hasNext(); i++) {
Cluster<?> cluster = ci.next();
if (cluster.size() <= 1) {
// As suggested in Rousseeuw, we use 0 for singletons.
mssil.put(0., cluster.size());
continue;
}
if (cluster.isNoise()) {
switch(noiseOption) {
case IGNORE_NOISE:
// Ignore elements
continue;
case TREAT_NOISE_AS_SINGLETONS:
// As suggested in Rousseeuw, we use 0 for singletons.
mssil.put(0., cluster.size());
continue;
case MERGE_NOISE:
// Treat as cluster below
break;
}
}
// Cluster center:
final NumberVector center = centroids[i];
assert (center != null);
for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
NumberVector obj = rel.get(it);
// a: Distance to own centroid
double a = distance.distance(center, obj);
// b: Distance to other clusters centroids:
double min = Double.POSITIVE_INFINITY;
Iterator<? extends Cluster<?>> cj = clusters.iterator();
for (int j = 0; cj.hasNext(); j++) {
Cluster<?> ocluster = cj.next();
if (i == j) {
continue;
}
NumberVector other = centroids[j];
if (other == null) {
// Noise!
switch(noiseOption) {
case IGNORE_NOISE:
continue;
case TREAT_NOISE_AS_SINGLETONS:
// Treat each object like a centroid!
for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
double dist = distance.distance(rel.get(it2), obj);
min = dist < min ? dist : min;
}
continue;
case MERGE_NOISE:
// Treat as cluster below, but should not be reachable.
break;
}
}
// Clusters: use centroid.
double dist = distance.distance(other, obj);
min = dist < min ? dist : min;
}
// One 'real' cluster only?
min = min < Double.POSITIVE_INFINITY ? min : a;
mssil.put((min - a) / (min > a ? min : a));
}
}
double penalty = 1.;
// Only if {@link NoiseHandling#IGNORE_NOISE}:
if (penalize && ignorednoise > 0) {
penalty = (rel.size() - ignorednoise) / (double) rel.size();
}
final double meanssil = penalty * mssil.getMean();
final double stdssil = penalty * mssil.getSampleStddev();
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
if (ignorednoise > 0) {
LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
}
LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
}
EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
g.addMeasure("Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
db.getHierarchy().resultChanged(ev);
return meanssil;
}
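Noise handling above uses the weighted variant of MeanVariance.put: a noise cluster of n points contributes n silhouette values of 0 in a single call. A tiny illustration of how the weighted and unweighted forms combine (the numbers are hypothetical):
MeanVariance mssil = new MeanVariance();
mssil.put(0.7);   // a regular point with simplified silhouette 0.7
mssil.put(0.4);   // another regular point
mssil.put(0., 3); // a 3-element noise cluster counted as three zeros, as in mssil.put(0., cluster.size())
// mssil.getCount() == 5 and mssil.getMean() == (0.7 + 0.4) / 5 == 0.22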