Search in sources :

Example 41 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class CTLuMoranScatterplotOutlier method run.

/**
 * Compute Moran scatterplot outlier scores.
 *
 * For every object, the z score of its first attribute value (using the
 * global mean and naive standard deviation) is combined with the average z
 * score of its neighbors (the object itself excluded). The score is the
 * negated product of both values, clipped at 0 from below.
 *
 * @param database Database
 * @param nrel Neighborhood relation
 * @param relation Data relation (1d!)
 * @return Outlier detection result
 */
public OutlierResult run(Database database, Relation<N> nrel, Relation<? extends NumberVector> relation) {
    final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, nrel);
    // First pass: global mean and variance of the attribute.
    MeanVariance globalmv = new MeanVariance();
    for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
        globalmv.put(relation.get(it).doubleValue(0));
    }
    DoubleMinMax minmax = new DoubleMinMax();
    WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_STATIC);
    // Second pass: compare each object's z score to that of its neighborhood.
    for (DBIDIter cur = relation.iterDBIDs(); cur.valid(); cur.advance()) {
        // Global statistics, read once per object and reused for its neighbors.
        final double mean = globalmv.getMean();
        final double stddev = globalmv.getNaiveStddev();
        // Global z score of the current object.
        final double globalZ = (relation.get(cur).doubleValue(0) - mean) / stddev;
        // Average z score of the neighbors, not counting the object itself.
        Mean localm = new Mean();
        for (DBIDIter nb = npred.getNeighborDBIDs(cur).iter(); nb.valid(); nb.advance()) {
            if (!DBIDUtil.equal(cur, nb)) {
                localm.put((relation.get(nb).doubleValue(0) - mean) / stddev);
            }
        }
        // An object without neighbors uses its own z score: Wzi = zi
        final double localZ = localm.getCount() > 0 ? localm.getMean() : globalZ;
        // Note: in the original moran scatterplot, any object with a score < 0
        // would be an outlier; here the sign is flipped and clipped at 0.
        final double score = Math.max(-globalZ * localZ, 0);
        minmax.put(score);
        scores.putDouble(cur, score);
    }
    DoubleRelation scoreResult = new MaterializedDoubleRelation("MoranOutlier", "Moran Scatterplot Outlier", scores, relation.getDBIDs());
    OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, 0);
    OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
    result.addChildResult(npred);
    return result;
}
Also used : Mean(de.lmu.ifi.dbs.elki.math.Mean) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) NeighborSetPredicate(de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.NeighborSetPredicate) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) BasicOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) BasicOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 42 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class FastABOD method run.

/**
 * Run Fast-ABOD on the data set.
 *
 * For each object, a neighbor heap created with parameter {@code k} collects
 * the other objects by their kernel-induced squared distance; the score is
 * the naive variance of the weighted angle terms over all pairs of these
 * neighbors.
 *
 * @param db Database, used to obtain the similarity query
 * @param relation Relation to process
 * @return Outlier detection result
 */
@Override
public OutlierResult run(Database db, Relation<V> relation) {
    DBIDs ids = relation.getDBIDs();
    // Precompute a kernel matrix, to make O(n^3) slightly less bad.
    SimilarityQuery<V> sq = db.getSimilarityQuery(relation, kernelFunction);
    KernelMatrix kernelMatrix = new KernelMatrix(sq, relation, ids);
    WritableDoubleDataStore abodvalues = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
    DoubleMinMax minmaxabod = new DoubleMinMax();
    // Both are reused for every object (reset / cleared per iteration).
    MeanVariance angles = new MeanVariance();
    KNNHeap heap = DBIDUtil.newHeap(k);
    for (DBIDIter pA = ids.iter(); pA.valid(); pA.advance()) {
        final double simAA = kernelMatrix.getSimilarity(pA, pA);
        // Collect the nearest neighbors of A in the heap.
        heap.clear();
        for (DBIDIter cand = relation.iterDBIDs(); cand.valid(); cand.advance()) {
            if (!DBIDUtil.equal(cand, pA)) {
                final double simBB = kernelMatrix.getSimilarity(cand, cand);
                final double simAB = kernelMatrix.getSimilarity(pA, cand);
                // Squared distance expressed via the kernel: <A,A> + <B,B> - 2<A,B>
                final double sqdAB = simAA + simBB - simAB - simAB;
                // Positive test: zero, negative and NaN distances are all skipped.
                if (sqdAB > 0.) {
                    heap.insert(sqdAB, cand);
                }
            }
        }
        KNNList neighbors = heap.toKNNList();
        angles.reset();
        DoubleDBIDListIter iB = neighbors.iter(), iC = neighbors.iter();
        for (; iB.valid(); iB.advance()) {
            final double sqdAB = iB.doubleValue();
            final double simAB = kernelMatrix.getSimilarity(pA, iB);
            if (sqdAB > 0.) {
                // Pair B with every neighbor C that follows it in the list.
                for (iC.seek(iB.getOffset() + 1); iC.valid(); iC.advance()) {
                    final double sqdAC = iC.doubleValue();
                    final double simAC = kernelMatrix.getSimilarity(pA, iC);
                    if (sqdAC > 0.) {
                        // Exploit bilinearity of scalar product:
                        // <B-A, C-A> = <B, C-A> - <A,C-A>
                        // = <B,C> - <B,A> - <A,C> + <A,A>
                        final double simBC = kernelMatrix.getSimilarity(iB, iC);
                        final double numerator = simBC - simAB - simAC + simAA;
                        final double div = 1. / (sqdAB * sqdAC);
                        // Value: scalar product over squared distances; weight: sqrt(div).
                        angles.put(numerator * div, FastMath.sqrt(div));
                    }
                }
            }
        }
        // Sample variance would probably be better, but the ABOD
        // publication uses the naive variance.
        final double abof = angles.getNaiveVariance();
        minmaxabod.put(abof);
        abodvalues.putDouble(pA, abof);
    }
    // Wrap the scores into the result representation.
    DoubleRelation scoreResult = new MaterializedDoubleRelation("Angle-Based Outlier Degree", "abod-outlier", abodvalues, relation.getDBIDs());
    OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(minmaxabod.getMin(), minmaxabod.getMax(), 0.0, Double.POSITIVE_INFINITY);
    return new OutlierResult(scoreMeta, scoreResult);
}
Also used : DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) InvertedOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta) KNNHeap(de.lmu.ifi.dbs.elki.database.ids.KNNHeap) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) InvertedOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.InvertedOutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) KernelMatrix(de.lmu.ifi.dbs.elki.distance.similarityfunction.kernel.KernelMatrix) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)

Example 43 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class P3C method chiSquaredUniformTest.

/**
 * Performs a ChiSquared test on the supports (sizes) of all unmarked bins to
 * determine whether an attribute has a uniform distribution.
 *
 * @param parts Data partitions.
 * @param marked the marked bins that should be ignored.
 * @param card Cardinality
 * @return Position of maximum, or -1 when uniform.
 */
private int chiSquaredUniformTest(SetDBIDs[] parts, long[] marked, int card) {
    // Statistics of the bin supports, and the largest unmarked bin seen.
    MeanVariance support = new MeanVariance();
    int largest = 0, largestBin = -1;
    for (int bin = 0; bin < parts.length; bin++) {
        // Only bins that are not yet marked take part in the test.
        if (!BitsUtil.get(marked, bin)) {
            final int size = parts[bin].size();
            support.put(size);
            if (size > largest) {
                largest = size;
                largestBin = bin;
            }
        }
    }
    // Nothing to test, or no (positive) variance in the supports: uniform.
    if (support.getCount() < 1. || !(support.getNaiveVariance() > 0.)) {
        return -1;
    }
    // ChiSquare statistic is the naive variance of the sizes!
    final double chiSquare = support.getNaiveVariance() / support.getMean();
    final int binCount = parts.length - card;
    // NOTE(review): card is subtracted here a second time (it is already
    // removed from binCount above) - confirm this is intended.
    final int degreesOfFreedom = Math.max(1, binCount - card - 1);
    final double test = ChiSquaredDistribution.cdf(chiSquare, degreesOfFreedom);
    if ((1. - alpha) < test) {
        return largestBin;
    }
    return -1;
}
Also used : MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance)

Example 44 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class EvaluateSilhouette method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * Computes the silhouette of every object (a: average distance to the other
 * members of its cluster, b: minimum average distance to another cluster),
 * logs mean and standard deviation, and attaches the measure to the
 * evaluation result of the clustering. Noise clusters and clusters with at
 * most one element are handled according to the configured noise option.
 *
 * Objects for which the silhouette is undefined (single-element clusters, or
 * a = b = 0) contribute a silhouette of 0 instead of NaN.
 *
 * @param db Database
 * @param rel Data relation
 * @param dq Distance query
 * @param c Clustering
 * @return Average silhouette
 */
public double evaluateClustering(Database db, Relation<O> rel, DistanceQuery<O> dq, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    MeanVariance msil = new MeanVariance();
    int ignorednoise = 0;
    for (Cluster<?> cluster : clusters) {
        // Note: we treat 1-element clusters the same as noise.
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    ignorednoise += cluster.size();
                    // Ignore noise elements
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // As suggested in Rousseeuw, we use 0 for singletons.
                    msil.put(0., cluster.size());
                    continue;
                case MERGE_NOISE:
                    // Treat as cluster below
                    break;
            }
        }
        if (cluster.size() == 1) {
            // Reached when the noise option did not skip this cluster above
            // (e.g. MERGE_NOISE). The in-cluster average below would divide
            // by (size - 1) = 0 and yield NaN, so use 0 for the singleton,
            // as suggested in Rousseeuw.
            msil.put(0.);
            continue;
        }
        ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs());
        // temporary storage: partial in-cluster distance sums per object.
        double[] as = new double[ids.size()];
        DBIDArrayIter it1 = ids.iter(), it2 = ids.iter();
        for (it1.seek(0); it1.valid(); it1.advance()) {
            // a: In-cluster distances
            // Start with the distances already accumulated by earlier objects.
            double a = as[it1.getOffset()];
            for (it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) {
                final double dist = dq.distance(it1, it2);
                a += dist;
                // Each pair is computed only once; remember it for the partner.
                as[it2.getOffset()] += dist;
            }
            a /= (ids.size() - 1);
            // b: minimum average distance to other clusters:
            double b = Double.POSITIVE_INFINITY;
            for (Cluster<?> ocluster : clusters) {
                if (ocluster == /* yes, reference identity */
                cluster) {
                    // Same cluster
                    continue;
                }
                if (ocluster.size() <= 1 || ocluster.isNoise()) {
                    switch(noiseOption) {
                        case IGNORE_NOISE:
                            // Ignore noise elements
                            continue;
                        case TREAT_NOISE_AS_SINGLETONS:
                            // Treat noise cluster as singletons:
                            for (DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) {
                                final double dist = dq.distance(it1, it3);
                                // Minimum average
                                b = dist < b ? dist : b;
                            }
                            continue;
                        case MERGE_NOISE:
                            // Treat as cluster below
                            break;
                    }
                }
                final DBIDs oids = ocluster.getIDs();
                double btmp = 0.;
                for (DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) {
                    btmp += dq.distance(it1, it3);
                }
                // Average
                btmp /= oids.size();
                // Minimum average
                b = btmp < b ? btmp : b;
            }
            // One cluster only?
            b = b < Double.POSITIVE_INFINITY ? b : a;
            // Guard the 0/0 case (a = b = 0, e.g. duplicate objects), which
            // would otherwise put NaN into the mean.
            final double denom = b > a ? b : a;
            msil.put(denom == 0. ? 0. : (b - a) / denom);
        }
    }
    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
        penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meansil = penalty * msil.getMean();
    final double stdsil = penalty * msil.getSampleStddev();
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".silhouette.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".silhouette.noise", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".silhouette.mean", meansil));
        LOG.statistics(new DoubleStatistic(key + ".silhouette.stddev", stdsil));
    }
    // Attach the measure to the evaluation result and notify listeners.
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Silhouette +-" + FormatUtil.NF2.format(stdsil), meansil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meansil;
}
Also used : ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)

Example 45 with MeanVariance

use of de.lmu.ifi.dbs.elki.math.MeanVariance in project elki by elki-project.

the class EvaluateSimplifiedSilhouette method evaluateClustering.

/**
 * Evaluate a single clustering.
 *
 * Computes the simplified silhouette of every object (a: distance to the
 * centroid of its own cluster, b: minimum distance to the centroid of another
 * cluster), logs mean and standard deviation, and attaches the measure to the
 * evaluation result of the clustering.
 *
 * Objects for which the value is undefined (a = b = 0) contribute 0 instead
 * of NaN.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return Mean simplified silhouette
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    int ignorednoise = centroids(rel, clusters, centroids, noiseOption);
    MeanVariance mssil = new MeanVariance();
    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    // i is the position of the cluster, matching the centroids array.
    for (int i = 0; ci.hasNext(); i++) {
        Cluster<?> cluster = ci.next();
        if (cluster.size() <= 1) {
            // As suggested in Rousseeuw, we use 0 for singletons.
            mssil.put(0., cluster.size());
            continue;
        }
        if (cluster.isNoise()) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    // Ignore elements
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // As suggested in Rousseeuw, we use 0 for singletons.
                    mssil.put(0., cluster.size());
                    continue;
                case MERGE_NOISE:
                    // Treat as cluster below
                    break;
            }
        }
        // Cluster center:
        final NumberVector center = centroids[i];
        assert (center != null);
        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
            NumberVector obj = rel.get(it);
            // a: Distance to own centroid
            double a = distance.distance(center, obj);
            // b: Distance to other clusters centroids:
            double min = Double.POSITIVE_INFINITY;
            Iterator<? extends Cluster<?>> cj = clusters.iterator();
            for (int j = 0; cj.hasNext(); j++) {
                Cluster<?> ocluster = cj.next();
                if (i == j) {
                    continue;
                }
                NumberVector other = centroids[j];
                if (other == null) {
                    // Noise!
                    switch(noiseOption) {
                        case IGNORE_NOISE:
                            continue;
                        case TREAT_NOISE_AS_SINGLETONS:
                            // Treat each object like a centroid!
                            for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
                                double dist = distance.distance(rel.get(it2), obj);
                                min = dist < min ? dist : min;
                            }
                            continue;
                        case MERGE_NOISE:
                            // Treat as cluster below, but should not be reachable.
                            break;
                    }
                }
                // Clusters: use centroid.
                double dist = distance.distance(other, obj);
                min = dist < min ? dist : min;
            }
            // One 'real' cluster only?
            min = min < Double.POSITIVE_INFINITY ? min : a;
            // Guard the 0/0 case (a = min = 0, e.g. an object located exactly
            // on its centroid with no other cluster), which would otherwise
            // put NaN into the mean.
            final double denom = min > a ? min : a;
            mssil.put(denom == 0. ? 0. : (min - a) / denom);
        }
    }
    double penalty = 1.;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
        penalty = (rel.size() - ignorednoise) / (double) rel.size();
    }
    final double meanssil = penalty * mssil.getMean();
    final double stdssil = penalty * mssil.getSampleStddev();
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
        LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
    }
    // Attach the measure to the evaluation result and notify listeners.
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
    db.getHierarchy().resultChanged(ev);
    return meanssil;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Aggregations

MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)61 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)32 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)17 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)17 DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax)15 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)13 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)9 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)9 MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)9 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)9 OutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta)9 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)8 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)8 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)7 Mean (de.lmu.ifi.dbs.elki.math.Mean)7 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)6 DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter)6 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)5 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)5 AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)5