Search in sources:

Example 6 with CovarianceMatrix

use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.

the class CTLuMedianMultipleAttributes method run.

/**
 * Scores every object by how far its attribute vector deviates from the
 * per-dimension median of its spatial neighbors.
 * <p>
 * For each object a delta vector (object attributes combined with the
 * neighborhood median via {@code minusEquals}) is stored and fed into a
 * covariance accumulator. The score of an object is then the result of
 * {@code mahalanobisDistance} on its delta, using the mean of all deltas and
 * the inverted sample covariance matrix of the deltas.
 *
 * @param database Database
 * @param spatial Spatial relation
 * @param attributes Attributes relation
 * @return Outlier detection result
 */
public OutlierResult run(Database database, Relation<N> spatial, Relation<O> attributes) {
    final int dim = RelationUtil.dimensionality(attributes);
    if (LOG.isDebugging()) {
        LOG.debug("Dimensionality: " + dim);
    }
    final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, spatial);
    final CovarianceMatrix covariance = new CovarianceMatrix(dim);
    final WritableDataStore<double[]> deltaStore = DataStoreUtil.makeStorage(attributes.getDBIDs(), DataStoreFactory.HINT_TEMP, double[].class);

    // First pass: delta vector per object, accumulated into the covariance.
    for (DBIDIter objIt = attributes.iterDBIDs(); objIt.valid(); objIt.advance()) {
        final O current = attributes.get(objIt);
        final DBIDs neighborhood = npred.getNeighborDBIDs(objIt);

        // columns[d] holds attribute d of every neighbor, so that the median
        // can be selected independently per dimension.
        final double[][] columns = new double[dim][neighborhood.size()];
        int column = 0;
        for (DBIDIter nIt = neighborhood.iter(); nIt.valid(); nIt.advance()) {
            // TODO: skip object itself within neighbors?
            final O neighbor = attributes.get(nIt);
            for (int d = 0; d < dim; d++) {
                columns[d][column] = neighbor.doubleValue(d);
            }
            column++;
        }
        final double[] medianVector = new double[dim];
        for (int d = 0; d < dim; d++) {
            medianVector[d] = QuickSelect.median(columns[d]);
        }

        // Delta vector "h"
        final double[] h = minusEquals(current.toArray(), medianVector);
        deltaStore.put(objIt, h);
        covariance.put(h);
    }

    // The mean must be read before the accumulator is destroyed.
    final double[] deltaMean = covariance.getMeanVector();
    final double[][] inverseCovariance = inverse(covariance.destroyToSampleMatrix());

    // Second pass: score every object and track the observed score range.
    final DoubleMinMax range = new DoubleMinMax();
    final WritableDoubleDataStore scoreStore = DataStoreUtil.makeDoubleStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC);
    for (DBIDIter objIt = attributes.iterDBIDs(); objIt.valid(); objIt.advance()) {
        final double value = mahalanobisDistance(inverseCovariance, deltaStore.get(objIt), deltaMean);
        range.put(value);
        scoreStore.putDouble(objIt, value);
    }

    final DoubleRelation scoreRelation = new MaterializedDoubleRelation("Median multiple attributes outlier", "median-outlier", scoreStore, attributes.getDBIDs());
    final OutlierScoreMeta meta = new BasicOutlierScoreMeta(range.getMin(), range.getMax(), 0.0, Double.POSITIVE_INFINITY, 0);
    final OutlierResult outlierResult = new OutlierResult(meta, scoreRelation);
    outlierResult.addChildResult(npred);
    return outlierResult;
}
Also used : WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) NeighborSetPredicate(de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.NeighborSetPredicate) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) BasicOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) BasicOutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)

Example 7 with CovarianceMatrix

use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.

the class GlobalPrincipalComponentAnalysisTransform method prepareStart.

/**
 * Prepares the transform for a new input type: records the vector
 * dimensionality and creates a fresh covariance accumulator of that size.
 *
 * @param in Type information of the incoming data
 * @return always {@code true}
 * @throws AbortException if the input is not a fixed-dimensionality vector
 *         field type
 */
@Override
protected boolean prepareStart(SimpleTypeInformation<O> in) {
    if (in instanceof VectorFieldTypeInformation) {
        final VectorFieldTypeInformation<?> vectorType = (VectorFieldTypeInformation<?>) in;
        dim = vectorType.getDimensionality();
        covmat = new CovarianceMatrix(dim);
        return true;
    }
    throw new AbortException("PCA can only applied to fixed dimensionality vectors");
}
Also used : VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix)

Example 8 with CovarianceMatrix

use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.

the class P3C method run.

/**
 * Performs the P3C algorithm on the given Database.
 * <p>
 * The method proceeds in ten reported steps: grid partitioning, marking of
 * non-uniform bins, merging marked bins to 1-signatures, computing cluster
 * cores, pruning redundant cores, EM refinement, hard clustering, outlier
 * removal, removal of too-small clusters, and result generation. If no
 * cluster cores are found, a single noise cluster containing all objects is
 * returned.
 *
 * @param database Database (not referenced directly in this method)
 * @param relation Relation of vectors to cluster
 * @return Clustering with one subspace cluster per remaining candidate, plus
 *         a noise cluster if any objects remained unassigned
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    // Overall progress. The total must match the highest step number used
    // below (steps are numbered 1..10, each number used exactly once).
    StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
    if (stepProgress != null) {
        stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
    }
    // Desired number of bins, as per Sturge:
    final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
    // Perform 1-dimensional projections, and split into bins.
    SetDBIDs[][] partitions = partitionData(relation, binCount);
    if (stepProgress != null) {
        stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
    }
    // Set markers for each attribute until they're all deemed uniform.
    final long[][] markers = new long[dim][];
    for (int d = 0; d < dim; d++) {
        final SetDBIDs[] parts = partitions[d];
        if (parts == null) {
            // Never mark any on constant dimensions.
            continue;
        }
        final long[] marked = markers[d] = BitsUtil.zero(binCount);
        int card = 0;
        // NOTE(review): card counts marked bins, yet the bound is derived from
        // dim rather than binCount - confirm this is intended.
        while (card < dim - 1) {
            // Find bin with largest support, test only the dimensions that were not
            // previously marked.
            int bestBin = chiSquaredUniformTest(parts, marked, card);
            if (bestBin < 0) {
                // Uniform
                break;
            }
            BitsUtil.setI(marked, bestBin);
            card++;
        }
        if (LOG.isDebugging()) {
            LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
    }
    ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
    if (stepProgress != null) {
        stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
    }
    ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
    }
    clusterCores = pruneRedundantClusterCores(clusterCores);
    if (LOG.isVerbose()) {
        LOG.verbose("Number of cluster cores found: " + clusterCores.size());
    }
    if (clusterCores.isEmpty()) {
        // Nothing to refine: report everything as a single noise cluster.
        LOG.setCompleted(stepProgress);
        Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
        c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
        return c;
    }
    if (stepProgress != null) {
        stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
    }
    // Track objects not assigned to any cluster:
    ModifiableDBIDs noise = DBIDUtil.newHashSet();
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    int k = clusterCores.size();
    List<MultivariateGaussianModel> models = new ArrayList<>(k);
    computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
    // Initial estimate of covariances, to assign noise objects
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    assignUnassigned(relation, probClusterIGivenX, models, noise);
    double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    // A negative maxEmIterations means "iterate until convergence".
    for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
        final double emOld = emNew;
        EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
        // reassign probabilities
        emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isVerbose()) {
            LOG.verbose("iteration " + it + " - expectation value: " + emNew);
        }
        if ((emNew - emOld) <= emDelta) {
            break;
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(7, "Generating hard clustering.", LOG);
    }
    // Create a hard clustering, making sure each data point only is part of one
    // cluster, based on the best match from the membership matrix.
    ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
    if (stepProgress != null) {
        stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
    }
    // Outlier detection. Remove points from clusters that have a Mahalanobis
    // distance larger than the critical value of the ChiSquare distribution.
    findOutliers(relation, models, clusterCandidates, noise);
    if (stepProgress != null) {
        stepProgress.beginStep(9, "Removing empty clusters.", LOG);
    }
    // Remove near-empty clusters; their members (if any) become noise.
    for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
        ClusterCandidate cand = it.next();
        final int size = cand.ids.size();
        if (size < minClusterSize) {
            if (size > 0) {
                noise.addDBIDs(cand.ids);
            }
            it.remove();
        }
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
    }
    if (stepProgress != null) {
        stepProgress.beginStep(10, "Generating final result.", LOG);
    }
    // Generate final output.
    Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
    for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
        ClusterCandidate candidate = clusterCandidates.get(cluster);
        // Only the mean vector of the covariance accumulator is used here.
        CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
        result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Noise size: " + noise.size());
    }
    if (noise.size() > 0) {
        result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
    }
    LOG.ensureCompleted(stepProgress);
    return result;
}
Also used : ArrayList(java.util.ArrayList) MultivariateGaussianModel(de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 9 with CovarianceMatrix

use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.

the class EvaluateRankingQuality method run.

/**
 * Evaluates ranking quality per cluster and aggregates it into a histogram.
 * <p>
 * Objects are clustered by label; for every cluster the mean vector and
 * population covariance matrix are computed. Cluster members are then sorted
 * by the value of {@code mahalanobisDistance} to the cluster mean, a full kNN
 * ranking is evaluated for each member, and the result is recorded in a
 * histogram keyed by the member's relative position within its cluster.
 *
 * @param database Database to process
 * @return Histogram rows of the form {bin center, count, mean, sample
 *         variance}
 */
@Override
public HistogramResult run(Database database) {
    final Relation<V> relation = database.getRelation(getInputTypeRestriction()[0]);
    final DistanceQuery<V> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<V> knnQuery = database.getKNNQuery(distQuery, relation.size());

    if (LOG.isVerbose()) {
        LOG.verbose("Preprocessing clusters...");
    }
    // Cluster by labels
    final Collection<Cluster<Model>> clusters = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();

    // Per-cluster mean vector and covariance matrix. The mean is fetched
    // first, because the accumulator is destroyed when producing the matrix.
    final HashMap<Cluster<?>, double[]> means = new HashMap<>(clusters.size());
    final HashMap<Cluster<?>, double[][]> covariances = new HashMap<>(clusters.size());
    for (Cluster<?> cluster : clusters) {
        final CovarianceMatrix accumulator = CovarianceMatrix.make(relation, cluster.getIDs());
        means.put(cluster, accumulator.getMeanVector());
        covariances.put(cluster, accumulator.destroyToPopulationMatrix());
    }

    final MeanVarianceStaticHistogram histogram = new MeanVarianceStaticHistogram(numbins, 0.0, 1.0);
    if (LOG.isVerbose()) {
        LOG.verbose("Processing points...");
    }
    final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null;
    final ROCEvaluation roc = new ROCEvaluation();

    for (Cluster<?> cluster : clusters) {
        final double[] mean = means.get(cluster);
        final double[][] covariance = covariances.get(cluster);

        // Order the cluster members by their distance to the cluster mean.
        // NOTE(review): the covariance matrix itself (not its inverse) is
        // passed to mahalanobisDistance here - confirm this is intended.
        final ModifiableDoubleDBIDList members = DBIDUtil.newDistanceDBIDList(cluster.size());
        for (DBIDIter member = cluster.getIDs().iter(); member.valid(); member.advance()) {
            final double[] vector = relation.get(member).toArray();
            members.add(mahalanobisDistance(covariance, vector, mean), member);
        }
        members.sort();

        // Evaluate the complete neighbor ranking of each member.
        for (DBIDArrayIter ranked = members.iter(); ranked.valid(); ranked.advance()) {
            final KNNList neighbors = knnQuery.getKNNForDBID(ranked, relation.size());
            final double quality = EvaluateClustering.evaluateRanking(roc, cluster, neighbors);
            // Relative position of the member within its sorted cluster.
            final double relativeRank = ((double) ranked.getOffset()) / cluster.size();
            histogram.put(relativeRank, quality);
            LOG.incrementProcessed(progress);
        }
    }
    LOG.ensureCompleted(progress);

    // Transform Histogram into a Double Vector array.
    final Collection<double[]> rows = new ArrayList<>(relation.size());
    for (ObjHistogram.Iter<MeanVariance> bin = histogram.iter(); bin.valid(); bin.advance()) {
        final double center = bin.getCenter();
        final double count = bin.getValue().getCount();
        final double binMean = bin.getValue().getMean();
        final double binVariance = bin.getValue().getSampleVariance();
        rows.add(new double[] { center, count, binMean, binVariance });
    }
    return new HistogramResult("Ranking Quality Histogram", "ranking-histogram", rows);
}
Also used : ObjHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram) HistogramResult(de.lmu.ifi.dbs.elki.result.HistogramResult) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ROCEvaluation(de.lmu.ifi.dbs.elki.evaluation.scores.ROCEvaluation) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) MeanVarianceStaticHistogram(de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.MeanVarianceStaticHistogram) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) ByLabelOrAllInOneClustering(de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList)

Example 10 with CovarianceMatrix

use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.

the class RANSACCovarianceMatrixBuilder method processIds.

/**
 * Builds a covariance matrix from the largest consensus set found by random
 * sampling.
 * <p>
 * In each iteration, {@code dim + 1} objects are sampled, their centroid and
 * inverted sample covariance matrix are computed, and all objects whose
 * squared form {@code transposeTimesTimes(vec, p, vec)} falls below the 0.85
 * chi-squared quantile are collected as support. The largest support set
 * seen so far is kept. The final matrix is computed from that set, or from
 * all ids if the best set has no more than {@code dim} members.
 *
 * @param ids Objects to process
 * @param relation Relation holding the vectors
 * @return Sample covariance matrix of the selected objects
 */
@Reference(
    title = "Random sample consensus: a paradigm for model fitting with applications to image analysis and automated cartography",
    authors = "M.A. Fischler, R.C. Bolles",
    booktitle = "Communications of the ACM, Vol. 24 Issue 6", url = "http://dx.doi.org/10.1145/358669.358692")
@Override
public double[][] processIds(DBIDs ids, Relation<? extends NumberVector> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    ModifiableDBIDs best = DBIDUtil.newHashSet(), support = DBIDUtil.newHashSet();
    final double threshold = ChiSquaredDistribution.quantile(0.85, dim);
    CovarianceMatrix cv = new CovarianceMatrix(dim);
    Random random = rnd.getSingleThreadedRandom();
    for (int i = 0; i < iterations; i++) {
        DBIDs sample = DBIDUtil.randomSample(ids, dim + 1, random);
        cv.reset();
        for (DBIDIter it = sample.iter(); it.valid(); it.advance()) {
            cv.put(relation.get(it));
        }
        // The centroid must be read before the accumulator is destroyed.
        double[] centroid = cv.getMeanVector();
        double[][] p = inverse(cv.destroyToSampleMatrix());
        support.clear();
        for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
            double[] vec = minusEquals(relation.get(id).toArray(), centroid);
            double sqlen = transposeTimesTimes(vec, p, vec);
            if (sqlen < threshold) {
                support.add(id);
            }
        }
        if (support.size() > best.size()) {
            // Keep the larger set; recycle the old best as scratch space.
            ModifiableDBIDs swap = best;
            best = support;
            support = swap;
        }
        // After a swap, "support" refers to the previous best set, so the
        // early exit has to look at "best" to see the current maximum.
        if (best.size() >= ids.size()) {
            // Can't get better than this!
            break;
        }
    }
    // Fall back to regular PCA if too few samples.
    return CovarianceMatrix.make(relation, best.size() > dim ? best : ids).destroyToSampleMatrix();
}
Also used : Random(java.util.Random) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Reference(de.lmu.ifi.dbs.elki.utilities.documentation.Reference)

Aggregations

CovarianceMatrix (de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix)15 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)8 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)4 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)4 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)4 MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)4 DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax)4 BasicOutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.BasicOutlierScoreMeta)4 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)4 OutlierScoreMeta (de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta)4 NeighborSetPredicate (de.lmu.ifi.dbs.elki.algorithm.outlier.spatial.neighborhood.NeighborSetPredicate)3 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)3 AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)3 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)3 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)3 LUDecomposition (de.lmu.ifi.dbs.elki.math.linearalgebra.LUDecomposition)3 ArrayList (java.util.ArrayList)3 Test (org.junit.Test)3 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)2 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)2