Use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.
The class CTLuMedianMultipleAttributes, method run.
/**
 * Run the algorithm
 *
 * @param database Database
 * @param spatial Spatial relation
 * @param attributes Attributes relation
 * @return Outlier detection result
 */
public OutlierResult run(Database database, Relation<N> spatial, Relation<O> attributes) {
  final int dim = RelationUtil.dimensionality(attributes);
  if (LOG.isDebugging()) {
    LOG.debug("Dimensionality: " + dim);
  }
  final NeighborSetPredicate npred = getNeighborSetPredicateFactory().instantiate(database, spatial);
  CovarianceMatrix covmaker = new CovarianceMatrix(dim);
  WritableDataStore<double[]> deltas = DataStoreUtil.makeStorage(attributes.getDBIDs(), DataStoreFactory.HINT_TEMP, double[].class);
  for (DBIDIter iditer = attributes.iterDBIDs(); iditer.valid(); iditer.advance()) {
    final O obj = attributes.get(iditer);
    final DBIDs neighbors = npred.getNeighborDBIDs(iditer);
    // Compute the median vector
    final double[] median = new double[dim];
    {
      double[][] data = new double[dim][neighbors.size()];
      int i = 0;
      // Load data
      for (DBIDIter iter = neighbors.iter(); iter.valid(); iter.advance()) {
        // TODO: skip object itself within neighbors?
        O nobj = attributes.get(iter);
        for (int d = 0; d < dim; d++) {
          data[d][i] = nobj.doubleValue(d);
        }
        i++;
      }
      for (int d = 0; d < dim; d++) {
        median[d] = QuickSelect.median(data[d]);
      }
    }
    // Delta vector "h"
    double[] delta = minusEquals(obj.toArray(), median);
    deltas.put(iditer, delta);
    covmaker.put(delta);
  }
  // Finalize covariance matrix:
  double[] mean = covmaker.getMeanVector();
  double[][] cmati = inverse(covmaker.destroyToSampleMatrix());
  DoubleMinMax minmax = new DoubleMinMax();
  WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(attributes.getDBIDs(), DataStoreFactory.HINT_STATIC);
  for (DBIDIter iditer = attributes.iterDBIDs(); iditer.valid(); iditer.advance()) {
    final double score = mahalanobisDistance(cmati, deltas.get(iditer), mean);
    minmax.put(score);
    scores.putDouble(iditer, score);
  }
  DoubleRelation scoreResult = new MaterializedDoubleRelation("Median multiple attributes outlier", "median-outlier", scores, attributes.getDBIDs());
  OutlierScoreMeta scoreMeta = new BasicOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 0);
  OutlierResult or = new OutlierResult(scoreMeta, scoreResult);
  or.addChildResult(npred);
  return or;
}
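The pattern above (accumulate per-object delta vectors, then finalize once) is the core CovarianceMatrix idiom. Below is a minimal, self-contained sketch of it on toy data; it assumes inverse() and mahalanobisDistance() are the static helpers from de.lmu.ifi.dbs.elki.math.linearalgebra.VMath, as the static imports in the snippet above suggest.

import static de.lmu.ifi.dbs.elki.math.linearalgebra.VMath.inverse;
import static de.lmu.ifi.dbs.elki.math.linearalgebra.VMath.mahalanobisDistance;

import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;

public class CovarianceMatrixSketch {
  public static void main(String[] args) {
    // Toy delta vectors, standing in for the "h" vectors above.
    double[][] deltas = { { 1.0, 2.0 }, { 2.0, 1.5 }, { 0.5, 3.0 }, { 1.5, 2.5 } };
    CovarianceMatrix covmaker = new CovarianceMatrix(2);
    for (double[] d : deltas) {
      covmaker.put(d); // incremental accumulation, no second pass needed
    }
    double[] mean = covmaker.getMeanVector();
    // destroyToSampleMatrix() finalizes and invalidates covmaker (hence "destroy"):
    double[][] cmati = inverse(covmaker.destroyToSampleMatrix());
    for (double[] d : deltas) {
      System.out.println(mahalanobisDistance(cmati, d, mean));
    }
  }
}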
Use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.
The class GlobalPrincipalComponentAnalysisTransform, method prepareStart.
@Override
protected boolean prepareStart(SimpleTypeInformation<O> in) {
  if (!(in instanceof VectorFieldTypeInformation)) {
    throw new AbortException("PCA can only be applied to fixed dimensionality vectors");
  }
  dim = ((VectorFieldTypeInformation<?>) in).getDimensionality();
  covmat = new CovarianceMatrix(dim);
  return true;
}
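prepareStart() only sizes the accumulator; the vectors are fed in the later prepare steps, after which the covariance yields the PCA projection. A rough sketch of that continuation, assuming ELKI's EigenvalueDecomposition(double[][]) constructor and getV() accessor from the same linearalgebra package (illustrative only, not the class's actual prepareComplete() code):

import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;
import de.lmu.ifi.dbs.elki.math.linearalgebra.EigenvalueDecomposition;

public class PcaBasisSketch {
  // Derive a PCA projection basis from the accumulated covariance.
  static double[][] pcaBasis(Iterable<? extends NumberVector> vectors, int dim) {
    CovarianceMatrix covmat = new CovarianceMatrix(dim);
    for (NumberVector vec : vectors) {
      covmat.put(vec); // NumberVector overload of put()
    }
    // Population covariance: the transform sees the complete data set.
    double[][] cov = covmat.destroyToPopulationMatrix();
    // Eigenvectors (columns of V) span the principal components.
    return new EigenvalueDecomposition(cov).getV();
  }
}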
Use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.
The class P3C, method run.
/**
 * Performs the P3C algorithm on the given Database.
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  final int dim = RelationUtil.dimensionality(relation);
  // Overall progress.
  StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
  if (stepProgress != null) {
    stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
  }
  // Desired number of bins, as per Sturges' rule:
  final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
  // Perform 1-dimensional projections, and split into bins.
  SetDBIDs[][] partitions = partitionData(relation, binCount);
  if (stepProgress != null) {
    stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
  }
  // Set markers for each attribute until they're all deemed uniform.
  final long[][] markers = new long[dim][];
  for (int d = 0; d < dim; d++) {
    final SetDBIDs[] parts = partitions[d];
    if (parts == null) {
      // Never mark any on constant dimensions.
      continue;
    }
    final long[] marked = markers[d] = BitsUtil.zero(binCount);
    int card = 0;
    while (card < dim - 1) {
      // Find the bin with the largest support; test only the bins that were
      // not previously marked.
      int bestBin = chiSquaredUniformTest(parts, marked, card);
      if (bestBin < 0) {
        // Uniform
        break;
      }
      BitsUtil.setI(marked, bestBin);
      card++;
    }
    if (LOG.isDebugging()) {
      LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
    }
  }
  if (stepProgress != null) {
    stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
  }
  ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
  if (stepProgress != null) {
    stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
  }
  ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
  if (stepProgress != null) {
    stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
  }
  clusterCores = pruneRedundantClusterCores(clusterCores);
  if (LOG.isVerbose()) {
    LOG.verbose("Number of cluster cores found: " + clusterCores.size());
  }
  if (clusterCores.isEmpty()) {
    LOG.setCompleted(stepProgress);
    Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
    c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
    return c;
  }
  if (stepProgress != null) {
    stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
  }
  // Track objects not assigned to any cluster:
  ModifiableDBIDs noise = DBIDUtil.newHashSet();
  WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  int k = clusterCores.size();
  List<MultivariateGaussianModel> models = new ArrayList<>(k);
  computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
  // Initial estimate of covariances, to assign noise objects
  EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
  assignUnassigned(relation, probClusterIGivenX, models, noise);
  double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
    final double emOld = emNew;
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    // Reassign probabilities
    emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    if (LOG.isVerbose()) {
      LOG.verbose("iteration " + it + " - expectation value: " + emNew);
    }
    if ((emNew - emOld) <= emDelta) {
      break;
    }
  }
  if (stepProgress != null) {
    stepProgress.beginStep(7, "Generating hard clustering.", LOG);
  }
  // Create a hard clustering, making sure each data point is part of only one
  // cluster, based on the best match from the membership matrix.
  ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
  if (stepProgress != null) {
    stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
  }
  // Outlier detection: remove points from clusters that have a Mahalanobis
  // distance larger than the critical value of the chi-squared distribution.
  findOutliers(relation, models, clusterCandidates, noise);
  if (stepProgress != null) {
    stepProgress.beginStep(9, "Removing empty clusters.", LOG);
  }
  // Remove near-empty clusters.
  for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext();) {
    ClusterCandidate cand = it.next();
    final int size = cand.ids.size();
    if (size < minClusterSize) {
      if (size > 0) {
        noise.addDBIDs(cand.ids);
      }
      it.remove();
    }
  }
  if (LOG.isVerbose()) {
    LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
  }
  if (stepProgress != null) {
    stepProgress.beginStep(10, "Generating final result.", LOG);
  }
  // Generate final output.
  Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
  for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
    ClusterCandidate candidate = clusterCandidates.get(cluster);
    CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
    result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
  }
  if (LOG.isVerbose()) {
    LOG.verbose("Noise size: " + noise.size());
  }
  if (noise.size() > 0) {
    result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
  }
  LOG.ensureCompleted(stepProgress);
  return result;
}
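A quick numeric check of the Sturges bin-count rule from step 1 (plain Java with a hypothetical relation size, not ELKI code):

public class SturgesCheck {
  public static void main(String[] args) {
    int n = 1000; // hypothetical relation size
    // Sturges' rule as used in step 1: ceil(1 + log2(n))
    int binCount = (int) Math.ceil(1 + Math.log(n) / Math.log(2));
    System.out.println(binCount); // log2(1000) ≈ 9.97, so this prints 11
  }
}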
Use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.
The class EvaluateRankingQuality, method run.
@Override
public HistogramResult run(Database database) {
  final Relation<V> relation = database.getRelation(getInputTypeRestriction()[0]);
  final DistanceQuery<V> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
  final KNNQuery<V> knnQuery = database.getKNNQuery(distQuery, relation.size());
  if (LOG.isVerbose()) {
    LOG.verbose("Preprocessing clusters...");
  }
  // Cluster by labels
  Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
  // Compute cluster averages and covariance matrices
  HashMap<Cluster<?>, double[]> averages = new HashMap<>(split.size());
  HashMap<Cluster<?>, double[][]> covmats = new HashMap<>(split.size());
  for (Cluster<?> clus : split) {
    CovarianceMatrix covmat = CovarianceMatrix.make(relation, clus.getIDs());
    averages.put(clus, covmat.getMeanVector());
    covmats.put(clus, covmat.destroyToPopulationMatrix());
  }
  MeanVarianceStaticHistogram hist = new MeanVarianceStaticHistogram(numbins, 0.0, 1.0);
  if (LOG.isVerbose()) {
    LOG.verbose("Processing points...");
  }
  FiniteProgress rocloop = LOG.isVerbose() ? new FiniteProgress("Computing ROC AUC values", relation.size(), LOG) : null;
  ROCEvaluation roc = new ROCEvaluation();
  // Sort cluster members by distance to the cluster center
  for (Cluster<?> clus : split) {
    ModifiableDoubleDBIDList cmem = DBIDUtil.newDistanceDBIDList(clus.size());
    double[] av = averages.get(clus);
    double[][] covm = covmats.get(clus);
    for (DBIDIter iter = clus.getIDs().iter(); iter.valid(); iter.advance()) {
      double d = mahalanobisDistance(covm, relation.get(iter).toArray(), av);
      cmem.add(d, iter);
    }
    cmem.sort();
    for (DBIDArrayIter it = cmem.iter(); it.valid(); it.advance()) {
      KNNList knn = knnQuery.getKNNForDBID(it, relation.size());
      double result = EvaluateClustering.evaluateRanking(roc, clus, knn);
      hist.put(((double) it.getOffset()) / clus.size(), result);
      LOG.incrementProcessed(rocloop);
    }
  }
  LOG.ensureCompleted(rocloop);
  // Transform the histogram into a collection of double arrays.
  Collection<double[]> res = new ArrayList<>(relation.size());
  for (ObjHistogram.Iter<MeanVariance> iter = hist.iter(); iter.valid(); iter.advance()) {
    res.add(new double[] { iter.getCenter(), iter.getValue().getCount(), iter.getValue().getMean(), iter.getValue().getSampleVariance() });
  }
  return new HistogramResult("Ranking Quality Histogram", "ranking-histogram", res);
}
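Unlike the incremental usage in the first example, this method and P3C use the CovarianceMatrix.make(relation, ids) factory to build the mean and covariance for a fixed ID set in one call. A minimal sketch of that path (import paths as in ELKI 0.7.x; illustrative only):

import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;
import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;

public class ClusterModelSketch {
  // Center and population covariance for one cluster, as in the loop above.
  static double[][] clusterCovariance(Relation<? extends NumberVector> relation, DBIDs ids) {
    CovarianceMatrix covmat = CovarianceMatrix.make(relation, ids);
    double[] center = covmat.getMeanVector(); // kept as "averages" above
    // Population normalization: the cluster members are the whole population.
    return covmat.destroyToPopulationMatrix();
  }
}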
Use of de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix in project elki by elki-project.
The class RANSACCovarianceMatrixBuilder, method processIds.
@Reference(//
    title = "Random sample consensus: a paradigm for model fitting with applications to image analysis and automated cartography", //
    authors = "M.A. Fischler, R.C. Bolles", //
    booktitle = "Communications of the ACM, Vol. 24 Issue 6", //
    url = "http://dx.doi.org/10.1145/358669.358692")
@Override
public double[][] processIds(DBIDs ids, Relation<? extends NumberVector> relation) {
  final int dim = RelationUtil.dimensionality(relation);
  ModifiableDBIDs best = DBIDUtil.newHashSet(), support = DBIDUtil.newHashSet();
  double thresh = ChiSquaredDistribution.quantile(0.85, dim);
  CovarianceMatrix cv = new CovarianceMatrix(dim);
  Random random = rnd.getSingleThreadedRandom();
  for (int i = 0; i < iterations; i++) {
    DBIDs sample = DBIDUtil.randomSample(ids, dim + 1, random);
    cv.reset();
    for (DBIDIter it = sample.iter(); it.valid(); it.advance()) {
      cv.put(relation.get(it));
    }
    double[] centroid = cv.getMeanVector();
    double[][] p = inverse(cv.destroyToSampleMatrix());
    support.clear();
    for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
      double[] vec = minusEquals(relation.get(id).toArray(), centroid);
      double sqlen = transposeTimesTimes(vec, p, vec);
      if (sqlen < thresh) {
        support.add(id);
      }
    }
    if (support.size() > best.size()) {
      ModifiableDBIDs swap = best;
      best = support;
      support = swap;
    }
    if (best.size() >= ids.size()) {
      // Can't get better than this!
      break;
    }
  }
  // Fall back to regular PCA on all ids if the consensus set is too small:
  // fewer than dim + 1 points would give a singular covariance matrix.
  return CovarianceMatrix.make(relation, best.size() > dim ? best : ids).destroyToSampleMatrix();
}
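A closing note on the two finalizers seen across these examples: destroyToSampleMatrix() and destroyToPopulationMatrix() should differ only in normalization, by n - 1 versus n, as their names suggest (treat the exact semantics as an assumption based on standard estimator naming). A toy one-dimensional check:

import de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix;

public class FinalizerCheck {
  public static void main(String[] args) {
    // Values 1, 3, 5: mean 3, summed squared deviation 8.
    CovarianceMatrix cm = new CovarianceMatrix(1);
    cm.put(new double[] { 1.0 });
    cm.put(new double[] { 3.0 });
    cm.put(new double[] { 5.0 });
    // Sample variance: 8 / (3 - 1) = 4.0; population variance: 8 / 3 ≈ 2.67.
    // Only one finalizer may be called, since both destroy the accumulator:
    System.out.println(cm.destroyToSampleMatrix()[0][0]); // expected: 4.0
  }
}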