use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.
the class P3C method run.
/**
 * Performs the P3C algorithm on the given Database.
 *
 * The method proceeds in ten reported steps: grid partitioning, marking of
 * non-uniform bins, construction of 1-signatures, merging to cluster cores,
 * pruning of redundant cores, EM refinement, hard assignment, outlier removal,
 * removal of too-small clusters, and result generation.
 *
 * @param database Database (not referenced by the body of this method)
 * @param relation Relation holding the vectors to cluster
 * @return Clustering with one top-level cluster per remaining cluster
 *         candidate, plus one additional top-level cluster holding the noise
 *         objects if there are any. If no cluster cores are found, a single
 *         cluster containing all objects is returned.
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  final int dim = RelationUtil.dimensionality(relation);

  // Overall progress. The total must match the number of beginStep() calls
  // below, which are numbered consecutively from 1 to 10.
  StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;

  if (stepProgress != null) {
    stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
  }

  // Desired number of bins, as per Sturge:
  final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));

  // Perform 1-dimensional projections, and split into bins.
  SetDBIDs[][] partitions = partitionData(relation, binCount);

  if (stepProgress != null) {
    stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
  }

  // Set markers for each attribute until they're all deemed uniform.
  final long[][] markers = new long[dim][];
  for (int d = 0; d < dim; d++) {
    final SetDBIDs[] parts = partitions[d];
    if (parts == null) {
      // Never mark any on constant dimensions.
      continue;
    }
    final long[] marked = markers[d] = BitsUtil.zero(binCount);
    int card = 0;
    // NOTE(review): the bound uses the dimensionality, whereas `marked` is
    // sized by binCount - confirm that limiting the number of marked bins
    // per attribute to dim - 1 is intended.
    while (card < dim - 1) {
      // Find bin with largest support, test only the dimensions that were not
      // previously marked.
      int bestBin = chiSquaredUniformTest(parts, marked, card);
      if (bestBin < 0) {
        // Uniform
        break;
      }
      BitsUtil.setI(marked, bestBin);
      card++;
    }
    if (LOG.isDebugging()) {
      LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
    }
  }

  if (stepProgress != null) {
    stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
  }

  ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);

  if (stepProgress != null) {
    stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
  }

  ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);

  if (stepProgress != null) {
    stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
  }

  clusterCores = pruneRedundantClusterCores(clusterCores);
  if (LOG.isVerbose()) {
    LOG.verbose("Number of cluster cores found: " + clusterCores.size());
  }

  if (clusterCores.isEmpty()) {
    // Nothing to refine: return all objects in a single cluster.
    LOG.setCompleted(stepProgress);
    Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
    c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
    return c;
  }

  if (stepProgress != null) {
    stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
  }

  // Track objects not assigned to any cluster:
  ModifiableDBIDs noise = DBIDUtil.newHashSet();
  WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  int k = clusterCores.size();
  List<MultivariateGaussianModel> models = new ArrayList<>(k);
  computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);

  // Initial estimate of covariances, to assign noise objects
  EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
  assignUnassigned(relation, probClusterIGivenX, models, noise);

  double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  // A negative maxEmIterations disables the iteration limit.
  for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
    final double emOld = emNew;
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    // reassign probabilities
    emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);

    if (LOG.isVerbose()) {
      LOG.verbose("iteration " + it + " - expectation value: " + emNew);
    }
    // Stop once the improvement no longer exceeds emDelta.
    if ((emNew - emOld) <= emDelta) {
      break;
    }
  }

  if (stepProgress != null) {
    stepProgress.beginStep(7, "Generating hard clustering.", LOG);
  }

  // Create a hard clustering, making sure each data point only is part of one
  // cluster, based on the best match from the membership matrix.
  ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());

  if (stepProgress != null) {
    stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
  }

  // Outlier detection. Remove points from clusters that have a Mahalanobis
  // distance larger than the critical value of the ChiSquare distribution.
  findOutliers(relation, models, clusterCandidates, noise);

  if (stepProgress != null) {
    stepProgress.beginStep(9, "Removing empty clusters.", LOG);
  }

  // Remove near-empty clusters; their remaining members become noise.
  for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
    ClusterCandidate cand = it.next();
    final int size = cand.ids.size();
    if (size < minClusterSize) {
      if (size > 0) {
        noise.addDBIDs(cand.ids);
      }
      it.remove();
    }
  }

  if (LOG.isVerbose()) {
    LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
  }

  if (stepProgress != null) {
    stepProgress.beginStep(10, "Generating final result.", LOG);
  }

  // Generate final output.
  Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
  for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
    ClusterCandidate candidate = clusterCandidates.get(cluster);
    // The mean of the final members is stored in the subspace model.
    CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
    result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
  }
  if (LOG.isVerbose()) {
    LOG.verbose("Noise size: " + noise.size());
  }
  if (noise.size() > 0) {
    result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
  }

  LOG.ensureCompleted(stepProgress);

  return result;
}
use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.
the class P3C method computeFuzzyMembership.
/**
 * Builds the initial fuzzy membership of every object from the cluster cores
 * that contain it, and creates one model per cluster core.
 *
 * An object contained in {@code c > 0} cores gets membership {@code 1/c} for
 * each of these cores; an object contained in no core gets an all-zero
 * membership vector and is added to {@code unassigned}. One
 * {@link MultivariateGaussianModel} per core is appended to {@code models},
 * constructed with the accumulated core weight and a fresh array of length
 * {@code dim}.
 *
 * @param relation Data relation
 * @param clusterCores the cluster cores.
 * @param unassigned set to which to add unassigned points.
 * @param probClusterIGivenX Membership probabilities.
 * @param models Cluster models.
 * @param dim Dimensionality
 */
private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, int dim) {
  final int k = clusterCores.size();
  // Each object contributes the same share to the cluster weights.
  final double pweight = 1. / relation.size();
  final double[] clusterWeights = new double[k];

  DBIDIter point = relation.iterDBIDs();
  while (point.valid()) {
    final double[] membership = new double[k];
    int hits = 0;
    int c = 0;
    for (Signature core : clusterCores) {
      if (core.ids.contains(point)) {
        membership[c] = 1.;
        hits++;
      }
      c++;
    }

    if (hits == 0) {
      // Not part of any cluster core: remember it for later assignment.
      unassigned.add(point);
    } else {
      // Normalize the membership, then add this object's share to the
      // weights of the matching cores.
      VMath.timesEquals(membership, 1. / hits);
      VMath.plusTimesEquals(clusterWeights, membership, pweight);
    }
    // The membership row is stored in either case.
    probClusterIGivenX.put(point, membership);
    point.advance();
  }

  for (double weight : clusterWeights) {
    models.add(new MultivariateGaussianModel(weight, new double[dim]));
  }
}
use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.
the class P3C method assignUnassigned.
/**
 * Assign unassigned objects to best candidate based on shortest Mahalanobis
 * distance.
 *
 * The model weights are first scaled by the fraction of objects that are
 * already assigned. Then each unassigned object gets a membership of 1 for
 * the model with the smallest Mahalanobis distance, and that model's weight
 * is increased by {@code 1 / relation.size()}. If no model yields a distance
 * below positive infinity (for example because all distances are NaN), the
 * membership and the weight increase are spread evenly over all models
 * instead of failing on an invalid index. Finally, {@code unassigned} is
 * cleared.
 *
 * @param relation Data relation
 * @param probClusterIGivenX fuzzy membership matrix.
 * @param models Cluster models.
 * @param unassigned the list of points not yet assigned.
 */
private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, ModifiableDBIDs unassigned) {
  if (unassigned.size() == 0) {
    return;
  }
  final int k = models.size();
  final int n = relation.size();
  final double pweight = 1. / n;
  // Number of objects that already have a cluster (loop invariant).
  final int assigned = n - unassigned.size();
  // Rescale weights, to take unassigned points into account:
  for (EMClusterModel<?> m : models) {
    m.setWeight(m.getWeight() * assigned * pweight);
  }

  // Assign noise objects, increase weights accordingly.
  for (DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
    // Find the best matching known cluster core using the Mahalanobis
    // distance.
    V v = relation.get(iter);
    int bestCluster = -1;
    MultivariateGaussianModel bestModel = null;
    double minDistance = Double.POSITIVE_INFINITY;
    int c = 0;
    for (MultivariateGaussianModel model : models) {
      final double distance = model.mahalanobisDistance(v);
      // NaN and infinite distances never satisfy this comparison.
      if (distance < minDistance) {
        minDistance = distance;
        bestCluster = c;
        bestModel = model;
      }
      c++;
    }

    double[] weights = new double[k];
    if (bestModel != null) {
      // Assign to best core.
      weights[bestCluster] = 1.;
      bestModel.setWeight(bestModel.getWeight() + pweight);
    } else if (k > 0) {
      // No usable distance for any model: share the object evenly rather
      // than indexing with -1 / dereferencing null.
      final double share = 1. / k;
      int i = 0;
      for (MultivariateGaussianModel model : models) {
        weights[i++] = share;
        model.setWeight(model.getWeight() + pweight * share);
      }
    }
    probClusterIGivenX.put(iter, weights);
  }

  // Clear the list of unassigned objects.
  unassigned.clear();
}
use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.
the class P3C method findOutliers.
/**
 * Moves outliers out of the cluster candidates and into the noise set.
 *
 * The i-th model is paired with the i-th cluster candidate. For each pair, the
 * threshold is the {@code 1 - alpha} quantile of the ChiSquared distribution
 * whose degrees of freedom equal the cardinality of the candidate's
 * {@code dimensions}. Every member whose Mahalanobis distance to the model is
 * at least this threshold is added to {@code noise} and removed from the
 * candidate.
 *
 * @param relation Data relation
 * @param models Cluster models
 * @param clusterCandidates the list of clusters to check.
 * @param noise the set to which to add points deemed outliers.
 */
private void findOutliers(Relation<V> relation, List<MultivariateGaussianModel> models, ArrayList<ClusterCandidate> clusterCandidates, ModifiableDBIDs noise) {
  int index = 0;
  for (MultivariateGaussianModel gaussian : models) {
    // Models and candidates are matched by position.
    final ClusterCandidate current = clusterCandidates.get(index++);
    final int degreesOfFreedom = BitsUtil.cardinality(current.dimensions);
    final double criticalValue = ChiSquaredDistribution.quantile(1 - alpha, degreesOfFreedom);

    DBIDMIter member = current.ids.iter();
    while (member.valid()) {
      if (model_distance(gaussian, relation, member) >= criticalValue) {
        // Outlier: record it as noise first, then drop it from the cluster.
        noise.add(member);
        member.remove();
      }
      member.advance();
    }
  }
}

/**
 * Mahalanobis distance of the object referenced by {@code member} to the
 * given model.
 */
private double model_distance(MultivariateGaussianModel gaussian, Relation<V> relation, DBIDMIter member) {
  return gaussian.mahalanobisDistance(relation.get(member));
}
Aggregations