use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class P3C method run.
/**
* Performs the P3C algorithm on the given Database.
*
* @param database Database to process
* @param relation Relation to process
* @return the clustering result
*/
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
final int dim = RelationUtil.dimensionality(relation);
// Overall progress.
StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
if (stepProgress != null) {
stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
}
// Desired number of bins, as per Sturges' rule:
final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
// Perform 1-dimensional projections, and split into bins.
SetDBIDs[][] partitions = partitionData(relation, binCount);
if (stepProgress != null) {
stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
}
// Set markers for each attribute until they're all deemed uniform.
final long[][] markers = new long[dim][];
for (int d = 0; d < dim; d++) {
final SetDBIDs[] parts = partitions[d];
if (parts == null) {
// Never mark any on constant dimensions.
continue;
}
final long[] marked = markers[d] = BitsUtil.zero(binCount);
int card = 0;
while (card < binCount - 1) {
// Find the bin with the largest support; test only the bins that were not
// previously marked. At least one bin must remain unmarked.
int bestBin = chiSquaredUniformTest(parts, marked, card);
if (bestBin < 0) {
// Uniform
break;
}
BitsUtil.setI(marked, bestBin);
card++;
}
if (LOG.isDebugging()) {
LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
}
}
if (stepProgress != null) {
stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
}
ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
if (stepProgress != null) {
stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
}
ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
if (stepProgress != null) {
stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
}
clusterCores = pruneRedundantClusterCores(clusterCores);
if (LOG.isVerbose()) {
LOG.verbose("Number of cluster cores found: " + clusterCores.size());
}
if (clusterCores.isEmpty()) {
LOG.setCompleted(stepProgress);
Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
return c;
}
if (stepProgress != null) {
stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
}
// Track objects not assigned to any cluster:
ModifiableDBIDs noise = DBIDUtil.newHashSet();
WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
int k = clusterCores.size();
List<MultivariateGaussianModel> models = new ArrayList<>(k);
computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
// Initial estimate of covariances, to assign noise objects
EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
assignUnassigned(relation, probClusterIGivenX, models, noise);
double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
final double emOld = emNew;
EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
// reassign probabilities
emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
if (LOG.isVerbose()) {
LOG.verbose("iteration " + it + " - expectation value: " + emNew);
}
if ((emNew - emOld) <= emDelta) {
break;
}
}
if (stepProgress != null) {
stepProgress.beginStep(7, "Generating hard clustering.", LOG);
}
// Create a hard clustering, making sure each data point is part of exactly
// one cluster, based on the best match from the membership matrix.
ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
if (stepProgress != null) {
stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
}
// Outlier detection. Remove points from clusters that have a Mahalanobis
// distance larger than the critical value of the ChiSquare distribution.
findOutliers(relation, models, clusterCandidates, noise);
if (stepProgress != null) {
stepProgress.beginStep(9, "Removing empty clusters.", LOG);
}
// Remove near-empty clusters.
for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
ClusterCandidate cand = it.next();
final int size = cand.ids.size();
if (size < minClusterSize) {
if (size > 0) {
noise.addDBIDs(cand.ids);
}
it.remove();
}
}
if (LOG.isVerbose()) {
LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
}
if (stepProgress != null) {
stepProgress.beginStep(10, "Generating final result.", LOG);
}
// Generate final output.
Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
ClusterCandidate candidate = clusterCandidates.get(cluster);
CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
}
if (LOG.isVerbose()) {
LOG.verbose("Noise size: " + noise.size());
}
if (noise.size() > 0) {
result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
}
LOG.ensureCompleted(stepProgress);
return result;
}
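For illustration, the grid-partitioning of step 1 can be reproduced without the ELKI API. The following is a minimal, self-contained sketch (all names are illustrative and not part of ELKI): one attribute is split into ceil(1 + log2(n)) equal-width bins per Sturges' rule, and a constant attribute yields null, analogous to partitionData() leaving partitions[d] == null above.
import java.util.ArrayList;
import java.util.List;
public class SturgesBinningSketch {
/** Bin one attribute into equal-width bins; bin count by Sturges' rule. */
public static List<List<Integer>> partition(double[] values) {
final int binCount = (int) Math.ceil(1 + Math.log(values.length) / Math.log(2));
double min = Double.POSITIVE_INFINITY, max = Double.NEGATIVE_INFINITY;
for (double v : values) {
min = Math.min(min, v);
max = Math.max(max, v);
}
if (min >= max) {
return null; // constant attribute, analogous to partitions[d] == null
}
List<List<Integer>> bins = new ArrayList<>(binCount);
for (int i = 0; i < binCount; i++) {
bins.add(new ArrayList<Integer>());
}
final double width = (max - min) / binCount;
for (int i = 0; i < values.length; i++) {
// Clamp so the maximum value lands in the last bin instead of index binCount.
int b = Math.min((int) ((values[i] - min) / width), binCount - 1);
bins.get(b).add(i); // store object indices, analogous to SetDBIDs
}
return bins;
}
}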
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class P3C method mergeSignatures.
/**
* Generates a merged signature of this and another one, where the other
* signature must be a 1-signature.
*
* @param first First signature.
* @param second Second signature, must be a 1-signature.
* @param numBins Number of bins per dimension.
* @return the merged signature, or null if the merge failed.
*/
protected Signature mergeSignatures(Signature first, Signature second, int numBins) {
int d2 = -1;
for (int i = 0; i < second.spec.length; i += 2) {
if (second.spec[i] >= 0) {
assert (d2 == -1) : "Merging with non-1-signature?!?";
d2 = i;
}
}
assert (d2 >= 0) : "Merging with empty signature?";
// Avoid generating redundant signatures.
if (first.spec[d2] >= 0) {
return null;
}
// Definition 3, Condition 1:
// True support:
final ModifiableDBIDs intersection = DBIDUtil.intersection(first.ids, second.ids);
final int support = intersection.size();
// Interval width, computed using selected number of bins / total bins
double width = (second.spec[d2 + 1] - second.spec[d2] + 1.) / (double) numBins;
// Expected size thus:
double expect = first.ids.size() * width;
if (support <= expect || support < minClusterSize) {
return null;
}
final double test = PoissonDistribution.rawProbability(support, expect);
if (poissonThreshold <= test) {
return null;
}
// Create merged signature.
int[] spec = first.spec.clone();
spec[d2] = second.spec[d2];
spec[d2 + 1] = second.spec[d2 + 1];
final Signature newsig = new Signature(spec, intersection);
if (LOG.isDebugging()) {
LOG.debug(newsig.toString());
}
return newsig;
}
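The statistical heart of the merge is Definition 3, Condition 1: if the members of the first signature were spread uniformly over the second signature's interval, the intersection support would be roughly Poisson-distributed with mean first.ids.size() * width, so the merge is accepted only when the observed support exceeds that mean and is improbably large under the Poisson model. A self-contained sketch of the same test follows (method names are illustrative; ELKI's PoissonDistribution.rawProbability plays the role of the pmf, presumably with more numerical care):
public class PoissonMergeTestSketch {
/** Poisson pmf P(X = k) for mean lambda, computed in log space to avoid overflow. */
static double poissonPmf(int k, double lambda) {
double logP = -lambda + k * Math.log(lambda);
for (int i = 2; i <= k; i++) {
logP -= Math.log(i); // subtract log(k!)
}
return Math.exp(logP);
}
/** Mirrors the accept/reject logic of mergeSignatures above. */
static boolean acceptMerge(int support, int firstSize, int intervalBins, int numBins, double poissonThreshold, int minClusterSize) {
// Expected support if the first signature's members were uniform over the interval.
double expect = firstSize * (intervalBins / (double) numBins);
if (support <= expect || support < minClusterSize) {
return false;
}
// Accept only if the observed support is improbably large under uniformity.
return poissonPmf(support, expect) < poissonThreshold;
}
}
For example, with firstSize = 100, an interval spanning 2 of 14 bins, and support 40: expect is about 14.3, and the Poisson probability of observing 40 is on the order of 1e-8, far below any plausible threshold, so the merge would be accepted.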
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class PROCLUS method computeBadMedoids.
/**
* Computes the bad medoids: a medoid is considered bad if its cluster
* contains fewer than the specified threshold of objects.
*
* @param m_current Current medoids
* @param clusters the clusters
* @param threshold the threshold
* @return the bad medoids
*/
private DBIDs computeBadMedoids(ArrayDBIDs m_current, ArrayList<PROCLUSCluster> clusters, int threshold) {
ModifiableDBIDs badMedoids = DBIDUtil.newHashSet(m_current.size());
int i = 0;
for (DBIDIter it = m_current.iter(); it.valid(); it.advance(), i++) {
PROCLUSCluster c_i = clusters.get(i);
if (c_i == null || c_i.objectIDs.size() < threshold) {
badMedoids.add(it);
}
}
return badMedoids;
}
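In the PROCLUS paper, the threshold passed here is derived from the expected cluster size: a medoid is bad when its cluster holds fewer than (N/k) * minDeviation objects. A one-method sketch of how a caller might compute that argument (the names are illustrative, not the actual ELKI fields):
/** Bad-medoid threshold per the PROCLUS paper: a fraction of the average cluster size. */
static int badMedoidThreshold(int numPoints, int k, double minDeviation) {
// E.g. numPoints = 10000, k = 10, minDeviation = 0.1 gives a threshold of 100.
return (int) ((numPoints / (double) k) * minDeviation);
}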
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class PROCLUS method assignPoints.
/**
* Assigns the objects to the clusters.
*
* @param m_current Current centers
* @param dimensions set of correlated dimensions for each medoid of the
* cluster
* @param database the database containing the objects
* @return the assignments of the object to the clusters
*/
private ArrayList<PROCLUSCluster> assignPoints(ArrayDBIDs m_current, long[][] dimensions, Relation<V> database) {
ModifiableDBIDs[] clusterIDs = new ModifiableDBIDs[dimensions.length];
for (int i = 0; i < m_current.size(); i++) {
clusterIDs[i] = DBIDUtil.newHashSet();
}
DBIDArrayIter m_i = m_current.iter();
for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
V p = database.get(it);
// Seed with NaN: every comparison with NaN is false, so the check below
// always accepts the first medoid.
double minDist = Double.NaN;
int best = -1, i = 0;
for (m_i.seek(0); m_i.valid(); m_i.advance(), i++) {
V m = database.get(m_i);
double currentDist = manhattanSegmentalDistance(p, m, dimensions[i]);
if (!(minDist <= currentDist)) {
minDist = currentDist;
best = i;
}
}
// add p to cluster with mindist
assert best >= 0;
clusterIDs[best].add(it);
}
ArrayList<PROCLUSCluster> clusters = new ArrayList<>(m_current.size());
for (int i = 0; i < dimensions.length; i++) {
ModifiableDBIDs objectIDs = clusterIDs[i];
if (!objectIDs.isEmpty()) {
long[] clusterDimensions = dimensions[i];
double[] centroid = Centroid.make(database, objectIDs).getArrayRef();
clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
} else {
clusters.add(null);
}
}
if (LOG.isDebugging()) {
LOG.debugFine("clusters " + clusters);
}
return clusters;
}
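The assignment above hinges on manhattanSegmentalDistance, the distance from the PROCLUS paper: the Manhattan distance restricted to the medoid's selected dimensions, divided by the number of selected dimensions so that scores stay comparable across medoids with differently sized dimension sets. A self-contained sketch, with a boolean[] mask standing in for ELKI's long[] bitmask (the signature is illustrative):
/** Manhattan segmental distance: mean absolute difference over the selected dimensions. */
static double manhattanSegmentalDistance(double[] p, double[] m, boolean[] dims) {
double sum = 0;
int count = 0;
for (int d = 0; d < p.length; d++) {
if (dims[d]) {
sum += Math.abs(p[d] - m[d]);
count++;
}
}
// Normalizing by the dimension count is what makes distances comparable between clusters.
return sum / count;
}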
use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.
the class PROCLUS method finalAssignment.
/**
* Refinement step to assign the objects to the final clusters.
*
* @param dimensions pair containing the centroid and the set of correlated
* dimensions for the centroid
* @param database the database containing the objects
* @return the assignments of the object to the clusters
*/
private List<PROCLUSCluster> finalAssignment(List<Pair<double[], long[]>> dimensions, Relation<V> database) {
Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<>();
for (int i = 0; i < dimensions.size(); i++) {
clusterIDs.put(i, DBIDUtil.newHashSet());
}
for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
V p = database.get(it);
double minDist = Double.POSITIVE_INFINITY;
int best = -1;
for (int i = 0; i < dimensions.size(); i++) {
Pair<double[], long[]> pair_i = dimensions.get(i);
double currentDist = manhattanSegmentalDistance(p, pair_i.first, pair_i.second);
if (best < 0 || currentDist < minDist) {
minDist = currentDist;
best = i;
}
}
// add p to cluster with mindist
assert best >= 0;
clusterIDs.get(best).add(it);
}
List<PROCLUSCluster> clusters = new ArrayList<>();
for (int i = 0; i < dimensions.size(); i++) {
ModifiableDBIDs objectIDs = clusterIDs.get(i);
if (!objectIDs.isEmpty()) {
long[] clusterDimensions = dimensions.get(i).second;
double[] centroid = Centroid.make(database, objectIDs).getArrayRef();
clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
}
}
if (LOG.isDebugging()) {
LOG.debugFine("clusters " + clusters);
}
return clusters;
}
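Note the two different argmin idioms: assignPoints seeds minDist with NaN and tests !(minDist <= currentDist), while finalAssignment seeds with POSITIVE_INFINITY and tests best < 0 || currentDist < minDist. Both accept the first candidate unconditionally and then keep the strictly smaller distance, as this small equivalence sketch shows (for inputs containing no NaN distances):
static int argMinNaNSeed(double[] dists) {
double min = Double.NaN;
int best = -1;
for (int i = 0; i < dists.length; i++) {
if (!(min <= dists[i])) { // false only when min is a number <= dists[i]
min = dists[i];
best = i;
}
}
return best;
}
static int argMinInfSeed(double[] dists) {
double min = Double.POSITIVE_INFINITY;
int best = -1;
for (int i = 0; i < dists.length; i++) {
if (best < 0 || dists[i] < min) {
min = dists[i];
best = i;
}
}
return best; // same result as argMinNaNSeed for NaN-free input
}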