use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.
the class DiSH method isParent.
/**
 * Tests whether the given parent cluster is a parent of at least one of the
 * clusters delivered by the iterator.
 *
 * A child matches when the subspace dimensionality computed from the two
 * projected centroids equals {@code db_dim} minus the dimensionality of the
 * parent's subspace. The iterator is consumed up to (and including) the first
 * matching child.
 *
 * @param relation the database containing the objects
 * @param parent the parent to be tested
 * @param iter the list of children to be tested
 * @param db_dim Database dimensionality
 * @return true, if the specified parent cluster is a parent of one child of
 *         the children clusters, false otherwise
 */
private boolean isParent(Relation<V> relation, Cluster<SubspaceModel> parent, It<Cluster<SubspaceModel>> iter, int db_dim) {
  final Subspace parentSubspace = parent.getModel().getSubspace();
  final NumberVector parentCentroid = ProjectedCentroid.make(parentSubspace.getDimensions(), relation, parent.getIDs());
  // Value every candidate child is compared against.
  final int expectedDim = db_dim - parentSubspace.dimensionality();

  while (iter.valid()) {
    final Cluster<SubspaceModel> candidate = iter.get();
    final Subspace childSubspace = candidate.getModel().getSubspace();
    final NumberVector childCentroid = ProjectedCentroid.make(childSubspace.getDimensions(), relation, candidate.getIDs());
    // Dimensions preferred by both the parent and this child.
    final long[] sharedPreferences = BitsUtil.andCMin(parentSubspace.getDimensions(), childSubspace.getDimensions());
    final int candidateDim = subspaceDimensionality(parentCentroid, childCentroid, parentSubspace.getDimensions(), childSubspace.getDimensions(), sharedPreferences);
    if (candidateDim == expectedDim) {
      return true;
    }
    iter.advance();
  }
  return false;
}
use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.
the class DiSH method sortClusters.
/**
 * Builds one cluster per id list stored in the given map and returns all of
 * them sorted by the dimensionality of their subspace, in descending order.
 *
 * Each cluster is named {@code Cluster_<subspace>}; when several id lists
 * share the same map key, the index of the list is appended as
 * {@code _<i>} to keep the names distinct.
 *
 * @param relation the database storing the objects
 * @param clustersMap the mapping of bits sets to clusters
 * @return a sorted list of the clusters
 */
private List<Cluster<SubspaceModel>> sortClusters(Relation<V> relation, Object2ObjectMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
  final int db_dim = RelationUtil.dimensionality(relation);
  List<Cluster<SubspaceModel>> clusters = new ArrayList<>();
  for (long[] pv : clustersMap.keySet()) {
    List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
    // Only disambiguate names when more than one cluster shares this key.
    final boolean needsIndex = parallelClusters.size() > 1;
    for (int i = 0; i < parallelClusters.size(); i++) {
      ArrayModifiableDBIDs c = parallelClusters.get(i);
      Cluster<SubspaceModel> cluster = new Cluster<>(c);
      cluster.setModel(new SubspaceModel(new Subspace(pv), Centroid.make(relation, c).getArrayRef()));
      String subspace = BitsUtil.toStringLow(cluster.getModel().getSubspace().getDimensions(), db_dim);
      if (needsIndex) {
        cluster.setName("Cluster_" + subspace + "_" + i);
      } else {
        cluster.setName("Cluster_" + subspace);
      }
      clusters.add(cluster);
    }
  }
  // Sort descending by subspace dimensionality. Integer.compare avoids the
  // overflow-prone "subtract to compare" idiom.
  Comparator<Cluster<SubspaceModel>> comparator = new Comparator<Cluster<SubspaceModel>>() {
    @Override
    public int compare(Cluster<SubspaceModel> c1, Cluster<SubspaceModel> c2) {
      return Integer.compare(c2.getModel().getSubspace().dimensionality(), c1.getModel().getSubspace().dimensionality());
    }
  };
  Collections.sort(clusters, comparator);
  return clusters;
}
use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.
the class P3C method run.
/**
 * Performs the P3C algorithm on the given Database.
 *
 * The run is reported as ten progress steps: grid partitioning, search for
 * non-uniform bins, construction of 1-signatures, merging to cluster cores,
 * pruning of redundant cores, EM refinement, hard clustering, outlier
 * detection, removal of too-small clusters, and result generation.
 *
 * If no cluster cores are found, the result consists of a single cluster
 * containing all objects of the relation. Otherwise, objects that end up in
 * no cluster are collected in one additional cluster, which is only added
 * when it is non-empty.
 *
 * @param database Database (not referenced by this method itself)
 * @param relation Data relation to cluster
 * @return the clustering found
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  final int dim = RelationUtil.dimensionality(relation);
  // Overall progress. The total has to match the highest step number used
  // below; every step number must be used exactly once.
  StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
  if (stepProgress != null) {
    stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
  }
  // Desired number of bins, as per Sturge:
  final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
  // Perform 1-dimensional projections, and split into bins.
  SetDBIDs[][] partitions = partitionData(relation, binCount);
  if (stepProgress != null) {
    stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
  }
  // Set markers for each attribute until they're all deemed uniform.
  final long[][] markers = new long[dim][];
  for (int d = 0; d < dim; d++) {
    final SetDBIDs[] parts = partitions[d];
    if (parts == null) {
      // Never mark any on constant dimensions.
      continue;
    }
    final long[] marked = markers[d] = BitsUtil.zero(binCount);
    int card = 0;
    // NOTE(review): card counts marked bins, yet the bound is derived from
    // the data dimensionality rather than binCount - confirm this is intended.
    while (card < dim - 1) {
      // Find bin with largest support, test only the dimensions that were not
      // previously marked.
      int bestBin = chiSquaredUniformTest(parts, marked, card);
      if (bestBin < 0) {
        // Uniform
        break;
      }
      BitsUtil.setI(marked, bestBin);
      card++;
    }
    if (LOG.isDebugging()) {
      LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
    }
  }
  if (stepProgress != null) {
    stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
  }
  ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
  if (stepProgress != null) {
    stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
  }
  ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
  if (stepProgress != null) {
    stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
  }
  clusterCores = pruneRedundantClusterCores(clusterCores);
  if (LOG.isVerbose()) {
    LOG.verbose("Number of cluster cores found: " + clusterCores.size());
  }
  if (clusterCores.isEmpty()) {
    // Nothing to refine: finish the progress early and return everything as
    // a single cluster.
    LOG.setCompleted(stepProgress);
    Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
    c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
    return c;
  }
  if (stepProgress != null) {
    stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
  }
  // Track objects not assigned to any cluster:
  ModifiableDBIDs noise = DBIDUtil.newHashSet();
  WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  int k = clusterCores.size();
  List<MultivariateGaussianModel> models = new ArrayList<>(k);
  computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
  // Initial estimate of covariances, to assign noise objects
  EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
  assignUnassigned(relation, probClusterIGivenX, models, noise);
  double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  // A negative maxEmIterations disables the iteration limit; the loop then
  // only ends once the improvement drops to emDelta or below.
  for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
    final double emOld = emNew;
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    // reassign probabilities
    emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    if (LOG.isVerbose()) {
      LOG.verbose("iteration " + it + " - expectation value: " + emNew);
    }
    if ((emNew - emOld) <= emDelta) {
      break;
    }
  }
  if (stepProgress != null) {
    stepProgress.beginStep(7, "Generating hard clustering.", LOG);
  }
  // Create a hard clustering, making sure each data point only is part of one
  // cluster, based on the best match from the membership matrix.
  ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
  if (stepProgress != null) {
    stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
  }
  // Outlier detection. Remove points from clusters that have a Mahalanobis
  // distance larger than the critical value of the ChiSquare distribution.
  findOutliers(relation, models, clusterCandidates, noise);
  if (stepProgress != null) {
    stepProgress.beginStep(9, "Removing empty clusters.", LOG);
  }
  // Remove clusters below minClusterSize; their members become noise.
  for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
    ClusterCandidate cand = it.next();
    final int size = cand.ids.size();
    if (size < minClusterSize) {
      if (size > 0) {
        noise.addDBIDs(cand.ids);
      }
      it.remove();
    }
  }
  if (LOG.isVerbose()) {
    LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
  }
  if (stepProgress != null) {
    stepProgress.beginStep(10, "Generating final result.", LOG);
  }
  // Generate final output.
  Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
  for (ClusterCandidate candidate : clusterCandidates) {
    CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
    result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
  }
  if (LOG.isVerbose()) {
    LOG.verbose("Noise size: " + noise.size());
  }
  if (noise.size() > 0) {
    result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
  }
  LOG.ensureCompleted(stepProgress);
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.
the class DOC method makeCluster.
/**
 * Helper that wraps a set of points and their relevant attributes into a
 * subspace cluster. The model's centroid is computed over the copied ids.
 *
 * @param relation to compute a centroid.
 * @param C the cluster points.
 * @param D the relevant dimensions.
 * @return an object representing the subspace cluster.
 */
protected Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
  // Work on a private hash-set copy of the ids; per the original note this
  // is also meant to drop any distance values carried by C.
  final DBIDs members = DBIDUtil.newHashSet(C);
  final Cluster<SubspaceModel> result = new Cluster<>(members);
  final Subspace subspace = new Subspace(D);
  final SubspaceModel model = new SubspaceModel(subspace, Centroid.make(relation, members).getArrayRef());
  result.setModel(model);
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.
the class DOC method run.
/**
 * Performs the DOC or FastDOC (as configured) algorithm on the given
 * Database.
 *
 * This will run exhaustively, i.e. run DOC until no clusters are found
 * anymore / the database size has shrunk below the threshold for minimum
 * cluster size. Whatever points remain afterwards are returned as one
 * additional noise cluster spanning all dimensions.
 *
 * @param database Database
 * @param relation Data relation
 * @return the clusters found, plus the noise cluster if points remain
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  // Dimensionality of the data set.
  final int d = RelationUtil.dimensionality(relation);
  // Working set of ids; points of accepted clusters are removed from it.
  final ArrayModifiableDBIDs remaining = DBIDUtil.newArray(relation.getDBIDs());

  // Precomputed values (the original comment refers to "Figure 2").
  final double r = Math.abs(FastMath.log(d + d) / FastMath.log(beta * .5));
  final double twoOverAlpha = 2. / alpha;
  // Outer loop count.
  final int n = (int) twoOverAlpha;
  // Inner loop count, before capping.
  final int uncappedM = (int) (FastMath.pow(twoOverAlpha, r) * FastMath.log(4));
  // TODO: This should only apply for FastDOC.
  final int m = Math.min(uncappedM, Math.min(1000000, d * d));
  // Minimum size for a cluster for it to be accepted.
  final int minClusterSize = (int) (alpha * remaining.size());

  // Collects all clusters found.
  final Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
  // Inform the user about the number of actual clusters found so far.
  final IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

  // To find more than a single cluster, keep running DOC for as long as more
  // than minClusterSize points remain.
  while (remaining.size() > minClusterSize) {
    final Cluster<SubspaceModel> found = runDOC(database, relation, remaining, d, n, m, (int) r, minClusterSize);
    if (found == null) {
      // Stop trying if we couldn't find a cluster.
      break;
    }
    // Remember the cluster, then take its points out of the working set.
    result.addToplevelCluster(found);
    remaining.removeDBIDs(found.getIDs());
    if (cprogress != null) {
      cprogress.setProcessed(result.getAllClusters().size(), LOG);
    }
  }

  // Add the remainder as noise.
  if (remaining.size() > 0) {
    final long[] alldims = BitsUtil.ones(d);
    final SubspaceModel noiseModel = new SubspaceModel(new Subspace(alldims), Centroid.make(relation, remaining).getArrayRef());
    result.addToplevelCluster(new Cluster<>(remaining, true, noiseModel));
  }
  LOG.setCompleted(cprogress);
  return result;
}
Aggregations