Use of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval in project elki by elki-project.
Class CASH, method initHeap.
/**
 * Initializes the heap with the root intervals.
 * <p>
 * The range {@code [d_min, d_max]} returned by
 * {@code determineMinMaxDistance} is cut into
 * {@code ceil((d_max - d_min) / jitter)} consecutive slices of equal width.
 * For every slice, the ids falling into it (over the full alpha range
 * {@code [0, PI]} in each of the {@code dim - 1} angular dimensions) are
 * determined via {@link CASHIntervalSplit#determineIDs}; a root interval is
 * pushed onto the heap only if at least {@code minPts} ids were found.
 * <p>
 * The last slice is closed at exactly {@code d_max}, so the slices tile the
 * whole range without gaps regardless of floating point rounding.
 *
 * @param heap the heap to be initialized
 * @param relation the database storing the parameterization functions
 * @param dim the dimensionality of the database
 * @param ids the ids of the database
 */
private void initHeap(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap, Relation<ParameterizationFunction> relation, int dim, DBIDs ids) {
  CASHIntervalSplit split = new CASHIntervalSplit(relation, minPts);

  // determine minimum and maximum function value of all functions
  double[] minMax = determineMinMaxDistance(relation, dim);
  double d_min = minMax[0], d_max = minMax[1];

  double dIntervalLength = d_max - d_min;
  int numDIntervals = (int) FastMath.ceil(dIntervalLength / jitter);
  double dIntervalSize = dIntervalLength / numDIntervals;
  double[] d_mins = new double[numDIntervals], d_maxs = new double[numDIntervals];

  if (LOG.isVerbose()) {
    LOG.verbose(new StringBuilder() //
        .append("d_min ").append(d_min) //
        .append("\nd_max ").append(d_max) //
        .append("\nnumDIntervals ").append(numDIntervals) //
        .append("\ndIntervalSize ").append(dIntervalSize).toString());
  }

  // alpha intervals: every angle ranges over [0, PI]
  double[] alphaMin = new double[dim - 1], alphaMax = new double[dim - 1];
  Arrays.fill(alphaMax, Math.PI);

  for (int i = 0; i < numDIntervals; i++) {
    // each slice starts where the previous one ended
    d_mins[i] = (i == 0) ? d_min : d_maxs[i - 1];
    // The upper bound is an absolute value, not a width: the last slice must
    // end at d_max itself (previously d_max - d_mins[i], which produced an
    // empty or inverted slice whenever d_min != 0).
    d_maxs[i] = (i < numDIntervals - 1) ? d_mins[i] + dIntervalSize : d_max;

    HyperBoundingBox alphaInterval = new HyperBoundingBox(alphaMin, alphaMax);
    ModifiableDBIDs intervalIDs = split.determineIDs(ids, alphaInterval, d_mins[i], d_maxs[i]);
    // slices with too few functions cannot contain a cluster
    if (intervalIDs != null && intervalIDs.size() >= minPts) {
      // NOTE(review): -1 and 0 are presumably "no split dimension yet" and
      // "level 0" -- confirm against the CASHInterval constructor.
      CASHInterval rootInterval = new CASHInterval(alphaMin, alphaMax, split, intervalIDs, -1, 0, d_mins[i], d_maxs[i]);
      heap.add(new IntegerPriorityObject<>(rootInterval.priority(), rootInterval));
    }
  }

  if (LOG.isDebuggingFiner()) {
    LOG.debugFiner(new StringBuilder().append("heap.size: ").append(heap.size()).toString());
  }
}
Use of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval in project elki by elki-project.
Class CASH, method doDetermineNextIntervalAtMaxLevel.
/**
 * Helper that descends from the top of the heap to the next ''best''
 * interval at maximum level.
 * <p>
 * The top heap entry is taken and split repeatedly. After each split the
 * child that compares greater (the left child on ties) is followed, while
 * its sibling, if any, is pushed back onto the heap. The descent ends when
 * the current interval has reached {@code maxLevel} and its maximum split
 * dimension is the last one.
 *
 * @param heap the heap storing the intervals; must not be empty
 * @return the next ''best'' interval at maximum level, or {@code null} if
 *         the followed interval could not be split any further (noise) or
 *         if the heap grew to 40000 entries (in which case it is cleared)
 */
private CASHInterval doDetermineNextIntervalAtMaxLevel(ObjectHeap<IntegerPriorityObject<CASHInterval>> heap) {
  CASHInterval current = heap.poll().getObject();
  final int lastSplitDim = current.getDimensionality() - 1;

  for (;;) {
    // Deep enough and split in every dimension: this is the result.
    if (current.getLevel() >= maxLevel && current.getMaxSplitDimension() == lastSplitDim) {
      return current;
    }

    final int pending = heap.size();
    if (pending % 10000 == 0 && LOG.isVerbose()) {
      LOG.verbose("heap size " + pending);
    }
    // Safety valve against an exploding search space.
    if (pending >= 40000) {
      LOG.warning("Heap size > 40.000! Stopping.");
      heap.clear();
      return null;
    }

    if (LOG.isDebuggingFiner()) {
      LOG.debugFiner(new StringBuilder("split ").append(current.toString()).append(" ") //
          .append(current.getLevel()).append("-").append(current.getMaxSplitDimension()).toString());
    }
    current.split();

    // No children after splitting: the path ended in noise.
    if (!current.hasChildren()) {
      return null;
    }

    final CASHInterval left = current.getLeftChild();
    final CASHInterval right = current.getRightChild();
    if (left == null) {
      current = right;
      continue;
    }
    if (right == null) {
      current = left;
      continue;
    }

    // Both children exist: follow the greater one, keep the other for later.
    final boolean rightWins = left.compareTo(right) < 0;
    final CASHInterval deferred = rightWins ? left : right;
    heap.add(new IntegerPriorityObject<>(deferred.priority(), deferred));
    current = rightWins ? right : left;
  }
}
Use of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval in project elki by elki-project.
Class CASH, method doRun.
/**
 * Runs the CASH algorithm on the given relation. The method calls itself on
 * a relation built from the ids of a found interval (via {@code buildDB})
 * as long as {@code dim > minDim + 1}.
 * <p>
 * Outline: all ids start out as noise. While the heap holds intervals, the
 * next interval at maximum level is fetched; its ids either become a cluster
 * directly or are handed to the recursive run, whose clusters are adopted.
 * Ids assigned to clusters are then removed from every interval still on the
 * heap, and intervals left with fewer than {@code minPts} ids are dropped.
 * Whatever is still noise at the end is turned into a noise cluster.
 *
 * @param relation the Relation to run the CASH algorithm on
 * @param progress the progress object for verbose messages, may be
 *        {@code null}
 * @return the clustering found for the given relation
 */
private Clustering<Model> doRun(Relation<ParameterizationFunction> relation, FiniteProgress progress) {
  final Clustering<Model> result = new Clustering<>("CASH clustering", "cash-clustering");
  final int dim = dimensionality(relation);

  // Set up the heap; every id counts as noise until it lands in a cluster.
  final ObjectHeap<IntegerPriorityObject<CASHInterval>> heap = new ComparableMinHeap<>();
  final ModifiableDBIDs noiseIDs = DBIDUtil.newHashSet(relation.getDBIDs());
  initHeap(heap, relation, dim, noiseIDs);

  if (LOG.isVerbose()) {
    LOG.verbose("dim " + dim + " database.size " + relation.size());
  }

  // Fetch the ''best'' intervals at max level one after another.
  while (!heap.isEmpty()) {
    final CASHInterval best = determineNextIntervalAtMaxLevel(heap);
    if (LOG.isVerbose()) {
      LOG.verbose("next interval in dim " + dim + ": " + best);
    }
    if (best == null) {
      // only noise left
      break;
    }

    // Ids that end up in a cluster during this iteration.
    final ModifiableDBIDs assigned = DBIDUtil.newHashSet();

    if (dim <= minDim + 1) {
      // Lowest dimensionality handled here: the interval itself is a cluster.
      final LinearEquationSystem les = runDerivator(relation, dim - 1, best.getIDs());
      res_addCluster: {
        result.addToplevelCluster(new Cluster<Model>(best.getIDs(), new LinearEquationModel(les)));
      }
      noiseIDs.removeDBIDs(best.getIDs());
      assigned.addDBIDs(best.getIDs());
      processedIDs.addDBIDs(best.getIDs());
    }
    else {
      // Continue with a (dim - 1)-dimensional run on the interval's ids.
      final ModifiableDBIDs subIDs;
      final double[][] subBasis;
      if (!adjust) {
        subIDs = best.getIDs();
        subBasis = determineBasis(SpatialUtil.centroid(best));
      }
      else {
        subIDs = DBIDUtil.newHashSet();
        subBasis = runDerivator(relation, dim, best, subIDs);
      }

      if (subIDs.size() != 0) {
        final MaterializedRelation<ParameterizationFunction> projected = buildDB(dim, subBasis, subIDs, relation);
        // Adopt every cluster found by the lower-dimensional run.
        for (Cluster<Model> found : doRun(projected, progress).getAllClusters()) {
          result.addToplevelCluster(found);
          noiseIDs.removeDBIDs(found.getIDs());
          assigned.addDBIDs(found.getIDs());
          processedIDs.addDBIDs(found.getIDs());
        }
      }
    }

    // Rebuild the heap: strip the assigned ids from all remaining intervals
    // and re-insert those that are still large enough.
    final ArrayList<IntegerPriorityObject<CASHInterval>> remaining = new ArrayList<>(heap.size());
    final ObjectHeap.UnsortedIter<IntegerPriorityObject<CASHInterval>> it = heap.unsortedIter();
    while (it.valid()) {
      remaining.add(it.get());
      it.advance();
    }
    heap.clear();
    for (IntegerPriorityObject<CASHInterval> entry : remaining) {
      final CASHInterval candidate = entry.getObject();
      candidate.removeIDs(assigned);
      if (candidate.getIDs().size() < minPts) {
        continue;
      }
      heap.add(new IntegerPriorityObject<>(candidate.priority(), candidate));
    }

    if (progress != null) {
      progress.setProcessed(processedIDs.size(), LOG);
    }
  }

  // Turn the leftover noise into a cluster.
  final boolean hasNoise = !noiseIDs.isEmpty();
  if (hasNoise && dim == noiseDim) {
    result.addToplevelCluster(new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER));
    processedIDs.addDBIDs(noiseIDs);
  }
  else if (hasNoise && noiseIDs.size() >= minPts) {
    final LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs);
    result.addToplevelCluster(new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les)));
    processedIDs.addDBIDs(noiseIDs);
  }

  if (LOG.isDebugging()) {
    final StringBuilder buf = new StringBuilder();
    buf.append("noise fuer dim ").append(dim).append(": ").append(noiseIDs.size());
    for (Cluster<Model> c : result.getAllClusters()) {
      final Model model = c.getModel();
      if (model instanceof LinearEquationModel) {
        buf.append("\n Cluster: Dim: ").append(((LinearEquationModel) model).getLes().subspacedim());
      }
      else {
        buf.append("\n Cluster: ").append(model.getClass().getName());
      }
      buf.append(" size: ").append(c.size());
    }
    LOG.debugFine(buf.toString());
  }

  if (progress != null) {
    progress.setProcessed(processedIDs.size(), LOG);
  }
  return result;
}
Aggregations