Use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in the ELKI project (elki-project):
class CASH, method doRun.
/**
 * Runs the CASH algorithm on the specified database; this method is
 * recursively called (via a projected (dim-1)-dimensional database) until
 * only noise is left.
 *
 * @param relation the Relation to run the CASH algorithm on
 * @param progress the progress object for verbose messages (may be null)
 * @return a clustering containing the clusters found at this dimensionality
 *         and all recursive lower-dimensional runs, plus a noise cluster
 *         when applicable
 */
private Clustering<Model> doRun(Relation<ParameterizationFunction> relation, FiniteProgress progress) {
  Clustering<Model> res = new Clustering<>("CASH clustering", "cash-clustering");
  final int dim = dimensionality(relation);
  // Initialize the heap of candidate intervals. All points start out as
  // noise and are removed from noiseIDs as they get assigned to clusters.
  ObjectHeap<IntegerPriorityObject<CASHInterval>> heap = new ComparableMinHeap<>();
  ModifiableDBIDs noiseIDs = DBIDUtil.newHashSet(relation.getDBIDs());
  initHeap(heap, relation, dim, noiseIDs);
  if (LOG.isVerbose()) {
    LOG.verbose(new StringBuilder().append("dim ").append(dim).append(" database.size ").append(relation.size()).toString());
  }
  // Repeatedly extract the ''best'' d-dimensional interval at max level;
  // each such interval is a cluster candidate.
  while (!heap.isEmpty()) {
    CASHInterval interval = determineNextIntervalAtMaxLevel(heap);
    if (LOG.isVerbose()) {
      LOG.verbose("next interval in dim " + dim + ": " + interval);
    }
    // No interval could be refined to max level: only noise is left.
    if (interval == null) {
      break;
    }
    // Not yet at the minimum dimensionality: recurse with a
    // (dim-1)-dimensional projected database built from this interval.
    ModifiableDBIDs clusterIDs = DBIDUtil.newHashSet();
    if (dim > minDim + 1) {
      ModifiableDBIDs ids;
      double[][] basis_dim_minus_1;
      if (adjust) {
        // Refine the interval's point set and derive the projection basis
        // via the derivator.
        ids = DBIDUtil.newHashSet();
        basis_dim_minus_1 = runDerivator(relation, dim, interval, ids);
      } else {
        // Use the interval's own ids and a basis from its centroid.
        ids = interval.getIDs();
        basis_dim_minus_1 = determineBasis(SpatialUtil.centroid(interval));
      }
      if (ids.size() != 0) {
        MaterializedRelation<ParameterizationFunction> db = buildDB(dim, basis_dim_minus_1, ids, relation);
        // Add the clusters found in the (dim-1)-dimensional run to this
        // result, and book-keep which ids are no longer noise.
        Clustering<Model> res_dim_minus_1 = doRun(db, progress);
        for (Cluster<Model> cluster : res_dim_minus_1.getAllClusters()) {
          res.addToplevelCluster(cluster);
          noiseIDs.removeDBIDs(cluster.getIDs());
          clusterIDs.addDBIDs(cluster.getIDs());
          processedIDs.addDBIDs(cluster.getIDs());
        }
      }
    } else // dim == minDim: emit the interval directly as a cluster with a
    // linear equation model derived from its points.
    {
      LinearEquationSystem les = runDerivator(relation, dim - 1, interval.getIDs());
      Cluster<Model> c = new Cluster<Model>(interval.getIDs(), new LinearEquationModel(les));
      res.addToplevelCluster(c);
      noiseIDs.removeDBIDs(interval.getIDs());
      clusterIDs.addDBIDs(interval.getIDs());
      processedIDs.addDBIDs(interval.getIDs());
    }
    // Rebuild the heap: remove the just-clustered ids from all remaining
    // intervals and keep only intervals that still have at least minPts ids.
    ArrayList<IntegerPriorityObject<CASHInterval>> heapVector = new ArrayList<>(heap.size());
    for (ObjectHeap.UnsortedIter<IntegerPriorityObject<CASHInterval>> iter = heap.unsortedIter(); iter.valid(); iter.advance()) {
      heapVector.add(iter.get());
    }
    heap.clear();
    for (IntegerPriorityObject<CASHInterval> pair : heapVector) {
      CASHInterval currentInterval = pair.getObject();
      currentInterval.removeIDs(clusterIDs);
      if (currentInterval.getIDs().size() >= minPts) {
        heap.add(new IntegerPriorityObject<>(currentInterval.priority(), currentInterval));
      }
    }
    if (progress != null) {
      progress.setProcessed(processedIDs.size(), LOG);
    }
  }
  // Deal with the remaining noise: at the original dimensionality it forms
  // a plain noise cluster; at lower dimensionalities a large enough noise
  // set still gets a linear equation model.
  if (!noiseIDs.isEmpty()) {
    if (dim == noiseDim) {
      res.addToplevelCluster(new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER));
      processedIDs.addDBIDs(noiseIDs);
    } else if (noiseIDs.size() >= minPts) {
      LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs);
      res.addToplevelCluster(new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les)));
      processedIDs.addDBIDs(noiseIDs);
    }
  }
  // Debug output: summarize the clusters found at this dimensionality.
  if (LOG.isDebugging()) {
    StringBuilder msg = new StringBuilder();
    msg.append("noise fuer dim ").append(dim).append(": ").append(noiseIDs.size());
    for (Cluster<Model> c : res.getAllClusters()) {
      if (c.getModel() instanceof LinearEquationModel) {
        msg.append("\n Cluster: Dim: ").append(((LinearEquationModel) c.getModel()).getLes().subspacedim());
      } else {
        msg.append("\n Cluster: ").append(c.getModel().getClass().getName());
      }
      msg.append(" size: ").append(c.size());
    }
    LOG.debugFine(msg.toString());
  }
  if (progress != null) {
    progress.setProcessed(processedIDs.size(), LOG);
  }
  return res;
}
Use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in the ELKI project (elki-project):
class CASHInterval, method split.
/**
 * Splits this interval into its two children (left and right half of the
 * next split dimension). Does nothing if the children already exist.
 */
public void split() {
  if (hasChildren()) {
    return;
  }
  // When the last dimension has already been split, wrap around to
  // dimension 0 and descend one level; otherwise split the next dimension
  // on the current level.
  final boolean wrapAround = maxSplitDimension >= (getDimensionality() - 1);
  final int childLevel = wrapAround ? level + 1 : level;
  final int splitDim = wrapAround ? 0 : maxSplitDimension + 1;
  final double splitPoint = getMin(splitDim) + (getMax(splitDim) - getMin(splitDim)) * .5;
  // Right child: upper half [splitPoint, max] of the split dimension.
  {
    double[] lo = SpatialUtil.getMin(this);
    double[] hi = SpatialUtil.getMax(this);
    lo[splitDim] = splitPoint;
    ModifiableDBIDs childIDs = split.determineIDs(getIDs(), new HyperBoundingBox(lo, hi), d_min, d_max);
    if (childIDs != null) {
      rightChild = new CASHInterval(lo, hi, split, childIDs, splitDim, childLevel, d_min, d_max);
    }
  }
  // Left child: lower half [min, splitPoint] of the split dimension.
  {
    double[] lo = SpatialUtil.getMin(this);
    double[] hi = SpatialUtil.getMax(this);
    hi[splitDim] = splitPoint;
    ModifiableDBIDs childIDs = split.determineIDs(getIDs(), new HyperBoundingBox(lo, hi), d_min, d_max);
    if (childIDs != null) {
      leftChild = new CASHInterval(lo, hi, split, childIDs, splitDim, childLevel, d_min, d_max);
    }
  }
  if (LOG.isDebuggingFine()) {
    StringBuilder msg = new StringBuilder();
    msg.append("Child level ").append(childLevel).append(", split Dim ").append(splitDim);
    if (leftChild != null) {
      msg.append("\nleft ").append(leftChild);
    }
    if (rightChild != null) {
      msg.append("\nright ").append(rightChild);
    }
    LOG.fine(msg.toString());
  }
}
Use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in the ELKI project (elki-project):
class CASHIntervalSplit, method determineIDs.
/**
 * Determines the ids belonging to the given interval, i.e. the
 * parameterization functions falling within the interval.
 *
 * @param superSetIDs a superset of the ids to be determined
 * @param interval the hyper bounding box defining the interval of alpha
 *        values
 * @param d_min the minimum distance value for the interval
 * @param d_max the maximum distance value for the interval
 * @return the ids belonging to the given interval, if the number of ids
 *         reaches minPts, null otherwise
 */
public ModifiableDBIDs determineIDs(DBIDs superSetIDs, HyperBoundingBox interval, double d_min, double d_max) {
  StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null;
  if (msg != null) {
    msg.append("interval ").append(interval);
  }
  ModifiableDBIDs result = DBIDUtil.newHashSet(superSetIDs.size());
  // Per-interval caches of the function extrema, created lazily on first use.
  Map<DBID, Double> minima = f_minima.get(interval);
  Map<DBID, Double> maxima = f_maxima.get(interval);
  if (minima == null || maxima == null) {
    minima = new HashMap<>();
    f_minima.put(interval, minima);
    maxima = new HashMap<>();
    f_maxima.put(interval, maxima);
  }
  for (DBIDIter it = superSetIDs.iter(); it.valid(); it.advance()) {
    DBID id = DBIDUtil.deref(it);
    Double fMin = minima.get(id);
    Double fMax = maxima.get(id);
    if (fMin == null) {
      // Cache miss: evaluate this function's extrema over the alpha interval
      // and remember them for subsequent calls on the same interval.
      ParameterizationFunction func = database.get(id);
      HyperBoundingBox alphaExtrema = func.determineAlphaMinMax(interval);
      fMin = func.function(SpatialUtil.getMin(alphaExtrema));
      fMax = func.function(SpatialUtil.getMax(alphaExtrema));
      minima.put(id, fMin);
      maxima.put(id, fMax);
    }
    if (msg != null) {
      msg.append("\n\nf_min ").append(fMin);
      msg.append("\nf_max ").append(fMax);
      msg.append("\nd_min ").append(d_min);
      msg.append("\nd_max ").append(d_max);
    }
    // Sanity check: up to numerical tolerance, the minimum may never exceed
    // the maximum.
    if (fMin - fMax > ParameterizationFunction.DELTA) {
      throw new IllegalArgumentException("Houston, we have a problem: f_min > f_max! " + "\nf_min[" + FormatUtil.format(SpatialUtil.centroid(interval)) + "] = " + fMin + "\nf_max[" + FormatUtil.format(SpatialUtil.centroid(interval)) + "] = " + fMax + "\nf " + database.get(id));
    }
    // The function crosses the distance band [d_min, d_max] iff the ranges
    // overlap.
    final boolean inRange = fMin <= d_max && fMax >= d_min;
    if (inRange) {
      result.add(id);
    }
    if (msg != null) {
      msg.append("\nid ").append(id).append(inRange ? " appended" : " NOT appended");
    }
  }
  if (msg != null) {
    msg.append("\nchildIds ").append(result.size());
    LOG.debugFine(msg.toString());
  }
  return result.size() < minPts ? null : result;
}
Use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in the ELKI project (elki-project):
class SimpleCOP, method run.
/**
 * Runs the algorithm: for every object, a local correlation model is derived
 * from its k nearest neighbors (excluding the object itself), and the outlier
 * probability is obtained from the object's deviation from that model via the
 * Gaussian error function.
 *
 * @param database Database to operate on (not used directly here; kept for
 *        API compatibility)
 * @param data Relation containing the vectors to process
 * @return outlier result holding the probabilities, plus auxiliary relations
 *         (local dimensionality, error vectors, data projections, solutions)
 */
public OutlierResult run(Database database, Relation<V> data) throws IllegalStateException {
  // k+1 neighbors, because the query point is returned among its own kNN.
  KNNQuery<V> knnQuery = QueryUtil.getKNNQuery(data, getDistanceFunction(), k + 1);
  DBIDs ids = data.getDBIDs();
  WritableDoubleDataStore cop_score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
  WritableDataStore<double[]> cop_err_v = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, double[].class);
  WritableDataStore<double[][]> cop_datav = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, double[][].class);
  WritableIntegerDataStore cop_dim = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, -1);
  WritableDataStore<CorrelationAnalysisSolution<?>> cop_sol = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, CorrelationAnalysisSolution.class);
  {
    // compute neighbors of each db object
    FiniteProgress progressLocalPCA = LOG.isVerbose() ? new FiniteProgress("Correlation Outlier Probabilities", data.size(), LOG) : null;
    double sqrt2 = MathUtil.SQRT2;
    for (DBIDIter id = data.iterDBIDs(); id.valid(); id.advance()) {
      KNNList neighbors = knnQuery.getKNNForDBID(id, k + 1);
      ModifiableDBIDs nids = DBIDUtil.newArray(neighbors);
      // Exclude the query point itself from the neighborhood.
      nids.remove(id);
      // TODO: do we want to use the query point as centroid?
      CorrelationAnalysisSolution<V> depsol = dependencyDerivator.generateModel(data, nids);
      // Fetch the vector once, instead of three separate relation lookups.
      V obj = data.get(id);
      double stddev = depsol.getStandardDeviation();
      double distance = depsol.distance(obj);
      // Probability of seeing a deviation at least this large under the
      // model's Gaussian error assumption.
      double prob = NormalDistribution.erf(distance / (stddev * sqrt2));
      cop_score.putDouble(id, prob);
      cop_err_v.put(id, times(depsol.errorVector(obj), -1));
      double[][] datav = depsol.dataProjections(obj);
      cop_datav.put(id, datav);
      cop_dim.putInt(id, depsol.getCorrelationDimensionality());
      cop_sol.put(id, depsol);
      LOG.incrementProcessed(progressLocalPCA);
    }
    LOG.ensureCompleted(progressLocalPCA);
  }
  // combine results.
  DoubleRelation scoreResult = new MaterializedDoubleRelation("Original Correlation Outlier Probabilities", "origcop-outlier", cop_score, ids);
  OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore();
  OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
  // extra results
  result.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP.COP_DIM, TypeUtil.INTEGER, cop_dim, ids));
  result.addChildResult(new MaterializedRelation<>("Error vectors", COP.COP_ERRORVEC, TypeUtil.DOUBLE_ARRAY, cop_err_v, ids));
  result.addChildResult(new MaterializedRelation<>("Data vectors", "cop-datavec", TypeUtil.MATRIX, cop_datav, ids));
  result.addChildResult(new MaterializedRelation<>("Correlation analysis", "cop-sol", new SimpleTypeInformation<CorrelationAnalysisSolution<?>>(CorrelationAnalysisSolution.class), cop_sol, ids));
  return result;
}
Use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in the ELKI project (elki-project):
class DiSHPreferenceVectorIndex, method determinePreferenceVectorByMaxIntersection.
/**
 * Determines the preference vector with the max intersection strategy:
 * starting from the dimension with the largest neighborhood, dimensions are
 * greedily added as long as the intersection of their neighborhoods keeps
 * at least minpts objects.
 *
 * @param neighborIDs the list of ids of the neighbors in each dimension
 * @param msg a string buffer for debug messages (may be null)
 * @return the preference vector as a bit set over the dimensions
 */
private long[] determinePreferenceVectorByMaxIntersection(ModifiableDBIDs[] neighborIDs, StringBuilder msg) {
int dimensionality = neighborIDs.length;
long[] preferenceVector = BitsUtil.zero(dimensionality);
// Candidate dimensions: those whose neighborhood is large enough.
// NOTE(review): strict > minpts here, while the loop below uses
// >= minpts as the cutoff — confirm this asymmetry is intended.
Map<Integer, ModifiableDBIDs> candidates = new HashMap<>(dimensionality);
for (int i = 0; i < dimensionality; i++) {
ModifiableDBIDs s_i = neighborIDs[i];
if (s_i.size() > minpts) {
candidates.put(i, s_i);
}
}
if (msg != null) {
msg.append("\n candidates ").append(candidates.keySet());
}
if (!candidates.isEmpty()) {
// Seed with the dimension having the largest candidate neighborhood.
int i = max(candidates);
ModifiableDBIDs intersection = candidates.remove(i);
BitsUtil.setI(preferenceVector, i);
// Greedily add the dimension whose neighborhood has the largest
// intersection with the current set, until it falls below minpts.
while (!candidates.isEmpty()) {
ModifiableDBIDs newIntersection = DBIDUtil.newHashSet();
i = maxIntersection(candidates, intersection, newIntersection);
ModifiableDBIDs s_i = candidates.remove(i);
// TODO: aren't we re-computing the same intersection here?
// NOTE(review): maxIntersection appears to fill newIntersection
// already; if so, the line below is redundant work — verify
// maxIntersection's contract before removing it.
newIntersection = DBIDUtil.intersection(intersection, s_i);
intersection = newIntersection;
if (intersection.size() < minpts) {
break;
}
BitsUtil.setI(preferenceVector, i);
}
}
if (msg != null) {
msg.append("\n preference ").append(BitsUtil.toStringLow(preferenceVector, dimensionality));
LOG.debug(msg.toString());
}
return preferenceVector;
}
Aggregations