Search in sources :

Example 46 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class CASH method doRun.

/**
 * Runs the CASH algorithm on the specified relation. The method calls itself
 * recursively on a relation built by {@code buildDB} for each interval found,
 * as long as {@code dim > minDim + 1}; the original documentation describes
 * this as recursing until only noise is left.
 *
 * Side effects visible here: clustered (and, where applicable, noise) ids are
 * added to the field {@code processedIDs}, and the progress object, if given,
 * is updated with its size.
 *
 * @param relation the Relation to run the CASH algorithm on
 * @param progress the progress object for verbose messages, may be
 *        {@code null}
 * @return the clustering found for this relation: the clusters of all
 *         recursive runs as top-level clusters, plus at most one cluster
 *         holding the remaining (noise) ids
 */
private Clustering<Model> doRun(Relation<ParameterizationFunction> relation, FiniteProgress progress) {
    Clustering<Model> res = new Clustering<>("CASH clustering", "cash-clustering");
    final int dim = dimensionality(relation);
    // init heap
    ObjectHeap<IntegerPriorityObject<CASHInterval>> heap = new ComparableMinHeap<>();
    // Every id starts out as noise; ids are removed once they end up in a cluster.
    ModifiableDBIDs noiseIDs = DBIDUtil.newHashSet(relation.getDBIDs());
    // NOTE(review): initHeap is not visible here; presumably it fills the heap
    // with the initial intervals -- confirm whether it also modifies noiseIDs.
    initHeap(heap, relation, dim, noiseIDs);
    if (LOG.isVerbose()) {
        LOG.verbose(new StringBuilder().append("dim ").append(dim).append(" database.size ").append(relation.size()).toString());
    }
    // get the ''best'' d-dimensional intervals at max level
    while (!heap.isEmpty()) {
        CASHInterval interval = determineNextIntervalAtMaxLevel(heap);
        // Note: this message is emitted before the null check, so it may print "null".
        if (LOG.isVerbose()) {
            LOG.verbose("next interval in dim " + dim + ": " + interval);
        }
        // only noise left
        if (interval == null) {
            break;
        }
        // do a dim-1 dimensional run
        // Collects the ids clustered in this iteration, so that they can be
        // removed from the intervals remaining in the heap below.
        ModifiableDBIDs clusterIDs = DBIDUtil.newHashSet();
        if (dim > minDim + 1) {
            ModifiableDBIDs ids;
            double[][] basis_dim_minus_1;
            if (adjust) {
                // NOTE(review): runDerivator presumably fills "ids" as an
                // output parameter while computing the basis -- confirm.
                ids = DBIDUtil.newHashSet();
                basis_dim_minus_1 = runDerivator(relation, dim, interval, ids);
            } else {
                // Without adjustment: use the interval's ids as they are and
                // derive the basis from the interval centroid.
                ids = interval.getIDs();
                basis_dim_minus_1 = determineBasis(SpatialUtil.centroid(interval));
            }
            if (ids.size() != 0) {
                MaterializedRelation<ParameterizationFunction> db = buildDB(dim, basis_dim_minus_1, ids, relation);
                // add result of dim-1 to this result
                Clustering<Model> res_dim_minus_1 = doRun(db, progress);
                for (Cluster<Model> cluster : res_dim_minus_1.getAllClusters()) {
                    res.addToplevelCluster(cluster);
                    noiseIDs.removeDBIDs(cluster.getIDs());
                    clusterIDs.addDBIDs(cluster.getIDs());
                    processedIDs.addDBIDs(cluster.getIDs());
                }
            }
        } else // dim <= minDim + 1: no further recursion, the interval itself becomes a cluster
        {
            LinearEquationSystem les = runDerivator(relation, dim - 1, interval.getIDs());
            Cluster<Model> c = new Cluster<Model>(interval.getIDs(), new LinearEquationModel(les));
            res.addToplevelCluster(c);
            noiseIDs.removeDBIDs(interval.getIDs());
            clusterIDs.addDBIDs(interval.getIDs());
            processedIDs.addDBIDs(interval.getIDs());
        }
        // Rebuild heap
        // Copy all remaining entries out, clear the heap, then re-insert only
        // those intervals that still hold at least minPts ids after the newly
        // clustered ids were removed; the priority is re-read from the interval.
        ArrayList<IntegerPriorityObject<CASHInterval>> heapVector = new ArrayList<>(heap.size());
        for (ObjectHeap.UnsortedIter<IntegerPriorityObject<CASHInterval>> iter = heap.unsortedIter(); iter.valid(); iter.advance()) {
            heapVector.add(iter.get());
        }
        heap.clear();
        for (IntegerPriorityObject<CASHInterval> pair : heapVector) {
            CASHInterval currentInterval = pair.getObject();
            currentInterval.removeIDs(clusterIDs);
            if (currentInterval.getIDs().size() >= minPts) {
                heap.add(new IntegerPriorityObject<>(currentInterval.priority(), currentInterval));
            }
        }
        if (progress != null) {
            progress.setProcessed(processedIDs.size(), LOG);
        }
    }
    // put noise to clusters
    // If neither branch below applies (dim != noiseDim and fewer than minPts
    // noise ids), the remaining ids are not added to this result at all.
    if (!noiseIDs.isEmpty()) {
        if (dim == noiseDim) {
            // NOTE(review): the boolean "true" presumably flags the cluster as noise -- confirm.
            res.addToplevelCluster(new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER));
            processedIDs.addDBIDs(noiseIDs);
        } else if (noiseIDs.size() >= minPts) {
            // Note: the model is derived from the field "fulldatabase", not
            // from the (possibly projected) relation of this run.
            LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs);
            res.addToplevelCluster(new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les)));
            processedIDs.addDBIDs(noiseIDs);
        }
    }
    // Debug summary: number of noise ids for this dimensionality, followed by
    // one line per cluster (subspace dimensionality or model class, and size).
    if (LOG.isDebugging()) {
        StringBuilder msg = new StringBuilder();
        msg.append("noise fuer dim ").append(dim).append(": ").append(noiseIDs.size());
        for (Cluster<Model> c : res.getAllClusters()) {
            if (c.getModel() instanceof LinearEquationModel) {
                msg.append("\n Cluster: Dim: ").append(((LinearEquationModel) c.getModel()).getLes().subspacedim());
            } else {
                msg.append("\n Cluster: ").append(c.getModel().getClass().getName());
            }
            msg.append(" size: ").append(c.size());
        }
        LOG.debugFine(msg.toString());
    }
    if (progress != null) {
        progress.setProcessed(processedIDs.size(), LOG);
    }
    return res;
}
Also used : CASHInterval(de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval) ComparableMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap) ArrayList(java.util.ArrayList) ObjectHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap) LinearEquationModel(de.lmu.ifi.dbs.elki.data.model.LinearEquationModel) ClusterModel(de.lmu.ifi.dbs.elki.data.model.ClusterModel) Model(de.lmu.ifi.dbs.elki.data.model.Model) IntegerPriorityObject(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.IntegerPriorityObject) ParameterizationFunction(de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) LinearEquationSystem(de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem) LinearEquationModel(de.lmu.ifi.dbs.elki.data.model.LinearEquationModel)

Example 47 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class CASHInterval method split.

/**
 * Splits this interval into two children by halving it at the midpoint of
 * the next split dimension. Does nothing if this interval already has
 * children.
 *
 * The split dimension is the one following {@code maxSplitDimension}; once
 * the last dimension has been reached, splitting restarts at dimension 0 and
 * the children are placed one level deeper. A child is only created when
 * {@code determineIDs} returns a non-null id set for its bounding box.
 */
public void split() {
    if (hasChildren()) {
        return;
    }
    // Did the previous split already use the last dimension?
    final boolean wrapAround = maxSplitDimension >= (getDimensionality() - 1);
    final int childLevel;
    final int splitDim;
    if (wrapAround) {
        childLevel = level + 1;
        splitDim = 0;
    } else {
        childLevel = level;
        splitDim = maxSplitDimension + 1;
    }
    final double lowerBound = getMin(splitDim);
    final double upperBound = getMax(splitDim);
    final double splitPoint = lowerBound + (upperBound - lowerBound) * .5;
    // First pass builds the right child (upper half), second pass the left
    // child (lower half).
    for (int pass = 0; pass < 2; pass++) {
        final boolean upperHalf = (pass == 0);
        // Fresh copies of the bounds of this interval
        final double[] childMin = SpatialUtil.getMin(this);
        final double[] childMax = SpatialUtil.getMax(this);
        if (upperHalf) {
            childMin[splitDim] = splitPoint;
        } else {
            childMax[splitDim] = splitPoint;
        }
        final ModifiableDBIDs members = split.determineIDs(getIDs(), new HyperBoundingBox(childMin, childMax), d_min, d_max);
        if (members == null) {
            // No id set for this half: leave the corresponding child unset
            continue;
        }
        final CASHInterval child = new CASHInterval(childMin, childMax, split, members, splitDim, childLevel, d_min, d_max);
        if (upperHalf) {
            rightChild = child;
        } else {
            leftChild = child;
        }
    }
    if (LOG.isDebuggingFine()) {
        final StringBuilder buf = new StringBuilder();
        buf.append("Child level ").append(childLevel);
        buf.append(",  split Dim   ").append(splitDim);
        if (leftChild != null) {
            buf.append("\nleft   ").append(leftChild);
        }
        if (rightChild != null) {
            buf.append("\nright   ").append(rightChild);
        }
        LOG.fine(buf.toString());
    }
}
Also used : ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) HyperBoundingBox(de.lmu.ifi.dbs.elki.data.HyperBoundingBox)

Example 48 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class CASHIntervalSplit method determineIDs.

/**
 * Determines the ids belonging to the given interval, i.e. those
 * parameterization functions whose value range over the interval overlaps
 * {@code [d_min, d_max]}.
 *
 * The minimum and maximum function values per id are cached per interval in
 * {@code f_minima} / {@code f_maxima} and only computed when no cached minimum
 * exists for an id.
 *
 * @param superSetIDs a superset of the ids to be determined
 * @param interval the hyper bounding box defining the interval of alpha
 *        values
 * @param d_min the minimum distance value for the interval
 * @param d_max the maximum distance value for the interval
 * @return the ids belonging to the given interval if there are at least
 *         {@code minPts} of them, {@code null} otherwise
 * @throws IllegalArgumentException if for some id the minimum exceeds the
 *         maximum by more than {@code ParameterizationFunction.DELTA}
 */
public ModifiableDBIDs determineIDs(DBIDs superSetIDs, HyperBoundingBox interval, double d_min, double d_max) {
    // Debug buffer; stays null unless debugging is enabled
    final StringBuilder buf;
    if (LOG.isDebugging()) {
        buf = new StringBuilder();
        buf.append("interval ").append(interval);
    } else {
        buf = null;
    }
    final ModifiableDBIDs selected = DBIDUtil.newHashSet(superSetIDs.size());
    // Look up the per-interval caches, creating both if either is missing
    Map<DBID, Double> cachedMin = f_minima.get(interval);
    Map<DBID, Double> cachedMax = f_maxima.get(interval);
    if (cachedMin == null || cachedMax == null) {
        cachedMin = new HashMap<>();
        f_minima.put(interval, cachedMin);
        cachedMax = new HashMap<>();
        f_maxima.put(interval, cachedMax);
    }
    for (DBIDIter it = superSetIDs.iter(); it.valid(); it.advance()) {
        final DBID id = DBIDUtil.deref(it);
        Double f_min = cachedMin.get(id);
        Double f_max = cachedMax.get(id);
        if (f_min == null) {
            // Cache miss: evaluate the function at its extremal alpha values
            final ParameterizationFunction func = database.get(id);
            final HyperBoundingBox extrema = func.determineAlphaMinMax(interval);
            f_min = func.function(SpatialUtil.getMin(extrema));
            f_max = func.function(SpatialUtil.getMax(extrema));
            cachedMin.put(id, f_min);
            cachedMax.put(id, f_max);
        }
        if (buf != null) {
            buf.append("\n\nf_min ").append(f_min);
            buf.append("\nf_max ").append(f_max);
            buf.append("\nd_min ").append(d_min);
            buf.append("\nd_max ").append(d_max);
        }
        // Sanity check: the minimum must not exceed the maximum (up to DELTA)
        if (f_min - f_max > ParameterizationFunction.DELTA) {
            throw new IllegalArgumentException("Houston, we have a problem: f_min > f_max! " + "\nf_min[" + FormatUtil.format(SpatialUtil.centroid(interval)) + "] = " + f_min + "\nf_max[" + FormatUtil.format(SpatialUtil.centroid(interval)) + "] = " + f_max + "\nf " + database.get(id));
        }
        // The id belongs to the interval if [f_min, f_max] overlaps [d_min, d_max]
        final boolean overlaps = f_min <= d_max && f_max >= d_min;
        if (overlaps) {
            selected.add(id);
        }
        if (buf != null) {
            buf.append("\nid ").append(id).append(overlaps ? " appended" : " NOT appended");
        }
    }
    if (buf != null) {
        buf.append("\nchildIds ").append(selected.size());
        LOG.debugFine(buf.toString());
    }
    return selected.size() < minPts ? null : selected;
}
Also used : DBID(de.lmu.ifi.dbs.elki.database.ids.DBID) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) HyperBoundingBox(de.lmu.ifi.dbs.elki.data.HyperBoundingBox) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 49 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class SimpleCOP method run.

/**
 * Computes a correlation outlier score for every object of the relation.
 *
 * For each object, its {@code k + 1} nearest neighbors are queried, the
 * object itself is removed from that set, and a correlation model is derived
 * from the remaining neighbors. The score is
 * {@code erf(distance / (stddev * sqrt(2)))}, where {@code distance} is the
 * distance of the object to the model and {@code stddev} is the standard
 * deviation reported by the model.
 *
 * Besides the scores, the local dimensionality, error vectors, data
 * projections and the models themselves are attached as child results.
 *
 * @param database the database (not used in this method body)
 * @param data the relation to process
 * @return the outlier result with the additional child results attached
 */
public OutlierResult run(Database database, Relation<V> data) throws IllegalStateException {
    KNNQuery<V> knnQuery = QueryUtil.getKNNQuery(data, getDistanceFunction(), k + 1);
    DBIDs ids = data.getDBIDs();
    // All stores share the same storage hints
    final int hints = DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC;
    WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(ids, hints);
    WritableDataStore<double[]> errorVectors = DataStoreUtil.makeStorage(ids, hints, double[].class);
    WritableDataStore<double[][]> dataVectors = DataStoreUtil.makeStorage(ids, hints, double[][].class);
    WritableIntegerDataStore localDims = DataStoreUtil.makeIntegerStorage(ids, hints, -1);
    WritableDataStore<CorrelationAnalysisSolution<?>> solutions = DataStoreUtil.makeStorage(ids, hints, CorrelationAnalysisSolution.class);

    // Derive a local correlation model from the neighborhood of each object
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Correlation Outlier Probabilities", data.size(), LOG) : null;
    for (DBIDIter it = data.iterDBIDs(); it.valid(); it.advance()) {
        KNNList knn = knnQuery.getKNNForDBID(it, k + 1);
        ModifiableDBIDs neighborhood = DBIDUtil.newArray(knn);
        // The query object itself must not contribute to its own model
        neighborhood.remove(it);
        // TODO: do we want to use the query point as centroid?
        CorrelationAnalysisSolution<V> model = dependencyDerivator.generateModel(data, neighborhood);
        double stddev = model.getStandardDeviation();
        double distance = model.distance(data.get(it));
        scores.putDouble(it, NormalDistribution.erf(distance / (stddev * MathUtil.SQRT2)));
        // Error vector is stored multiplied by -1
        errorVectors.put(it, times(model.errorVector(data.get(it)), -1));
        dataVectors.put(it, model.dataProjections(data.get(it)));
        localDims.putInt(it, model.getCorrelationDimensionality());
        solutions.put(it, model);
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);

    // Assemble the primary score result
    DoubleRelation scoreRelation = new MaterializedDoubleRelation("Original Correlation Outlier Probabilities", "origcop-outlier", scores, ids);
    OutlierScoreMeta meta = new ProbabilisticOutlierScore();
    OutlierResult outlierResult = new OutlierResult(meta, scoreRelation);
    // Attach the auxiliary per-object results
    outlierResult.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP.COP_DIM, TypeUtil.INTEGER, localDims, ids));
    outlierResult.addChildResult(new MaterializedRelation<>("Error vectors", COP.COP_ERRORVEC, TypeUtil.DOUBLE_ARRAY, errorVectors, ids));
    outlierResult.addChildResult(new MaterializedRelation<>("Data vectors", "cop-datavec", TypeUtil.MATRIX, dataVectors, ids));
    outlierResult.addChildResult(new MaterializedRelation<>("Correlation analysis", "cop-sol", new SimpleTypeInformation<CorrelationAnalysisSolution<?>>(CorrelationAnalysisSolution.class), solutions, ids));
    return outlierResult;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) ProbabilisticOutlierScore(de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) CorrelationAnalysisSolution(de.lmu.ifi.dbs.elki.data.model.CorrelationAnalysisSolution) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)

Example 50 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class DiSHPreferenceVectorIndex method determinePreferenceVectorByMaxIntersection.

/**
 * Determines the preference vector with the max intersection strategy.
 *
 * Only dimensions whose neighbor set holds more than {@code minpts} ids are
 * candidates. Starting from the dimension chosen by {@code max}, further
 * dimensions are added in the order given by {@code maxIntersection}, as
 * long as the running intersection keeps at least {@code minpts} ids.
 *
 * @param neighborIDs the list of ids of the neighbors in each dimension
 * @param msg a string buffer for debug messages, or {@code null} to disable
 *        debug output
 * @return the preference vector
 */
private long[] determinePreferenceVectorByMaxIntersection(ModifiableDBIDs[] neighborIDs, StringBuilder msg) {
    final int dims = neighborIDs.length;
    final long[] prefVec = BitsUtil.zero(dims);
    // Collect the dimensions with a sufficiently large neighborhood
    final Map<Integer, ModifiableDBIDs> remaining = new HashMap<>(dims);
    for (int d = 0; d < dims; d++) {
        final ModifiableDBIDs neighbors = neighborIDs[d];
        if (neighbors.size() > minpts) {
            remaining.put(d, neighbors);
        }
    }
    if (msg != null) {
        msg.append("\n candidates ").append(remaining.keySet());
    }
    if (!remaining.isEmpty()) {
        // Seed with the dimension selected by max()
        int best = max(remaining);
        ModifiableDBIDs current = remaining.remove(best);
        BitsUtil.setI(prefVec, best);
        while (!remaining.isEmpty()) {
            ModifiableDBIDs scratch = DBIDUtil.newHashSet();
            best = maxIntersection(remaining, current, scratch);
            ModifiableDBIDs chosen = remaining.remove(best);
            // TODO: aren't we re-computing the same intersection here?
            current = DBIDUtil.intersection(current, chosen);
            if (current.size() < minpts) {
                // Intersection became too small: stop without setting this dimension
                break;
            }
            BitsUtil.setI(prefVec, best);
        }
    }
    if (msg != null) {
        msg.append("\n preference ").append(BitsUtil.toStringLow(prefVec, dims));
        LOG.debug(msg.toString());
    }
    return prefVec;
}
Also used : HashMap(java.util.HashMap) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Aggregations

ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)80 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)44 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)30 ArrayList (java.util.ArrayList)30 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)28 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)18 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)15 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)14 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)12 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)12 Model (de.lmu.ifi.dbs.elki.data.model.Model)11 DBID (de.lmu.ifi.dbs.elki.database.ids.DBID)11 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)10 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)10 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)9 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)9 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)8 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)8 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)7