Search in sources:

Example 1 with ParameterizationFunction

use of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction in project elki by elki-project.

the class CASH method buildDB.

/**
 * Projects the parameterization functions of the given ids into the subspace
 * defined by the basis and stores the projected functions in a new relation.
 * The new relation is additionally registered with a fresh proxy database
 * that is built over the same ids.
 *
 * @param dim the dimensionality of the source database; only used to report
 *        the target dimensionality (dim-1) in the debug output
 * @param basis the basis defining the subspace
 * @param ids the ids for the new database
 * @param relation the database storing the parameterization functions
 * @return a relation holding, for every id, the function projected into the
 *         specified subspace
 */
private MaterializedRelation<ParameterizationFunction> buildDB(int dim, double[][] basis, DBIDs ids, Relation<ParameterizationFunction> relation) {
    ProxyDatabase proxy = new ProxyDatabase(ids);
    SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<>(ParameterizationFunction.class);
    WritableDataStore<ParameterizationFunction> projected = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT, ParameterizationFunction.class);
    // Store the projection of each source function under the same id.
    DBIDIter it = ids.iter();
    while (it.valid()) {
        ParameterizationFunction source = relation.get(it);
        projected.put(it, project(basis, source));
        it.advance();
    }
    if (LOG.isDebugging()) {
        final int subDim = dim - 1;
        LOG.debugFine("db fuer dim " + subDim + ": " + ids.size());
    }
    MaterializedRelation<ParameterizationFunction> result = new MaterializedRelation<>(type, ids, null, projected);
    proxy.addRelation(result);
    return result;
}
Also used : ProxyDatabase(de.lmu.ifi.dbs.elki.database.ProxyDatabase) ParameterizationFunction(de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) MaterializedRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation)

Example 2 with ParameterizationFunction

use of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction in project elki by elki-project.

the class CASH method preprocess.

/**
 * Preprocess the dataset: wraps every vector of the relation in a
 * {@link ParameterizationFunction} and stores the result under the same id.
 *
 * @param db Database (not read by this method)
 * @param vrel Vector relation
 * @return Preprocessed relation of parameterization functions over the ids of
 *         {@code vrel}
 */
private Relation<ParameterizationFunction> preprocess(Database db, Relation<V> vrel) {
    final DBIDs ids = vrel.getDBIDs();
    WritableDataStore<ParameterizationFunction> functions = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT, ParameterizationFunction.class);
    // One parameterization function per input vector.
    DBIDIter it = ids.iter();
    while (it.valid()) {
        ParameterizationFunction f = new ParameterizationFunction(vrel.get(it));
        functions.put(it, f);
        it.advance();
    }
    SimpleTypeInformation<ParameterizationFunction> type = new SimpleTypeInformation<>(ParameterizationFunction.class);
    return new MaterializedRelation<>(type, ids, null, functions);
}
Also used : DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ParameterizationFunction(de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) MaterializedRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation)

Example 3 with ParameterizationFunction

use of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction in project elki by elki-project.

the class CASH method determineMinMaxDistance.

/**
 * Computes the overall minimum and maximum function value across all
 * parameterization functions of the relation. Each function is evaluated at
 * the two corners of the box that its {@code determineAlphaMinMax} returns
 * for a search box ranging from 0 to pi in each of the dimensionality-1
 * dimensions.
 *
 * @param relation the database containing the parameterization functions.
 * @param dimensionality the dimensionality of the database
 * @return a two-element array {@code {min, max}}; for a relation without
 *         entries this is {@code {+infinity, -infinity}}
 */
private double[] determineMinMaxDistance(Relation<ParameterizationFunction> relation, int dimensionality) {
    final int boxDim = dimensionality - 1;
    // Lower corner stays all zero, upper corner is pi in every dimension.
    double[] lower = new double[boxDim];
    double[] upper = new double[boxDim];
    Arrays.fill(upper, Math.PI);
    HyperBoundingBox searchBox = new HyperBoundingBox(lower, upper);

    double overallMin = Double.POSITIVE_INFINITY;
    double overallMax = Double.NEGATIVE_INFINITY;
    DBIDIter it = relation.iterDBIDs();
    while (it.valid()) {
        ParameterizationFunction fn = relation.get(it);
        HyperBoundingBox extrema = fn.determineAlphaMinMax(searchBox);
        double valueAtMin = fn.function(SpatialUtil.getMin(extrema));
        double valueAtMax = fn.function(SpatialUtil.getMax(extrema));
        overallMin = Math.min(overallMin, valueAtMin);
        overallMax = Math.max(overallMax, valueAtMax);
        it.advance();
    }
    return new double[] { overallMin, overallMax };
}
Also used : ParameterizationFunction(de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 4 with ParameterizationFunction

use of de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction in project elki by elki-project.

the class CASH method doRun.

/**
 * Runs the CASH algorithm on the specified relation. While the
 * dimensionality of the relation is larger than {@code minDim + 1}, the
 * method calls itself recursively on a projected relation built for each
 * interval taken from the heap. The main loop ends when the heap is empty or
 * no further interval is returned (only noise left).
 *
 * Besides building the result, the method adds the ids of every reported
 * cluster to the {@code processedIDs} field and reports that count to the
 * progress object.
 *
 * @param relation the Relation to run the CASH algorithm on
 * @param progress the progress object for verbose messages, may be
 *        {@code null} (then no progress is reported)
 * @return the clustering found for this relation, including the clusters
 *         contributed by the recursive lower-dimensional runs
 */
private Clustering<Model> doRun(Relation<ParameterizationFunction> relation, FiniteProgress progress) {
    Clustering<Model> res = new Clustering<>("CASH clustering", "cash-clustering");
    final int dim = dimensionality(relation);
    // init heap
    ObjectHeap<IntegerPriorityObject<CASHInterval>> heap = new ComparableMinHeap<>();
    // Start with every object counted as noise; ids are removed from this set
    // as soon as they are assigned to a cluster.
    ModifiableDBIDs noiseIDs = DBIDUtil.newHashSet(relation.getDBIDs());
    initHeap(heap, relation, dim, noiseIDs);
    if (LOG.isVerbose()) {
        LOG.verbose(new StringBuilder().append("dim ").append(dim).append(" database.size ").append(relation.size()).toString());
    }
    // get the ''best'' d-dimensional intervals at max level
    while (!heap.isEmpty()) {
        CASHInterval interval = determineNextIntervalAtMaxLevel(heap);
        if (LOG.isVerbose()) {
            LOG.verbose("next interval in dim " + dim + ": " + interval);
        }
        // only noise left
        if (interval == null) {
            break;
        }
        // do a dim-1 dimensional run
        // Ids that were put into a cluster during this iteration; they are
        // removed from the remaining heap intervals further below.
        ModifiableDBIDs clusterIDs = DBIDUtil.newHashSet();
        if (dim > minDim + 1) {
            ModifiableDBIDs ids;
            double[][] basis_dim_minus_1;
            if (adjust) {
                // runDerivator receives the empty id set together with the
                // interval and returns the basis.
                // NOTE(review): presumably it fills ids with the selected
                // objects - confirm against runDerivator.
                ids = DBIDUtil.newHashSet();
                basis_dim_minus_1 = runDerivator(relation, dim, interval, ids);
            } else {
                // Without adjustment, use the interval's ids as they are and
                // derive the basis from the interval centroid.
                ids = interval.getIDs();
                basis_dim_minus_1 = determineBasis(SpatialUtil.centroid(interval));
            }
            if (ids.size() != 0) {
                MaterializedRelation<ParameterizationFunction> db = buildDB(dim, basis_dim_minus_1, ids, relation);
                // add result of dim-1 to this result
                Clustering<Model> res_dim_minus_1 = doRun(db, progress);
                for (Cluster<Model> cluster : res_dim_minus_1.getAllClusters()) {
                    res.addToplevelCluster(cluster);
                    noiseIDs.removeDBIDs(cluster.getIDs());
                    clusterIDs.addDBIDs(cluster.getIDs());
                    processedIDs.addDBIDs(cluster.getIDs());
                }
            }
        } else // dim <= minDim + 1: no further recursion, the interval itself becomes a cluster
        {
            LinearEquationSystem les = runDerivator(relation, dim - 1, interval.getIDs());
            Cluster<Model> c = new Cluster<Model>(interval.getIDs(), new LinearEquationModel(les));
            res.addToplevelCluster(c);
            noiseIDs.removeDBIDs(interval.getIDs());
            clusterIDs.addDBIDs(interval.getIDs());
            processedIDs.addDBIDs(interval.getIDs());
        }
        // Rebuild heap
        // Copy the heap contents first, because the heap is cleared before the
        // surviving intervals are re-inserted with their current priority.
        ArrayList<IntegerPriorityObject<CASHInterval>> heapVector = new ArrayList<>(heap.size());
        for (ObjectHeap.UnsortedIter<IntegerPriorityObject<CASHInterval>> iter = heap.unsortedIter(); iter.valid(); iter.advance()) {
            heapVector.add(iter.get());
        }
        heap.clear();
        for (IntegerPriorityObject<CASHInterval> pair : heapVector) {
            CASHInterval currentInterval = pair.getObject();
            currentInterval.removeIDs(clusterIDs);
            // Intervals left with fewer than minPts ids are dropped.
            if (currentInterval.getIDs().size() >= minPts) {
                heap.add(new IntegerPriorityObject<>(currentInterval.priority(), currentInterval));
            }
        }
        if (progress != null) {
            progress.setProcessed(processedIDs.size(), LOG);
        }
    }
    // put noise to clusters
    if (!noiseIDs.isEmpty()) {
        if (dim == noiseDim) {
            // At the noise dimensionality the remaining ids are reported as a
            // noise cluster with the generic cluster model.
            res.addToplevelCluster(new Cluster<Model>(noiseIDs, true, ClusterModel.CLUSTER));
            processedIDs.addDBIDs(noiseIDs);
        } else if (noiseIDs.size() >= minPts) {
            // Otherwise, if enough ids remain, they are reported with a linear
            // equation model; note that the derivator is run on fulldatabase
            // here, not on the relation passed to this call.
            LinearEquationSystem les = runDerivator(fulldatabase, dim - 1, noiseIDs);
            res.addToplevelCluster(new Cluster<Model>(noiseIDs, true, new LinearEquationModel(les)));
            processedIDs.addDBIDs(noiseIDs);
        }
    }
    if (LOG.isDebugging()) {
        // Debug summary: noise count plus one line per cluster of this result.
        StringBuilder msg = new StringBuilder();
        msg.append("noise fuer dim ").append(dim).append(": ").append(noiseIDs.size());
        for (Cluster<Model> c : res.getAllClusters()) {
            if (c.getModel() instanceof LinearEquationModel) {
                msg.append("\n Cluster: Dim: ").append(((LinearEquationModel) c.getModel()).getLes().subspacedim());
            } else {
                msg.append("\n Cluster: ").append(c.getModel().getClass().getName());
            }
            msg.append(" size: ").append(c.size());
        }
        LOG.debugFine(msg.toString());
    }
    if (progress != null) {
        progress.setProcessed(processedIDs.size(), LOG);
    }
    return res;
}
Also used : CASHInterval(de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval) ComparableMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap) ArrayList(java.util.ArrayList) ObjectHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap) LinearEquationModel(de.lmu.ifi.dbs.elki.data.model.LinearEquationModel) ClusterModel(de.lmu.ifi.dbs.elki.data.model.ClusterModel) Model(de.lmu.ifi.dbs.elki.data.model.Model) IntegerPriorityObject(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.IntegerPriorityObject) ParameterizationFunction(de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) LinearEquationSystem(de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem)

Aggregations

ParameterizationFunction (de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction): 4, DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 3, SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation): 2, ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 2, MaterializedRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation): 2, CASHInterval (de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.CASHInterval): 1, ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel): 1, LinearEquationModel (de.lmu.ifi.dbs.elki.data.model.LinearEquationModel): 1, Model (de.lmu.ifi.dbs.elki.data.model.Model): 1, ProxyDatabase (de.lmu.ifi.dbs.elki.database.ProxyDatabase): 1, DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 1, LinearEquationSystem (de.lmu.ifi.dbs.elki.math.linearalgebra.LinearEquationSystem): 1, ComparableMinHeap (de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ComparableMinHeap): 1, IntegerPriorityObject (de.lmu.ifi.dbs.elki.utilities.datastructures.heap.IntegerPriorityObject): 1, ObjectHeap (de.lmu.ifi.dbs.elki.utilities.datastructures.heap.ObjectHeap): 1, ArrayList (java.util.ArrayList): 1