Search in sources :

Example 1 with Subspace

use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.

the class DiSH method isParent.

/**
 * Tests whether the given parent cluster is a parent of at least one of the
 * candidate child clusters.
 *
 * For each child, the subspace dimensionality between the projected centroids
 * of parent and child is computed via {@code subspaceDimensionality}; the
 * parent is accepted as soon as this value equals {@code db_dim} minus the
 * dimensionality of the parent's subspace.
 *
 * @param relation the database containing the objects
 * @param parent the parent to be tested
 * @param iter iterator over the children to be tested; it is advanced by this
 *        method and is left on the matching child when {@code true} is
 *        returned
 * @param db_dim Database dimensionality
 * @return true, if the specified parent cluster is a parent of one child of
 *         the children clusters, false otherwise
 */
private boolean isParent(Relation<V> relation, Cluster<SubspaceModel> parent, It<Cluster<SubspaceModel>> iter, int db_dim) {
    final Subspace parentSubspace = parent.getModel().getSubspace();
    final long[] parentDims = parentSubspace.getDimensions();
    final NumberVector parentCentroid = ProjectedCentroid.make(parentDims, relation, parent.getIDs());
    // Value the combined subspace dimensionality has to match.
    final int expectedDim = db_dim - parentSubspace.dimensionality();
    while (iter.valid()) {
        final Cluster<SubspaceModel> child = iter.get();
        final long[] childDims = child.getModel().getSubspace().getDimensions();
        final NumberVector childCentroid = ProjectedCentroid.make(childDims, relation, child.getIDs());
        // Dimensions set in both preference vectors.
        final long[] sharedDims = BitsUtil.andCMin(parentDims, childDims);
        final int combinedDim = subspaceDimensionality(parentCentroid, childCentroid, parentDims, childDims, sharedDims);
        if (combinedDim == expectedDim) {
            return true;
        }
        iter.advance();
    }
    return false;
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel)

Example 2 with Subspace

use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.

the class DiSH method sortClusters.

/**
 * Builds one cluster per entry of the given map and returns them sorted by
 * the dimensionality of their subspace, in descending order.
 *
 * Each cluster is named {@code "Cluster_" + <preference vector>}; if several
 * clusters share the same preference vector, the index within that group is
 * appended as {@code "_" + i} to keep the names distinct.
 *
 * @param relation the database storing the objects
 * @param clustersMap the mapping of bits sets to clusters
 * @return a sorted list of the clusters
 */
private List<Cluster<SubspaceModel>> sortClusters(Relation<V> relation, Object2ObjectMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
    final int db_dim = RelationUtil.dimensionality(relation);
    List<Cluster<SubspaceModel>> clusters = new ArrayList<>();
    for (long[] pv : clustersMap.keySet()) {
        List<ArrayModifiableDBIDs> parallelClusters = clustersMap.get(pv);
        // Only number the clusters when the preference vector is shared.
        final boolean numbered = parallelClusters.size() > 1;
        for (int i = 0; i < parallelClusters.size(); i++) {
            ArrayModifiableDBIDs c = parallelClusters.get(i);
            Cluster<SubspaceModel> cluster = new Cluster<>(c);
            cluster.setModel(new SubspaceModel(new Subspace(pv), Centroid.make(relation, c).getArrayRef()));
            String subspace = BitsUtil.toStringLow(cluster.getModel().getSubspace().getDimensions(), db_dim);
            cluster.setName(numbered ? "Cluster_" + subspace + "_" + i : "Cluster_" + subspace);
            clusters.add(cluster);
        }
    }
    // Sort by subspace dimensionality, largest first. Integer.compare is used
    // instead of subtracting the two values, which is the overflow-safe way
    // to order ints.
    Collections.sort(clusters, new Comparator<Cluster<SubspaceModel>>() {

        @Override
        public int compare(Cluster<SubspaceModel> c1, Cluster<SubspaceModel> c2) {
            return Integer.compare(c2.getModel().getSubspace().dimensionality(), c1.getModel().getSubspace().dimensionality());
        }
    });
    return clusters;
}
Also used : ArrayList(java.util.ArrayList) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Comparator(java.util.Comparator) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) Subspace(de.lmu.ifi.dbs.elki.data.Subspace)

Example 3 with Subspace

use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.

the class P3C method run.

/**
 * Performs the P3C algorithm on the given Database.
 *
 * The method runs through ten phases: grid partitioning, marking of
 * non-uniform bins, construction of 1-signatures, merging to cluster cores,
 * pruning of redundant cores, EM refinement, hard assignment, outlier
 * detection, removal of too-small clusters, and result generation.
 *
 * @param database Database (not referenced in this method body)
 * @param relation Data relation to cluster
 * @return the clustering result. If no cluster cores are found, it consists
 *         of a single cluster holding all objects, created with the flag
 *         {@code true} (the same flag that is used for the noise cluster).
 *         Otherwise it holds one top-level cluster per remaining candidate,
 *         plus a noise cluster if any objects were left unassigned.
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    // Overall progress. The total must match the number of beginStep calls
    // below (steps 1 to 10).
    StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
    if (stepProgress != null) {
        stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
    }
    // Desired number of bins, as per Sturge:
    final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
    // Perform 1-dimensional projections, and split into bins.
    SetDBIDs[][] partitions = partitionData(relation, binCount);
    if (stepProgress != null) {
        stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
    }
    // Set markers for each attribute until they're all deemed uniform.
    final long[][] markers = new long[dim][];
    for (int d = 0; d < dim; d++) {
        final SetDBIDs[] parts = partitions[d];
        if (parts == null) {
            // Never mark any on constant dimensions.
            continue;
        }
        final long[] marked = markers[d] = BitsUtil.zero(binCount);
        int card = 0;
        // NOTE(review): card counts marked bins, yet the bound is derived from
        // the dimensionality rather than binCount - confirm this is intended.
        while (card < dim - 1) {
            // Find bin with largest support, test only the dimensions that were not
            // previously marked.
            int bestBin = chiSquaredUniformTest(parts, marked, card);
            if (bestBin < 0) {
                // Uniform
                break;
            }
            BitsUtil.setI(marked, bestBin);
            card++;
        }
        if (LOG.isDebugging()) {
            LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
    }
    ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
    if (stepProgress != null) {
        stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
    }
    ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
    }
    clusterCores = pruneRedundantClusterCores(clusterCores);
    if (LOG.isVerbose()) {
        LOG.verbose("Number of cluster cores found: " + clusterCores.size());
    }
    if (clusterCores.isEmpty()) {
        // Nothing to refine: report all objects in a single cluster.
        LOG.setCompleted(stepProgress);
        Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
        c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
        return c;
    }
    if (stepProgress != null) {
        stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
    }
    // Track objects not assigned to any cluster:
    ModifiableDBIDs noise = DBIDUtil.newHashSet();
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    int k = clusterCores.size();
    List<MultivariateGaussianModel> models = new ArrayList<>(k);
    computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
    // Initial estimate of covariances, to assign noise objects
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    assignUnassigned(relation, probClusterIGivenX, models, noise);
    double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    // A negative maxEmIterations disables the iteration limit; the loop then
    // only ends once the improvement drops to emDelta or below.
    for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
        final double emOld = emNew;
        EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
        // reassign probabilities
        emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isVerbose()) {
            LOG.verbose("iteration " + it + " - expectation value: " + emNew);
        }
        if ((emNew - emOld) <= emDelta) {
            break;
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(7, "Generating hard clustering.", LOG);
    }
    // Create a hard clustering, making sure each data point only is part of one
    // cluster, based on the best match from the membership matrix.
    ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
    if (stepProgress != null) {
        stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
    }
    // Outlier detection. Remove points from clusters that have a Mahalanobis
    // distance larger than the critical value of the ChiSquare distribution.
    findOutliers(relation, models, clusterCandidates, noise);
    if (stepProgress != null) {
        stepProgress.beginStep(9, "Removing empty clusters.", LOG);
    }
    // Remove near-empty clusters; their remaining members become noise.
    for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
        ClusterCandidate cand = it.next();
        final int size = cand.ids.size();
        if (size < minClusterSize) {
            if (size > 0) {
                noise.addDBIDs(cand.ids);
            }
            it.remove();
        }
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
    }
    if (stepProgress != null) {
        stepProgress.beginStep(10, "Generating final result.", LOG);
    }
    // Generate final output.
    Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
    for (ClusterCandidate candidate : clusterCandidates) {
        CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
        result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Noise size: " + noise.size());
    }
    if (noise.size() > 0) {
        result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
    }
    LOG.ensureCompleted(stepProgress);
    return result;
}
Also used : ArrayList(java.util.ArrayList) MultivariateGaussianModel(de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 4 with Subspace

use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.

the class DOC method makeCluster.

/**
 * Builds a subspace cluster from a set of member ids and the bitmask of its
 * relevant attributes. The model's reference point is the centroid of the
 * members, computed on the given relation.
 *
 * @param relation to compute a centroid.
 * @param C the cluster points.
 * @param D the relevant dimensions.
 * @return an object representing the subspace cluster.
 */
protected Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
    // Work on a private hash-set copy; this also drops any distance values
    // attached to the incoming ids.
    final DBIDs members = DBIDUtil.newHashSet(C);
    final Cluster<SubspaceModel> result = new Cluster<>(members);
    final Subspace relevant = new Subspace(D);
    result.setModel(new SubspaceModel(relevant, Centroid.make(relation, members).getArrayRef()));
    return result;
}
Also used : SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) Cluster(de.lmu.ifi.dbs.elki.data.Cluster)

Example 5 with Subspace

use of de.lmu.ifi.dbs.elki.data.Subspace in project elki by elki-project.

the class DOC method run.

/**
 * Performs the DOC or FastDOC (as configured) algorithm on the given
 * Database.
 *
 * This will run exhaustively, i.e. run DOC until no clusters are found
 * anymore / the database size has shrunk below the threshold for minimum
 * cluster size. Objects left over at the end are reported as one additional
 * noise cluster spanning all dimensions.
 *
 * @param database Database
 * @param relation Data relation
 * @return the clusters found, plus a noise cluster if any objects remain
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    // Dimensionality of the data set.
    final int dims = RelationUtil.dimensionality(relation);
    // Working set of ids; members of accepted clusters are removed from it.
    final ArrayModifiableDBIDs remaining = DBIDUtil.newArray(relation.getDBIDs());
    // Precomputed constants as described in Figure 2.
    final double twoOverAlpha = 2. / alpha;
    final double r = Math.abs(FastMath.log(dims + dims) / FastMath.log(beta * .5));
    // Number of outer loop iterations.
    final int outerTrials = (int) twoOverAlpha;
    // Number of inner loop iterations, before capping.
    final int uncappedInner = (int) (FastMath.pow(twoOverAlpha, r) * FastMath.log(4));
    // TODO: This should only apply for FastDOC.
    final int innerTrials = Math.min(uncappedInner, Math.min(1000000, dims * dims));
    // A cluster is only accepted if it has at least this many members.
    final int minClusterSize = (int) (alpha * remaining.size());
    // Collects every cluster found.
    final Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
    // Reports the number of clusters found so far (verbose mode only).
    final IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
    // To find more than a single cluster, keep searching while the working
    // set is still larger than the minimum cluster size.
    while (remaining.size() > minClusterSize) {
        final Cluster<SubspaceModel> found = runDOC(database, relation, remaining, dims, outerTrials, innerTrials, (int) r, minClusterSize);
        if (found == null) {
            // No further cluster could be found; give up.
            break;
        }
        // Keep the cluster and take its members out of the working set.
        result.addToplevelCluster(found);
        remaining.removeDBIDs(found.getIDs());
        if (cprogress != null) {
            cprogress.setProcessed(result.getAllClusters().size(), LOG);
        }
    }
    // Whatever is left becomes noise, modelled over all dimensions.
    if (remaining.size() > 0) {
        final long[] allDims = BitsUtil.ones(dims);
        final SubspaceModel noiseModel = new SubspaceModel(new Subspace(allDims), Centroid.make(relation, remaining).getArrayRef());
        result.addToplevelCluster(new Cluster<>(remaining, true, noiseModel));
    }
    LOG.setCompleted(cprogress);
    return result;
}
Also used : IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)

Aggregations

Subspace (de.lmu.ifi.dbs.elki.data.Subspace)13 SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel)10 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)7 ArrayList (java.util.ArrayList)7 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)4 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)4 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)3 StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress)3 Pair (de.lmu.ifi.dbs.elki.utilities.pairs.Pair)3 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)2 Model (de.lmu.ifi.dbs.elki.data.model.Model)2 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)2 List (java.util.List)2 AbstractProjectedClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedClustering)1 MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel)1 CLIQUESubspace (de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace)1 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)1 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)1 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)1 SetDBIDs (de.lmu.ifi.dbs.elki.database.ids.SetDBIDs)1