Search in sources :

Example 6 with SubspaceModel

use of de.lmu.ifi.dbs.elki.data.model.SubspaceModel in project elki by elki-project.

the class P3C method run.

/**
 * Performs the P3C algorithm on the given Database.
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    // Overall progress.
    StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(8) : null;
    if (stepProgress != null) {
        stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
    }
    // Desired number of bins, as per Sturge:
    final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
    // Perform 1-dimensional projections, and split into bins.
    SetDBIDs[][] partitions = partitionData(relation, binCount);
    if (stepProgress != null) {
        stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
    }
    // Set markers for each attribute until they're all deemed uniform.
    final long[][] markers = new long[dim][];
    for (int d = 0; d < dim; d++) {
        final SetDBIDs[] parts = partitions[d];
        if (parts == null) {
            // Never mark any on constant dimensions.
            continue;
        }
        final long[] marked = markers[d] = BitsUtil.zero(binCount);
        int card = 0;
        while (card < dim - 1) {
            // Find bin with largest support, test only the dimensions that were not
            // previously marked.
            int bestBin = chiSquaredUniformTest(parts, marked, card);
            if (bestBin < 0) {
                // Uniform
                break;
            }
            BitsUtil.setI(marked, bestBin);
            card++;
        }
        if (LOG.isDebugging()) {
            LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
    }
    ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
    if (stepProgress != null) {
        stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
    }
    ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
    }
    clusterCores = pruneRedundantClusterCores(clusterCores);
    if (LOG.isVerbose()) {
        LOG.verbose("Number of cluster cores found: " + clusterCores.size());
    }
    if (clusterCores.isEmpty()) {
        LOG.setCompleted(stepProgress);
        Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
        c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
        return c;
    }
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Refining cluster cores to clusters via EM.", LOG);
    }
    // Track objects not assigned to any cluster:
    ModifiableDBIDs noise = DBIDUtil.newHashSet();
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    int k = clusterCores.size();
    List<MultivariateGaussianModel> models = new ArrayList<>(k);
    computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
    // Initial estimate of covariances, to assign noise objects
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    assignUnassigned(relation, probClusterIGivenX, models, noise);
    double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
        final double emOld = emNew;
        EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
        // reassign probabilities
        emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isVerbose()) {
            LOG.verbose("iteration " + it + " - expectation value: " + emNew);
        }
        if ((emNew - emOld) <= emDelta) {
            break;
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(6, "Generating hard clustering.", LOG);
    }
    // Create a hard clustering, making sure each data point only is part of one
    // cluster, based on the best match from the membership matrix.
    ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
    if (stepProgress != null) {
        stepProgress.beginStep(7, "Looking for outliers and moving them to the noise set.", LOG);
    }
    // Outlier detection. Remove points from clusters that have a Mahalanobis
    // distance larger than the critical value of the ChiSquare distribution.
    findOutliers(relation, models, clusterCandidates, noise);
    if (stepProgress != null) {
        stepProgress.beginStep(8, "Removing empty clusters.", LOG);
    }
    // Remove near-empty clusters.
    for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
        ClusterCandidate cand = it.next();
        final int size = cand.ids.size();
        if (size < minClusterSize) {
            if (size > 0) {
                noise.addDBIDs(cand.ids);
            }
            it.remove();
        }
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
    }
    if (stepProgress != null) {
        stepProgress.beginStep(9, "Generating final result.", LOG);
    }
    // Generate final output.
    Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
    for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
        ClusterCandidate candidate = clusterCandidates.get(cluster);
        CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
        result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
    }
    LOG.verbose("Noise size: " + noise.size());
    if (noise.size() > 0) {
        result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
    }
    LOG.ensureCompleted(stepProgress);
    return result;
}
Also used : ArrayList(java.util.ArrayList) MultivariateGaussianModel(de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 7 with SubspaceModel

use of de.lmu.ifi.dbs.elki.data.model.SubspaceModel in project elki by elki-project.

the class DOC method makeCluster.

/**
 * Utility method to create a subspace cluster from a list of DBIDs and the
 * relevant attributes.
 *
 * @param relation to compute a centroid.
 * @param C the cluster points.
 * @param D the relevant dimensions.
 * @return an object representing the subspace cluster.
 */
protected Cluster<SubspaceModel> makeCluster(Relation<V> relation, DBIDs C, long[] D) {
    // copy, also to lose distance values!
    DBIDs ids = DBIDUtil.newHashSet(C);
    Cluster<SubspaceModel> cluster = new Cluster<>(ids);
    cluster.setModel(new SubspaceModel(new Subspace(D), Centroid.make(relation, ids).getArrayRef()));
    return cluster;
}
Also used : SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) Cluster(de.lmu.ifi.dbs.elki.data.Cluster)

Example 8 with SubspaceModel

use of de.lmu.ifi.dbs.elki.data.model.SubspaceModel in project elki by elki-project.

the class DOC method run.

/**
 * Performs the DOC or FastDOC (as configured) algorithm on the given
 * Database.
 *
 * This will run exhaustively, i.e. run DOC until no clusters are found
 * anymore / the database size has shrunk below the threshold for minimum
 * cluster size.
 *
 * @param database Database
 * @param relation Data relation
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    // Dimensionality of our set.
    final int d = RelationUtil.dimensionality(relation);
    // Get available DBIDs as a set we can remove items from.
    ArrayModifiableDBIDs S = DBIDUtil.newArray(relation.getDBIDs());
    // Precompute values as described in Figure 2.
    double r = Math.abs(FastMath.log(d + d) / FastMath.log(beta * .5));
    // Outer loop count.
    int n = (int) (2. / alpha);
    // Inner loop count.
    int m = (int) (FastMath.pow(2. / alpha, r) * FastMath.log(4));
    // TODO: This should only apply for FastDOC.
    m = Math.min(m, Math.min(1000000, d * d));
    // Minimum size for a cluster for it to be accepted.
    int minClusterSize = (int) (alpha * S.size());
    // List of all clusters we found.
    Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
    // Inform the user about the number of actual clusters found so far.
    IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;
    // of points is empty.
    while (S.size() > minClusterSize) {
        Cluster<SubspaceModel> C = runDOC(database, relation, S, d, n, m, (int) r, minClusterSize);
        if (C == null) {
            // Stop trying if we couldn't find a cluster.
            break;
        }
        // Found a cluster, remember it, remove its points from the set.
        result.addToplevelCluster(C);
        // Remove all points of the cluster from the set and continue.
        S.removeDBIDs(C.getIDs());
        if (cprogress != null) {
            cprogress.setProcessed(result.getAllClusters().size(), LOG);
        }
    }
    // Add the remainder as noise.
    if (S.size() > 0) {
        long[] alldims = BitsUtil.ones(d);
        result.addToplevelCluster(new Cluster<>(S, true, new SubspaceModel(new Subspace(alldims), Centroid.make(relation, S).getArrayRef())));
    }
    LOG.setCompleted(cprogress);
    return result;
}
Also used : IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) Clustering(de.lmu.ifi.dbs.elki.data.Clustering)

Example 9 with SubspaceModel

use of de.lmu.ifi.dbs.elki.data.model.SubspaceModel in project elki by elki-project.

the class PROCLUS method run.

/**
 * Performs the PROCLUS algorithm on the given database.
 *
 * @param database Database to process
 * @param relation Relation to process
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    if (RelationUtil.dimensionality(relation) < l) {
        throw new IllegalStateException("Dimensionality of data < parameter l! (" + RelationUtil.dimensionality(relation) + " < " + l + ")");
    }
    DistanceQuery<V> distFunc = database.getDistanceQuery(relation, SquaredEuclideanDistanceFunction.STATIC);
    RangeQuery<V> rangeQuery = database.getRangeQuery(distFunc);
    final Random random = rnd.getSingleThreadedRandom();
    // initialization phase
    if (LOG.isVerbose()) {
        LOG.verbose("1. Initialization phase...");
    }
    int sampleSize = Math.min(relation.size(), k_i * k);
    DBIDs sampleSet = DBIDUtil.randomSample(relation.getDBIDs(), sampleSize, random);
    int medoidSize = Math.min(relation.size(), m_i * k);
    ArrayDBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random);
    if (LOG.isDebugging()) {
        LOG.debugFine(// 
        new StringBuilder().append("sampleSize ").append(sampleSize).append('\n').append("sampleSet ").append(sampleSet).append(// 
        '\n').append("medoidSize ").append(medoidSize).append(// 
        '\n').append("m ").append(medoids).toString());
    }
    // iterative phase
    if (LOG.isVerbose()) {
        LOG.verbose("2. Iterative phase...");
    }
    double bestObjective = Double.POSITIVE_INFINITY;
    ArrayDBIDs m_best = null;
    DBIDs m_bad = null;
    ArrayDBIDs m_current = initialSet(medoids, k, random);
    if (LOG.isDebugging()) {
        LOG.debugFine(new StringBuilder().append("m_c ").append(m_current).toString());
    }
    IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null;
    ArrayList<PROCLUSCluster> clusters = null;
    int loops = 0;
    while (loops < 10) {
        long[][] dimensions = findDimensions(m_current, relation, distFunc, rangeQuery);
        clusters = assignPoints(m_current, dimensions, relation);
        double objectiveFunction = evaluateClusters(clusters, dimensions, relation);
        if (objectiveFunction < bestObjective) {
            // restart counting loops
            loops = 0;
            bestObjective = objectiveFunction;
            m_best = m_current;
            m_bad = computeBadMedoids(m_current, clusters, (int) (relation.size() * 0.1 / k));
        }
        m_current = computeM_current(medoids, m_best, m_bad, random);
        loops++;
        if (cprogress != null) {
            cprogress.setProcessed(clusters.size(), LOG);
        }
    }
    LOG.setCompleted(cprogress);
    // refinement phase
    if (LOG.isVerbose()) {
        LOG.verbose("3. Refinement phase...");
    }
    List<Pair<double[], long[]>> dimensions = findDimensions(clusters, relation);
    List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation);
    // build result
    int numClusters = 1;
    Clustering<SubspaceModel> result = new Clustering<>("ProClus clustering", "proclus-clustering");
    for (PROCLUSCluster c : finalClusters) {
        Cluster<SubspaceModel> cluster = new Cluster<>(c.objectIDs);
        cluster.setModel(new SubspaceModel(new Subspace(c.getDimensions()), c.centroid));
        cluster.setName("cluster_" + numClusters++);
        result.addToplevelCluster(cluster);
    }
    return result;
}
Also used : ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) AbstractProjectedClustering(de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedClustering) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) Random(java.util.Random) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) Pair(de.lmu.ifi.dbs.elki.utilities.pairs.Pair)

Example 10 with SubspaceModel

use of de.lmu.ifi.dbs.elki.data.model.SubspaceModel in project elki by elki-project.

the class CLIQUE method run.

/**
 * Performs the CLIQUE algorithm on the given database.
 *
 * @param relation Data relation to process
 * @return Clustering result
 */
public Clustering<SubspaceModel> run(Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);
    StepProgress step = new StepProgress(2);
    // 1. Identification of subspaces that contain clusters
    step.beginStep(1, "Identification of subspaces that contain clusters", LOG);
    ArrayList<List<CLIQUESubspace<V>>> dimensionToDenseSubspaces = new ArrayList<>(dimensionality);
    List<CLIQUESubspace<V>> denseSubspaces = findOneDimensionalDenseSubspaces(relation);
    dimensionToDenseSubspaces.add(denseSubspaces);
    if (LOG.isVerbose()) {
        LOG.verbose("1-dimensional dense subspaces: " + denseSubspaces.size());
    }
    if (LOG.isDebugging()) {
        for (CLIQUESubspace<V> s : denseSubspaces) {
            LOG.debug(s.toString(" "));
        }
    }
    for (int k = 2; k <= dimensionality && !denseSubspaces.isEmpty(); k++) {
        denseSubspaces = findDenseSubspaces(relation, denseSubspaces);
        assert (dimensionToDenseSubspaces.size() == k - 1);
        dimensionToDenseSubspaces.add(denseSubspaces);
        if (LOG.isVerbose()) {
            LOG.verbose(k + "-dimensional dense subspaces: " + denseSubspaces.size());
        }
        if (LOG.isDebugging()) {
            for (CLIQUESubspace<V> s : denseSubspaces) {
                LOG.debug(s.toString(" "));
            }
        }
    }
    // 2. Identification of clusters
    step.beginStep(2, "Identification of clusters", LOG);
    // build result
    Clustering<SubspaceModel> result = new Clustering<>("CLIQUE clustering", "clique-clustering");
    for (int dim = 0; dim < dimensionToDenseSubspaces.size(); dim++) {
        List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim);
        List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces);
        if (LOG.isVerbose()) {
            LOG.verbose((dim + 1) + "-dimensional clusters: " + modelsAndClusters.size());
        }
        for (Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) {
            Cluster<SubspaceModel> newCluster = new Cluster<>(modelAndCluster.second);
            newCluster.setModel(new SubspaceModel(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).getArrayRef()));
            result.addToplevelCluster(newCluster);
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) CLIQUESubspace(de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace) CLIQUESubspace(de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) ArrayList(java.util.ArrayList) List(java.util.List) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) Pair(de.lmu.ifi.dbs.elki.utilities.pairs.Pair)

Aggregations

SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel)16 Subspace (de.lmu.ifi.dbs.elki.data.Subspace)9 AbstractClusterAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractClusterAlgorithmTest)7 Database (de.lmu.ifi.dbs.elki.database.Database)7 Test (org.junit.Test)7 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)6 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)5 ELKIBuilder (de.lmu.ifi.dbs.elki.utilities.ELKIBuilder)5 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)4 ArrayList (java.util.ArrayList)4 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)3 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)3 StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress)3 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)2 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)2 Pair (de.lmu.ifi.dbs.elki.utilities.pairs.Pair)2 List (java.util.List)2 AbstractProjectedClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractProjectedClustering)1 MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel)1 CLIQUESubspace (de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.clique.CLIQUESubspace)1