Example 1 with MultivariateGaussianModel

Use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.

From the class P3C, method run:

/**
 * Performs the P3C algorithm on the given Database.
 *
 * @param database Database to process
 * @param relation Vector relation to cluster
 * @return Clustering result
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    // Overall progress.
    StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
    if (stepProgress != null) {
        stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
    }
    // Desired number of bins, as per Sturges' rule:
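    // (For example, a relation of 1000 objects yields ceil(1 + log2(1000)) = 11 bins.)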
    final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
    // Perform 1-dimensional projections, and split into bins.
    SetDBIDs[][] partitions = partitionData(relation, binCount);
    if (stepProgress != null) {
        stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
    }
    // Set markers for each attribute until they're all deemed uniform.
    final long[][] markers = new long[dim][];
    for (int d = 0; d < dim; d++) {
        final SetDBIDs[] parts = partitions[d];
        if (parts == null) {
            // Never mark any on constant dimensions.
            continue;
        }
        final long[] marked = markers[d] = BitsUtil.zero(binCount);
        int card = 0;
        while (card < dim - 1) {
            // Find the bin with the largest support, testing only the bins that
            // were not previously marked.
            int bestBin = chiSquaredUniformTest(parts, marked, card);
            if (bestBin < 0) {
                // Uniform
                break;
            }
            BitsUtil.setI(marked, bestBin);
            card++;
        }
        if (LOG.isDebugging()) {
            LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
    }
    ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
    if (stepProgress != null) {
        stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
    }
    ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
    }
    clusterCores = pruneRedundantClusterCores(clusterCores);
    if (LOG.isVerbose()) {
        LOG.verbose("Number of cluster cores found: " + clusterCores.size());
    }
    if (clusterCores.isEmpty()) {
        LOG.setCompleted(stepProgress);
        Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
        c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
        return c;
    }
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Refining cluster cores to clusters via EM.", LOG);
    }
    // Track objects not assigned to any cluster:
    ModifiableDBIDs noise = DBIDUtil.newHashSet();
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    int k = clusterCores.size();
    List<MultivariateGaussianModel> models = new ArrayList<>(k);
    computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
    // Initial estimate of covariances, to assign noise objects
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    assignUnassigned(relation, probClusterIGivenX, models, noise);
    double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
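    // EM refinement: alternately re-estimate the Gaussian models (M step) and the
    // membership probabilities (E step), stopping once the objective improves by no
    // more than emDelta, or after maxEmIterations (a negative value means no limit).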
    for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
        final double emOld = emNew;
        EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
        // reassign probabilities
        emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isVerbose()) {
            LOG.verbose("iteration " + it + " - expectation value: " + emNew);
        }
        if ((emNew - emOld) <= emDelta) {
            break;
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(6, "Generating hard clustering.", LOG);
    }
    // Create a hard clustering, ensuring that each data point is part of only one
    // cluster, based on the best match from the membership matrix.
    ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
    if (stepProgress != null) {
        stepProgress.beginStep(7, "Looking for outliers and moving them to the noise set.", LOG);
    }
    // Outlier detection. Remove points from clusters that have a Mahalanobis
    // distance larger than the critical value of the ChiSquare distribution.
    findOutliers(relation, models, clusterCandidates, noise);
    if (stepProgress != null) {
        stepProgress.beginStep(8, "Removing empty clusters.", LOG);
    }
    // Remove near-empty clusters.
    for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
        ClusterCandidate cand = it.next();
        final int size = cand.ids.size();
        if (size < minClusterSize) {
            if (size > 0) {
                noise.addDBIDs(cand.ids);
            }
            it.remove();
        }
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
    }
    if (stepProgress != null) {
        stepProgress.beginStep(9, "Generating final result.", LOG);
    }
    // Generate final output.
    Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
    for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
        ClusterCandidate candidate = clusterCandidates.get(cluster);
        CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
        result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
    }
    LOG.verbose("Noise size: " + noise.size());
    if (noise.size() > 0) {
        result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
    }
    LOG.ensureCompleted(stepProgress);
    return result;
}
Also used: ArrayList (java.util.ArrayList), MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel), Subspace (de.lmu.ifi.dbs.elki.data.Subspace), SetDBIDs (de.lmu.ifi.dbs.elki.database.ids.SetDBIDs), SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel), StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress), Clustering (de.lmu.ifi.dbs.elki.data.Clustering), CovarianceMatrix (de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix), ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs), HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs), ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)
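For orientation, here is a minimal, hypothetical driver sketch (not part of the sources above) showing how this run method might be invoked and how the resulting Clustering<SubspaceModel> could be consumed. The pre-built Database, the already-parameterized P3C instance, and the P3C import path are assumptions of this sketch, not something shown in the example.

import de.lmu.ifi.dbs.elki.algorithm.clustering.subspace.P3C;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.data.model.SubspaceModel;
import de.lmu.ifi.dbs.elki.data.type.TypeUtil;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.relation.Relation;

public class P3CExample {
    // `db` is assumed to be an initialized Database, `p3c` an already-configured instance.
    static void runAndReport(Database db, P3C<NumberVector> p3c) {
        Relation<NumberVector> relation = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
        Clustering<SubspaceModel> clustering = p3c.run(db, relation);
        for (Cluster<SubspaceModel> cluster : clustering.getAllClusters()) {
            if (cluster.isNoise()) {
                System.out.println("Noise: " + cluster.getIDs().size() + " objects");
                continue;
            }
            // Each non-noise cluster carries its relevant subspace and mean vector.
            System.out.println("Cluster: " + cluster.getIDs().size() + " objects, "
                + cluster.getModel().getSubspace().dimensionality() + " relevant dimensions");
        }
    }
}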

Example 2 with MultivariateGaussianModel

Use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.

From the class P3C, method computeFuzzyMembership:

/**
 * Computes a fuzzy membership with the weights based on which cluster cores
 * each data point is part of.
 *
 * @param relation Data relation
 * @param clusterCores the cluster cores.
 * @param unassigned set to which to add unassigned points.
 * @param probClusterIGivenX Membership probabilities.
 * @param models Cluster models.
 * @param dim Dimensionality
 */
private void computeFuzzyMembership(Relation<V> relation, ArrayList<Signature> clusterCores, ModifiableDBIDs unassigned, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, int dim) {
    final int n = relation.size();
    // Weight of each point
    final double pweight = 1. / n;
    final int k = clusterCores.size();
    double[] clusterWeights = new double[k];
    for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
        int count = 0;
        double[] weights = new double[k];
        for (int cluster = 0; cluster < k; ++cluster) {
            if (clusterCores.get(cluster).ids.contains(iter)) {
                weights[cluster] = 1.;
                ++count;
            }
        }
        // Set value(s) in membership matrix.
        if (count > 0) {
            // Rescale.
            VMath.timesEquals(weights, 1. / count);
            VMath.plusTimesEquals(clusterWeights, weights, pweight);
        } else {
            // Does not match any cluster, mark it.
            unassigned.add(iter);
        }
        probClusterIGivenX.put(iter, weights);
    }
    for (int i = 0; i < k; i++) {
        models.add(new MultivariateGaussianModel(clusterWeights[i], new double[dim]));
    }
}
Also used: MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel), DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)
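As a quick numeric illustration (hypothetical values, not taken from the sources), this is what the weighting above produces for a single point when there are k = 3 cluster cores:

// Hypothetical illustration of the membership weights for one point, k = 3 cluster cores.
// Suppose the point is contained in cores 0 and 2, but not in core 1:
double[] weights = { 1., 0., 1. };
int count = 2;
// Rescale so this point's memberships sum to 1 (the VMath.timesEquals step above):
for (int i = 0; i < weights.length; i++) {
    weights[i] /= count;
}
// weights is now { 0.5, 0.0, 0.5 }. Each cluster's overall weight then grows by
// weights[i] * pweight = weights[i] / n; with n = 100 points, cores 0 and 2 each
// gain 0.005 and core 1 gains nothing.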

Example 3 with MultivariateGaussianModel

Use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.

From the class P3C, method assignUnassigned:

/**
 * Assign unassigned objects to best candidate based on shortest Mahalanobis
 * distance.
 *
 * @param relation Data relation
 * @param probClusterIGivenX fuzzy membership matrix.
 * @param models Cluster models.
 * @param unassigned the list of points not yet assigned.
 */
private void assignUnassigned(Relation<V> relation, WritableDataStore<double[]> probClusterIGivenX, List<MultivariateGaussianModel> models, ModifiableDBIDs unassigned) {
    if (unassigned.size() == 0) {
        return;
    }
    final int k = models.size();
    double pweight = 1. / relation.size();
    // Rescale weights, to take unassigned points into account:
    for (EMClusterModel<?> m : models) {
        m.setWeight(m.getWeight() * (relation.size() - unassigned.size()) * pweight);
    }
    // Assign noise objects, increase weights accordingly.
    for (DBIDIter iter = unassigned.iter(); iter.valid(); iter.advance()) {
        // Find the best matching known cluster core using the Mahalanobis
        // distance.
        V v = relation.get(iter);
        int bestCluster = -1;
        MultivariateGaussianModel bestModel = null;
        double minDistance = Double.POSITIVE_INFINITY;
        int c = 0;
        for (MultivariateGaussianModel model : models) {
            final double distance = model.mahalanobisDistance(v);
            if (distance < minDistance) {
                minDistance = distance;
                bestCluster = c;
                bestModel = model;
            }
            c++;
        }
        // Assign to best core.
        double[] weights = new double[k];
        weights[bestCluster] = 1.;
        bestModel.setWeight(bestModel.getWeight() + pweight);
        probClusterIGivenX.put(iter, weights);
    }
    // Clear the list of unassigned objects.
    unassigned.clear();
}
Also used: MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel), DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)
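A short numeric sketch (assumed values, not from the sources) of the weight rescaling done at the top of this method:

// Hypothetical numbers illustrating the weight rescaling above.
int n = 100;              // relation.size()
int unassignedSize = 10;  // unassigned.size()
double pweight = 1. / n;  // 0.01, the weight contribution of a single point
// A model whose weight was 0.4 over the assigned points only is scaled down:
double oldWeight = 0.4;
double rescaled = oldWeight * (n - unassignedSize) * pweight; // 0.4 * 90 * 0.01 = 0.36
// Each formerly unassigned point that is then assigned to this model adds pweight back:
double afterOneAssignment = rescaled + pweight;               // 0.37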

Example 4 with MultivariateGaussianModel

Use of de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel in project elki by elki-project.

From the class P3C, method findOutliers:

/**
 * Performs outlier detection by testing the Mahalanobis distance of each
 * point in a cluster against the critical value of the ChiSquared
 * distribution with as many degrees of freedom as the cluster has relevant
 * attributes.
 *
 * @param relation Data relation
 * @param models Cluster models
 * @param clusterCandidates the list of clusters to check.
 * @param noise the set to which to add points deemed outliers.
 */
private void findOutliers(Relation<V> relation, List<MultivariateGaussianModel> models, ArrayList<ClusterCandidate> clusterCandidates, ModifiableDBIDs noise) {
    Iterator<MultivariateGaussianModel> it = models.iterator();
    for (int c = 0; it.hasNext(); c++) {
        MultivariateGaussianModel model = it.next();
        final ClusterCandidate candidate = clusterCandidates.get(c);
        final int dof = BitsUtil.cardinality(candidate.dimensions);
        final double threshold = ChiSquaredDistribution.quantile(1 - alpha, dof);
        for (DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
            final double distance = model.mahalanobisDistance(relation.get(iter));
            if (distance >= threshold) {
                // Outlier, remove it and add it to the outlier set.
                noise.add(iter);
                iter.remove();
            }
        }
    }
}
Also used: MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel), DBIDMIter (de.lmu.ifi.dbs.elki.database.ids.DBIDMIter)
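To make the outlier threshold concrete, here is a hedged sketch of the same ChiSquaredDistribution.quantile computation used above; the alpha and dof values are purely illustrative:

// Hypothetical illustration of the threshold computation above
// (import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution).
double alpha = 0.001; // significance level; an assumed example value
int dof = 3;          // number of relevant attributes of the cluster
double threshold = ChiSquaredDistribution.quantile(1 - alpha, dof);
// Points of the cluster whose Mahalanobis distance to the cluster's Gaussian model is
// at least `threshold` are removed from the cluster and added to the noise set.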

Aggregations

MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel): 4
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 2
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 1
Subspace (de.lmu.ifi.dbs.elki.data.Subspace): 1
SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel): 1
ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs): 1
DBIDMIter (de.lmu.ifi.dbs.elki.database.ids.DBIDMIter): 1
HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs): 1
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 1
SetDBIDs (de.lmu.ifi.dbs.elki.database.ids.SetDBIDs): 1
StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress): 1
CovarianceMatrix (de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix): 1
ArrayList (java.util.ArrayList): 1