Search in sources :

Example 31 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

From the class P3C, method run.

/**
 * Performs the P3C algorithm on the given Database.
 *
 * @param database Database (not accessed directly in this method; part of the
 *        algorithm invocation API)
 * @param relation Relation of vectors to cluster
 * @return Subspace clustering; either the discovered clusters plus an optional
 *         noise cluster, or a single all-inclusive noise cluster if no cluster
 *         cores were found
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    // Overall progress. Note: there are 10 steps below; the step count and the
    // step indexes were previously inconsistent (declared 8, indexes up to 9,
    // index 5 used twice).
    StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
    if (stepProgress != null) {
        stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
    }
    // Desired number of bins, as per Sturges' rule: ceil(1 + log2(n)).
    final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
    // Perform 1-dimensional projections, and split into bins.
    SetDBIDs[][] partitions = partitionData(relation, binCount);
    if (stepProgress != null) {
        stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
    }
    // Set markers for each attribute until they're all deemed uniform.
    final long[][] markers = new long[dim][];
    for (int d = 0; d < dim; d++) {
        final SetDBIDs[] parts = partitions[d];
        if (parts == null) {
            // Never mark any on constant dimensions.
            continue;
        }
        final long[] marked = markers[d] = BitsUtil.zero(binCount);
        int card = 0;
        while (card < dim - 1) {
            // Find bin with largest support, test only the dimensions that were not
            // previously marked.
            int bestBin = chiSquaredUniformTest(parts, marked, card);
            if (bestBin < 0) {
                // Remaining bins are uniform; stop marking in this dimension.
                break;
            }
            BitsUtil.setI(marked, bestBin);
            card++;
        }
        if (LOG.isDebugging()) {
            LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
    }
    ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
    if (stepProgress != null) {
        stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
    }
    ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
    }
    clusterCores = pruneRedundantClusterCores(clusterCores);
    if (LOG.isVerbose()) {
        LOG.verbose("Number of cluster cores found: " + clusterCores.size());
    }
    if (clusterCores.isEmpty()) {
        // Degenerate result: everything is noise.
        LOG.setCompleted(stepProgress);
        Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
        c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
        return c;
    }
    if (stepProgress != null) {
        stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
    }
    // Track objects not assigned to any cluster:
    ModifiableDBIDs noise = DBIDUtil.newHashSet();
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    int k = clusterCores.size();
    List<MultivariateGaussianModel> models = new ArrayList<>(k);
    computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
    // Initial estimate of covariances, to assign noise objects
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    assignUnassigned(relation, probClusterIGivenX, models, noise);
    double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    // EM iterations; maxEmIterations < 0 means "iterate until convergence".
    for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
        final double emOld = emNew;
        EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
        // reassign probabilities
        emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isVerbose()) {
            LOG.verbose("iteration " + it + " - expectation value: " + emNew);
        }
        // Stop when the expectation value no longer improves by more than emDelta.
        if ((emNew - emOld) <= emDelta) {
            break;
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(7, "Generating hard clustering.", LOG);
    }
    // Create a hard clustering, making sure each data point only is part of one
    // cluster, based on the best match from the membership matrix.
    ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
    if (stepProgress != null) {
        stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
    }
    // Outlier detection. Remove points from clusters that have a Mahalanobis
    // distance larger than the critical value of the ChiSquare distribution.
    findOutliers(relation, models, clusterCandidates, noise);
    if (stepProgress != null) {
        stepProgress.beginStep(9, "Removing empty clusters.", LOG);
    }
    // Remove near-empty clusters; their members are reassigned to noise.
    for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
        ClusterCandidate cand = it.next();
        final int size = cand.ids.size();
        if (size < minClusterSize) {
            if (size > 0) {
                noise.addDBIDs(cand.ids);
            }
            it.remove();
        }
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
    }
    if (stepProgress != null) {
        stepProgress.beginStep(10, "Generating final result.", LOG);
    }
    // Generate final output.
    Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
    for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
        ClusterCandidate candidate = clusterCandidates.get(cluster);
        CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
        result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
    }
    // Guard the verbose output, for consistency with the other log statements.
    if (LOG.isVerbose()) {
        LOG.verbose("Noise size: " + noise.size());
    }
    if (noise.size() > 0) {
        result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
    }
    LOG.ensureCompleted(stepProgress);
    return result;
}
Also used : ArrayList(java.util.ArrayList) MultivariateGaussianModel(de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 32 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

From the class P3C, method mergeSignatures.

/**
 * Generates a merged signature of this and another one, where the other
 * signature must be a 1-signature.
 *
 * @param first First signature.
 * @param second Second signature, must be a 1-signature.
 * @param numBins Number of bins per dimension.
 * @return the merged signature, or null if the merge failed.
 */
protected Signature mergeSignatures(Signature first, Signature second, int numBins) {
    // Locate the single dimension the 1-signature constrains. spec stores
    // [min, max] bin-index pairs per dimension; -1 means "unconstrained".
    int d2 = -1;
    for (int i = 0; i < second.spec.length; i += 2) {
        if (second.spec[i] >= 0) {
            assert (d2 == -1) : "Merging with non-1-signature?!?";
            d2 = i;
        }
    }
    assert (d2 >= 0) : "Merging with empty signature?";
    // Avoid generating redundant signatures.
    if (first.spec[d2] >= 0) {
        return null;
    }
    // Definition 3, Condition 1:
    // True support:
    final ModifiableDBIDs intersection = DBIDUtil.intersection(first.ids, second.ids);
    final int support = intersection.size();
    // Interval width, computed using selected number of bins / total bins
    double width = (second.spec[d2 + 1] - second.spec[d2] + 1.) / (double) numBins;
    // Expected size thus:
    double expect = first.ids.size() * width;
    if (support <= expect || support < minClusterSize) {
        return null;
    }
    final double test = PoissonDistribution.rawProbability(support, expect);
    if (poissonThreshold <= test) {
        return null;
    }
    // Create merged signature: copy the full [min, max] interval of the
    // 1-signature. (Previously the upper bound was copied from spec[d2],
    // collapsing multi-bin intervals to their lower bound.)
    int[] spec = first.spec.clone();
    spec[d2] = second.spec[d2];
    spec[d2 + 1] = second.spec[d2 + 1];
    final Signature newsig = new Signature(spec, intersection);
    if (LOG.isDebugging()) {
        LOG.debug(newsig.toString());
    }
    return newsig;
}
Also used : ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 33 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

From the class PROCLUS, method computeBadMedoids.

/**
 * Computes the bad medoids, where the medoid of a cluster with less than the
 * specified threshold of objects is bad.
 *
 * @param m_current Current medoids
 * @param clusters the clusters
 * @param threshold the threshold
 * @return the bad medoids
 */
private DBIDs computeBadMedoids(ArrayDBIDs m_current, ArrayList<PROCLUSCluster> clusters, int threshold) {
    ModifiableDBIDs bad = DBIDUtil.newHashSet(m_current.size());
    int pos = 0;
    // Walk medoids and their clusters in parallel; the i-th medoid owns the
    // i-th cluster (which may be null for a dropped cluster).
    for (DBIDIter medoid = m_current.iter(); medoid.valid(); medoid.advance()) {
        PROCLUSCluster cluster = clusters.get(pos++);
        final boolean tooSmall = (cluster == null) || (cluster.objectIDs.size() < threshold);
        if (tooSmall) {
            bad.add(medoid);
        }
    }
    return bad;
}
Also used : ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 34 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

From the class PROCLUS, method assignPoints.

/**
 * Assigns the objects to the clusters.
 *
 * @param m_current Current centers
 * @param dimensions set of correlated dimensions for each medoid of the
 *        cluster
 * @param database the database containing the objects
 * @return the assignments of the object to the clusters
 */
private ArrayList<PROCLUSCluster> assignPoints(ArrayDBIDs m_current, long[][] dimensions, Relation<V> database) {
    final int numClusters = dimensions.length;
    ModifiableDBIDs[] memberIDs = new ModifiableDBIDs[numClusters];
    for (int c = 0; c < m_current.size(); c++) {
        memberIDs[c] = DBIDUtil.newHashSet();
    }
    DBIDArrayIter medoid = m_current.iter();
    for (DBIDIter obj = database.iterDBIDs(); obj.valid(); obj.advance()) {
        V point = database.get(obj);
        // Start from NaN so the first comparison always succeeds
        // (NaN <= x is false for any x).
        double bestDist = Double.NaN;
        int best = -1;
        int c = 0;
        for (medoid.seek(0); medoid.valid(); medoid.advance(), c++) {
            double dist = manhattanSegmentalDistance(point, database.get(medoid), dimensions[c]);
            if (!(bestDist <= dist)) {
                bestDist = dist;
                best = c;
            }
        }
        // Assign the point to its nearest medoid.
        assert best >= 0;
        memberIDs[best].add(obj);
    }
    ArrayList<PROCLUSCluster> clusters = new ArrayList<>(m_current.size());
    for (int c = 0; c < numClusters; c++) {
        ModifiableDBIDs objectIDs = memberIDs[c];
        if (objectIDs.isEmpty()) {
            // Keep index alignment with the medoid list.
            clusters.add(null);
            continue;
        }
        double[] centroid = Centroid.make(database, objectIDs).getArrayRef();
        clusters.add(new PROCLUSCluster(objectIDs, dimensions[c], centroid));
    }
    if (LOG.isDebugging()) {
        LOG.debugFine(new StringBuilder().append("clusters ").append(clusters).toString());
    }
    return clusters;
}
Also used : ArrayList(java.util.ArrayList) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 35 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

From the class PROCLUS, method finalAssignment.

/**
 * Refinement step to assign the objects to the final clusters.
 *
 * @param dimensions pair containing the centroid and the set of correlated
 *        dimensions for the centroid
 * @param database the database containing the objects
 * @return the assignments of the object to the clusters
 */
private List<PROCLUSCluster> finalAssignment(List<Pair<double[], long[]>> dimensions, Relation<V> database) {
    Map<Integer, ModifiableDBIDs> clusterIDs = new HashMap<>();
    for (int i = 0; i < dimensions.size(); i++) {
        clusterIDs.put(i, DBIDUtil.newHashSet());
    }
    for (DBIDIter it = database.iterDBIDs(); it.valid(); it.advance()) {
        V p = database.get(it);
        double minDist = Double.POSITIVE_INFINITY;
        int best = -1;
        for (int i = 0; i < dimensions.size(); i++) {
            Pair<double[], long[]> pair_i = dimensions.get(i);
            double currentDist = manhattanSegmentalDistance(p, pair_i.first, pair_i.second);
            if (best < 0 || currentDist < minDist) {
                minDist = currentDist;
                best = i;
            }
        }
        // add p to cluster with mindist. The previous assert (minDist >= 0.)
        // was vacuous; what must hold before get(best) is a valid index,
        // matching the equivalent check in assignPoints.
        assert best >= 0;
        clusterIDs.get(best).add(it);
    }
    List<PROCLUSCluster> clusters = new ArrayList<>();
    for (int i = 0; i < dimensions.size(); i++) {
        ModifiableDBIDs objectIDs = clusterIDs.get(i);
        // Unlike assignPoints, empty clusters are dropped here (no null slots).
        if (!objectIDs.isEmpty()) {
            long[] clusterDimensions = dimensions.get(i).second;
            double[] centroid = Centroid.make(database, objectIDs).getArrayRef();
            clusters.add(new PROCLUSCluster(objectIDs, clusterDimensions, centroid));
        }
    }
    if (LOG.isDebugging()) {
        LOG.debugFine(new StringBuilder().append("clusters ").append(clusters).toString());
    }
    return clusters;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Aggregations

ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)80 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)44 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)30 ArrayList (java.util.ArrayList)30 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)28 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)18 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)15 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)14 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)12 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)12 Model (de.lmu.ifi.dbs.elki.data.model.Model)11 DBID (de.lmu.ifi.dbs.elki.database.ids.DBID)11 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)10 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)10 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)9 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)9 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)8 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)8 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)7