Search in sources :

Example 6 with Centroid

use of de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid in project elki by elki-project.

Class NaiveMeanShiftClustering, method run:

/**
 * Run the mean-shift clustering algorithm.
 *
 * Every object is shifted repeatedly to the kernel-weighted mean of its
 * neighbors within {@code bandwidth}. The iteration for an object ends when
 * the shifted position is close to an already known cluster center (the
 * object joins that cluster), when the shift becomes smaller than the
 * stopping threshold or {@code MAXITER} is reached (the object starts a new
 * cluster), or when no usable mean can be computed (the object becomes noise).
 * Every object of the relation therefore ends up in exactly one cluster of
 * the result.
 *
 * @param database Database
 * @param relation Data relation
 * @return Clustering result
 */
public Clustering<MeanModel> run(Database database, Relation<V> relation) {
    final DistanceQuery<V> distq = database.getDistanceQuery(relation, getDistanceFunction());
    final RangeQuery<V> rangeq = database.getRangeQuery(distq);
    final NumberVector.Factory<V> factory = RelationUtil.getNumberVectorFactory(relation);
    final int dim = RelationUtil.dimensionality(relation);
    // Stopping threshold
    final double threshold = bandwidth * 1E-10;
    // Result store: cluster center and the members assigned to it.
    ArrayList<Pair<V, ModifiableDBIDs>> clusters = new ArrayList<>();
    ModifiableDBIDs noise = DBIDUtil.newArray();
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Mean-shift clustering", relation.size(), LOG) : null;
    for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
        // Initial position:
        V position = relation.get(iter);
        iterations: for (int j = 1; j <= MAXITER; j++) {
            // Compute new position:
            DoubleDBIDList neigh = rangeq.getRangeForObject(position, bandwidth);
            // In the first iteration, more than one neighbor is required; in
            // later iterations a single neighbor is sufficient.
            boolean okay = (neigh.size() > 1) || (neigh.size() >= 1 && j > 1);
            if (!okay) {
                noise.add(iter);
                break iterations;
            }
            Centroid newpos = new Centroid(dim);
            // Total kernel weight, to detect an undefined weighted mean.
            double wsum = 0.;
            for (DoubleDBIDListIter niter = neigh.iter(); niter.valid(); niter.advance()) {
                final double weight = kernel.density(niter.doubleValue() / bandwidth);
                newpos.put(relation.get(niter), weight);
                wsum += weight;
            }
            // Without positive total weight (this also catches NaN) there is
            // no meaningful mean to shift to, so treat the object as noise.
            if (!(wsum > 0.)) {
                noise.add(iter);
                break iterations;
            }
            final V newvec = factory.newNumberVector(newpos.getArrayRef());
            // Test if we are close to one of the known clusters:
            double bestd = Double.POSITIVE_INFINITY;
            Pair<V, ModifiableDBIDs> bestp = null;
            for (Pair<V, ModifiableDBIDs> pair : clusters) {
                final double merged = distq.distance(newvec, pair.first);
                if (merged < bestd) {
                    bestd = merged;
                    bestp = pair;
                }
            }
            // Check for convergence:
            double delta = distq.distance(position, newvec);
            // bestd is only finite if some cluster was found, so bestp is
            // non-null whenever this condition holds.
            if (bestd < 10 * threshold || bestd * 2 < delta) {
                bestp.second.add(iter);
                break iterations;
            }
            if (j == MAXITER) {
                LOG.warning("No convergence after " + MAXITER + " iterations. Distance: " + delta);
            }
            if (Double.isNaN(delta)) {
                LOG.warning("Encountered NaN distance. Invalid center vector? " + newvec.toString());
                // Do not drop the object from the result: report it as noise.
                noise.add(iter);
                break iterations;
            }
            if (j == MAXITER || delta < threshold) {
                if (LOG.isDebuggingFine()) {
                    LOG.debugFine("New cluster:" + newvec + " delta: " + delta + " threshold: " + threshold + " bestd: " + bestd);
                }
                ArrayModifiableDBIDs cids = DBIDUtil.newArray();
                cids.add(iter);
                clusters.add(new Pair<V, ModifiableDBIDs>(newvec, cids));
                break iterations;
            }
            position = newvec;
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Wrap the collected centers and members into the result structure.
    ArrayList<Cluster<MeanModel>> cs = new ArrayList<>(clusters.size());
    for (Pair<V, ModifiableDBIDs> pair : clusters) {
        cs.add(new Cluster<>(pair.second, new MeanModel(pair.first.toArray())));
    }
    if (noise.size() > 0) {
        cs.add(new Cluster<MeanModel>(noise, true));
    }
    Clustering<MeanModel> c = new Clustering<>("Mean-shift Clustering", "mean-shift-clustering", cs);
    return c;
}
Also used : ArrayList(java.util.ArrayList) MeanModel(de.lmu.ifi.dbs.elki.data.model.MeanModel) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList) Pair(de.lmu.ifi.dbs.elki.utilities.pairs.Pair) DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 7 with Centroid

use of de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid in project elki by elki-project.

Class EvaluatePBMIndex, method evaluateClustering:

/**
 * Evaluate a single clustering.
 *
 * The value is computed as {@code ((1 / K) * (b / a) * max)^2}, where
 * {@code K} is the length of the centroid array, {@code a} is the summed
 * distance of the objects to their own cluster centroid, {@code b} is the
 * summed distance of the objects to the overall centroid, and {@code max} is
 * the largest distance between two cluster representatives. All distances
 * are measured with the configured {@code distanceFunction}. The result is
 * also logged as statistics and attached to the result hierarchy of the
 * database.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return PBM
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    List<? extends Cluster<?>> clusters = c.getAllClusters();
    NumberVector[] centroids = new NumberVector[clusters.size()];
    // NOTE(review): the return value is presumably the number of ignored
    // noise objects (it is logged as ".pbm.ignored" below) - confirm.
    int ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseHandling);
    // Build global centroid and cluster count:
    final int dim = RelationUtil.dimensionality(rel);
    Centroid overallCentroid = new Centroid(dim);
    EvaluateVarianceRatioCriteria.globalCentroid(overallCentroid, rel, clusters, centroids, noiseHandling);
    // Maximum distance between centroids:
    double max = 0;
    for (int i = 0; i < centroids.length; i++) {
        // Clusters without centroid are only used when noise objects are
        // treated as singletons.
        if (centroids[i] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) {
            continue;
        }
        for (int j = i + 1; j < centroids.length; j++) {
            if (centroids[j] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) {
                continue;
            }
            if (centroids[i] == null && centroids[j] == null) {
                // Need to compute pairwise distances of noise clusters.
                for (DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) {
                    for (DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) {
                        double dist = distanceFunction.distance(rel.get(iti), rel.get(itj));
                        max = dist > max ? dist : max;
                    }
                }
            } else if (centroids[i] == null) {
                // Each object of cluster i is compared to the centroid of j.
                for (DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) {
                    double dist = distanceFunction.distance(rel.get(iti), centroids[j]);
                    max = dist > max ? dist : max;
                }
            } else if (centroids[j] == null) {
                // Each object of cluster j is compared to the centroid of i.
                for (DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) {
                    double dist = distanceFunction.distance(centroids[i], rel.get(itj));
                    max = dist > max ? dist : max;
                }
            } else {
                double dist = distanceFunction.distance(centroids[i], centroids[j]);
                max = dist > max ? dist : max;
            }
        }
    }
    // a: Distance to own centroid
    // b: Distance to overall centroid
    double a = 0, b = 0;
    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
        Cluster<?> cluster = ci.next();
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseHandling) {
                case IGNORE_NOISE:
                    // Ignored
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    // Singletons: a = 0 by definition.
                    // Use the configured distance function, so that this
                    // contribution to b is on the same scale as the
                    // contributions of regular clusters below.
                    for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
                        b += distanceFunction.distance(overallCentroid, rel.get(it));
                    }
                    // Continue with the NEXT cluster.
                    continue;
                case MERGE_NOISE:
                    // Treat like a cluster below:
                    break;
            }
        }
        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
            NumberVector obj = rel.get(it);
            a += distanceFunction.distance(centroids[i], obj);
            b += distanceFunction.distance(overallCentroid, obj);
        }
    }
    final double pbm = FastMath.pow((1. / centroids.length) * (b / a) * max, 2.);
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
        if (ignorednoise > 0) {
            LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
        }
        LOG.statistics(new DoubleStatistic(key + ".pbm", pbm));
    }
    // Attach the measure to the evaluation result of this clustering.
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
    g.addMeasure("PBM-Index", pbm, 0., Double.POSITIVE_INFINITY, 0., false);
    db.getHierarchy().resultChanged(ev);
    return pbm;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 8 with Centroid

use of de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid in project elki by elki-project.

Class WeightedCovarianceMatrixBuilder, method processIds:

/**
 * Build a weighted covariance matrix for a set of IDs.
 *
 * No distance information is supplied by the caller, so the distances of the
 * objects to their centroid are computed here with the configured weight
 * distance. Each object is then added to the covariance matrix with the
 * weight that the weight function assigns to its distance, given the maximum
 * distance and the standard deviation of the distances. Covariance is tied to
 * Euclidean distance, so supporting other distance functions probably makes
 * little sense.
 *
 * @param ids Database ids to process
 * @param relation Relation to process
 * @return Covariance matrix
 */
@Override
public double[][] processIds(DBIDs ids, Relation<? extends NumberVector> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    final CovarianceMatrix cmat = new CovarianceMatrix(dim);
    final Centroid centroid = Centroid.make(relation, ids);

    // First pass: largest distance to the centroid and sum of squared distances.
    double farthest = 0.0, sqsum = 0.0;
    DBIDIter scan = ids.iter();
    while (scan.valid()) {
        final double d = weightDistance.distance(centroid, relation.get(scan));
        sqsum += d * d;
        farthest = d > farthest ? d : farthest;
        scan.advance();
    }
    // Fall back to 1 when all objects coincide with the centroid.
    final double maxdist = (farthest == 0.0) ? 1.0 : farthest;
    // Standard deviation of the distances.
    final double stddev = FastMath.sqrt(sqsum / ids.size());

    // Second pass: add every object with its distance-dependent weight.
    for (DBIDIter member = ids.iter(); member.valid(); member.advance()) {
        final NumberVector vec = relation.get(member);
        final double dist = weightDistance.distance(centroid, vec);
        cmat.put(vec, weightfunction.getWeight(dist, maxdist, stddev));
    }
    return cmat.destroyToPopulationMatrix();
}
Also used : Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 9 with Centroid

use of de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid in project elki by elki-project.

Class LinearDiscriminantAnalysisFilter, method computeProjectionMatrix:

/**
 * Compute the projection matrix from labeled vectors.
 *
 * The between-class covariance is built from the class centroids, the
 * within-class covariance from the vectors after subtracting the centroid of
 * their class. The result is derived from the eigen decomposition of
 * {@code inverse(within) * between}.
 *
 * @param vectorcolumn Vectors
 * @param classcolumn Class labels, one per vector
 * @param dim Dimensionality of the vectors
 * @return Transposed matrix of the selected eigenvectors
 */
@Override
protected double[][] computeProjectionMatrix(List<V> vectorcolumn, List<? extends ClassLabel> classcolumn, int dim) {
    final Map<ClassLabel, IntList> classes = partition(classcolumn);
    // Copy the keys into a list to obtain a fixed indexing of the classes.
    final List<ClassLabel> keys = new ArrayList<>(classes.keySet());
    final List<Centroid> centroids = computeCentroids(dim, vectorcolumn, keys, classes);

    // Between-class covariance: covariance of the class centroids.
    final CovarianceMatrix between = new CovarianceMatrix(dim);
    for (Centroid mean : centroids) {
        between.put(mean);
    }
    final double[][] sigmaB = between.destroyToSampleMatrix();

    // (Average) within-class covariance: vectors centered on their class mean.
    final CovarianceMatrix within = new CovarianceMatrix(dim);
    final int numc = keys.size();
    for (int k = 0; k < numc; k++) {
        final double[] mean = centroids.get(k).getArrayRef();
        // TODO: different weighting strategies? Sampling?
        final IntIterator members = classes.get(keys.get(k)).iterator();
        while (members.hasNext()) {
            within.put(minusEquals(vectorcolumn.get(members.nextInt()).toArray(), mean));
        }
    }
    final double[][] sigmaI = within.destroyToSampleMatrix();
    // A determinant of exactly zero means the matrix cannot be inverted;
    // add a tiny value to the diagonal in that case.
    if (new LUDecomposition(sigmaI).det() == 0) {
        for (int d = 0; d < dim; d++) {
            sigmaI[d][d] += 1e-10;
        }
    }

    final EigenvalueDecomposition decomp = new EigenvalueDecomposition(times(inverse(sigmaI), sigmaB));
    // NOTE(review): presumably selects the tdim leading eigenvectors; the
    // meaning of the boolean flag is not visible here - confirm.
    final SortedEigenPairs sorted = new SortedEigenPairs(decomp, false);
    return transpose(sorted.eigenVectors(tdim));
}
Also used : IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) EigenvalueDecomposition(de.lmu.ifi.dbs.elki.math.linearalgebra.EigenvalueDecomposition) ArrayList(java.util.ArrayList) LUDecomposition(de.lmu.ifi.dbs.elki.math.linearalgebra.LUDecomposition) IntList(it.unimi.dsi.fastutil.ints.IntList) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) SortedEigenPairs(de.lmu.ifi.dbs.elki.math.linearalgebra.pca.SortedEigenPairs)

Aggregations

Centroid (de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid)9 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)6 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)5 ArrayList (java.util.ArrayList)3 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)2 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)2 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)2 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)2 CovarianceMatrix (de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix)2 EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult)2 MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup)2 IntIterator (it.unimi.dsi.fastutil.ints.IntIterator)2 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)1 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)1 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)1 MeanModel (de.lmu.ifi.dbs.elki.data.model.MeanModel)1 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)1 DoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList)1 DoubleDBIDListIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter)1 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)1