Example 6 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class DistanceStatisticsWithClasses, method run.

@Override
public HistogramResult run(Database database) {
    final Relation<O> relation = database.getRelation(getInputTypeRestriction()[0]);
    final DistanceQuery<O> distFunc = database.getDistanceQuery(relation, getDistanceFunction());
    final StepProgress stepprog = LOG.isVerbose() ? new StepProgress("Distance statistics", 2) : null;
    // determine binning ranges.
    DoubleMinMax gminmax = new DoubleMinMax();
    // Cluster by labels
    Collection<Cluster<Model>> split = (new ByLabelOrAllInOneClustering()).run(database).getAllClusters();
    // global in-cluster min/max
    DoubleMinMax giminmax = new DoubleMinMax();
    // global other-cluster min/max
    DoubleMinMax gominmax = new DoubleMinMax();
    // in-cluster distances
    MeanVariance mimin = new MeanVariance();
    MeanVariance mimax = new MeanVariance();
    MeanVariance midif = new MeanVariance();
    // other-cluster distances
    MeanVariance momin = new MeanVariance();
    MeanVariance momax = new MeanVariance();
    MeanVariance modif = new MeanVariance();
    // Histogram
    final ObjHistogram<long[]> histogram;
    LOG.beginStep(stepprog, 1, "Prepare histogram.");
    if (exact) {
        gminmax = exactMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else if (sampling) {
        gminmax = sampleMinMax(relation, distFunc);
        histogram = new LongArrayStaticHistogram(numbin, gminmax.getMin(), gminmax.getMax(), 2);
    } else {
        // No bounds known in advance: use a dynamic histogram that rescales as values arrive.
        histogram = new AbstractObjDynamicHistogram<long[]>(numbin) {

            @Override
            protected long[] downsample(Object[] data, int start, int end, int size) {
                // Merge the payloads of bins [start, end) into a single bin when the histogram rescales.
                long[] ret = new long[2];
                for (int i = start; i < end; i++) {
                    long[] existing = (long[]) data[i];
                    if (existing != null) {
                        for (int c = 0; c < 2; c++) {
                            ret[c] += existing[c];
                        }
                    }
                }
                return ret;
            }

            @Override
            protected long[] aggregate(long[] first, long[] second) {
                // Combine two bin payloads by element-wise addition.
                for (int c = 0; c < 2; c++) {
                    first[c] += second[c];
                }
                return first;
            }

            @Override
            protected long[] cloneForCache(long[] data) {
                return data.clone();
            }

            @Override
            protected long[] makeObject() {
                return new long[2];
            }
        };
    }
    LOG.beginStep(stepprog, 2, "Build histogram.");
    final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Distance computations", relation.size(), LOG) : null;
    // Increment vectors: component 0 counts in-cluster distances, component 1 other-cluster distances.
    final long[] incFirst = new long[] { 1L, 0L };
    final long[] incSecond = new long[] { 0L, 1L };
    // iterate per cluster
    for (Cluster<?> c1 : split) {
        for (DBIDIter id1 = c1.getIDs().iter(); id1.valid(); id1.advance()) {
            // in-cluster distances
            DoubleMinMax iminmax = new DoubleMinMax();
            for (DBIDIter iter2 = c1.getIDs().iter(); iter2.valid(); iter2.advance()) {
                // skip the point itself.
                if (DBIDUtil.equal(id1, iter2)) {
                    continue;
                }
                double d = distFunc.distance(id1, iter2);
                histogram.putData(d, incFirst);
                iminmax.put(d);
            }
            // aggregate
            mimin.put(iminmax.getMin());
            mimax.put(iminmax.getMax());
            midif.put(iminmax.getDiff());
            // min/max
            giminmax.put(iminmax.getMin());
            giminmax.put(iminmax.getMax());
            // other-cluster distances
            DoubleMinMax ominmax = new DoubleMinMax();
            for (Cluster<?> c2 : split) {
                if (c2 == c1) {
                    continue;
                }
                for (DBIDIter iter2 = c2.getIDs().iter(); iter2.valid(); iter2.advance()) {
                    // skip the point itself (shouldn't happen though)
                    if (DBIDUtil.equal(id1, iter2)) {
                        continue;
                    }
                    double d = distFunc.distance(id1, iter2);
                    histogram.putData(d, incSecond);
                    ominmax.put(d);
                }
            }
            // aggregate
            momin.put(ominmax.getMin());
            momax.put(ominmax.getMax());
            modif.put(ominmax.getDiff());
            // min/max
            gominmax.put(ominmax.getMin());
            gominmax.put(ominmax.getMax());
            LOG.incrementProcessed(progress);
        }
    }
    LOG.ensureCompleted(progress);
    // Update values (only needed for sampling case).
    gminmax.put(gominmax);
    LOG.setCompleted(stepprog);
    // count the number of samples we have in the data
    long inum = 0;
    long onum = 0;
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        inum += iter.getValue()[0];
        onum += iter.getValue()[1];
    }
    long bnum = inum + onum;
    Collection<double[]> binstat = new ArrayList<>(numbin);
    // Normalize bin counts: "only" columns are relative to each class total, "all" columns to all distances.
    for (ObjHistogram.Iter<long[]> iter = histogram.iter(); iter.valid(); iter.advance()) {
        final long[] value = iter.getValue();
        final double icof = (inum == 0) ? 0 : ((double) value[0]) / inum / histogram.getBinsize();
        final double icaf = ((double) value[0]) / bnum / histogram.getBinsize();
        final double ocof = (onum == 0) ? 0 : ((double) value[1]) / onum / histogram.getBinsize();
        final double ocaf = ((double) value[1]) / bnum / histogram.getBinsize();
        binstat.add(new double[] { iter.getCenter(), icof, icaf, ocof, ocaf });
    }
    HistogramResult result = new HistogramResult("Distance Histogram", "distance-histogram", binstat);
    result.addHeader("Absolute minimum distance (abs): " + gminmax.getMin());
    result.addHeader("Absolute maximum distance (abs): " + gminmax.getMax());
    result.addHeader("In-Cluster minimum distance (abs, avg, stddev): " + giminmax.getMin() + " " + mimin.getMean() + " " + mimin.getSampleStddev());
    result.addHeader("In-Cluster maximum distance (abs, avg, stddev): " + giminmax.getMax() + " " + mimax.getMean() + " " + mimax.getSampleStddev());
    result.addHeader("Other-Cluster minimum distance (abs, avg, stddev): " + gominmax.getMin() + " " + momin.getMean() + " " + momin.getSampleStddev());
    result.addHeader("Other-Cluster maximum distance (abs, avg, stddev): " + gominmax.getMax() + " " + momax.getMean() + " " + momax.getSampleStddev());
    result.addHeader("Column description: bin center, in-cluster only frequency, in-cluster all frequency, other-cluster only frequency, other cluster all frequency");
    result.addHeader("In-cluster value count: " + inum + " other cluster value count: " + onum);
    return result;
}
Also used: ObjHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.ObjHistogram), HistogramResult (de.lmu.ifi.dbs.elki.result.HistogramResult), FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), AbstractObjDynamicHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.AbstractObjDynamicHistogram), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), ByLabelOrAllInOneClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering), StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress), LongArrayStaticHistogram (de.lmu.ifi.dbs.elki.utilities.datastructures.histogram.LongArrayStaticHistogram), MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance), DoubleMinMax (de.lmu.ifi.dbs.elki.math.DoubleMinMax)
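
For orientation, here is a minimal sketch of running this algorithm standalone and printing its output. It is not part of the ELKI sources above: the toy data, the empty ListParameterization (relying on defaults such as Euclidean distance), and the main-class wrapper are assumptions for illustration.

import java.util.Arrays;
import de.lmu.ifi.dbs.elki.algorithm.statistics.DistanceStatisticsWithClasses;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.result.HistogramResult;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;

public class DistanceStatisticsExample {
    public static void main(String[] args) {
        // Two well-separated groups; without class labels,
        // ByLabelOrAllInOneClustering falls back to one all-in-one cluster.
        double[][] data = { { 0., 0. }, { 0., 1. }, { 10., 10. }, { 10., 11. } };
        Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
        db.initialize();
        DistanceStatisticsWithClasses<DoubleVector> algo = ClassGenericsUtil.parameterizeOrAbort(DistanceStatisticsWithClasses.class, new ListParameterization());
        HistogramResult result = algo.run(db);
        // Each row: { bin center, in-cluster only, in-cluster all, other-cluster only, other-cluster all }
        for (double[] row : result) {
            System.out.println(Arrays.toString(row));
        }
    }
}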

Example 7 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class AbstractBiclustering, method defineBicluster.

/**
 * Defines a Bicluster as given by the included rows and columns.
 *
 * @param rows the rows included in the Bicluster
 * @param cols the columns included in the Bicluster
 * @return a Bicluster as given by the included rows and columns
 */
protected Cluster<BiclusterModel> defineBicluster(long[] rows, long[] cols) {
    ArrayDBIDs rowIDs = rowsBitsetToIDs(rows);
    int[] colIDs = colsBitsetToIDs(cols);
    return new Cluster<>(rowIDs, new BiclusterModel(colIDs));
}
Also used: ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), BiclusterModel (de.lmu.ifi.dbs.elki.data.model.BiclusterModel)

Example 8 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class AbstractBiclustering, method defineBicluster.

/**
 * Defines a Bicluster as given by the included rows and columns.
 *
 * @param rows the rows included in the Bicluster
 * @param cols the columns included in the Bicluster
 * @return a Bicluster as given by the included rows and columns
 */
protected Cluster<BiclusterModel> defineBicluster(BitSet rows, BitSet cols) {
    ArrayDBIDs rowIDs = rowsBitsetToIDs(rows);
    int[] colIDs = colsBitsetToIDs(cols);
    return new Cluster<>(rowIDs, new BiclusterModel(colIDs));
}
Also used: ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), BiclusterModel (de.lmu.ifi.dbs.elki.data.model.BiclusterModel)
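
Both overloads are typically invoked from inside an AbstractBiclustering subclass while a candidate bicluster is being assembled. The following is a call-site sketch, not from the sources above: the row/column counts and indices are made up, and BitsUtil (de.lmu.ifi.dbs.elki.utilities.BitsUtil) is ELKI's helper for long[]-encoded bitsets.

// Inside an AbstractBiclustering subclass (illustrative indices only):
long[] rows = BitsUtil.zero(100);  // bitmask over 100 data rows
BitsUtil.setI(rows, 3);
BitsUtil.setI(rows, 7);
long[] cols = BitsUtil.zero(20);   // bitmask over 20 attribute columns
BitsUtil.setI(cols, 1);
Cluster<BiclusterModel> viaLongs = defineBicluster(rows, cols);

// The java.util.BitSet overload yields an equivalent cluster:
BitSet rowSet = new BitSet();
rowSet.set(3);
rowSet.set(7);
BitSet colSet = new BitSet();
colSet.set(1);
Cluster<BiclusterModel> viaBitSet = defineBicluster(rowSet, colSet);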

Example 9 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class ERiC, method extractCorrelationClusters.

/**
 * Extracts the correlation clusters and noise from the clustering result
 * (here produced by the Generalized DBSCAN run) and returns a list of
 * cluster lists, indexed by correlation dimension. Each cluster is defined
 * by the basis vectors defining the subspace in which the cluster appears.
 *
 * @param dbscanResult the clustering result to extract correlation clusters from
 * @param relation the relation containing the objects
 * @param dimensionality the dimensionality of the feature space
 * @param npred ERiC neighbor predicate
 * @return a list of clusters for each correlation dimensionality
 */
private List<List<Cluster<CorrelationModel>>> extractCorrelationClusters(Clustering<Model> dbscanResult, Relation<V> relation, int dimensionality, ERiCNeighborPredicate<V>.Instance npred) {
    // result
    List<List<Cluster<CorrelationModel>>> clusterMap = new ArrayList<>();
    for (int i = 0; i <= dimensionality; i++) {
        clusterMap.add(new ArrayList<Cluster<CorrelationModel>>());
    }
    // noise cluster containing all noise objects over all partitions
    Cluster<Model> noise = null;
    // iterate over correlation dimensions
    for (Cluster<Model> clus : dbscanResult.getAllClusters()) {
        DBIDs group = clus.getIDs();
        int dim = clus.isNoise() ? dimensionality : npred.dimensionality(clus.getIDs().iter());
        if (dim < dimensionality) {
            EigenPairFilter filter = new FirstNEigenPairFilter(dim);
            // get cluster list for this dimension.
            List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(dim);
            SortedEigenPairs epairs = settings.pca.processIds(group, relation).getEigenPairs();
            int numstrong = filter.filter(epairs.eigenValues());
            PCAFilteredResult pcares = new PCAFilteredResult(epairs, numstrong, 1., 0.);
            double[] centroid = Centroid.make(relation, group).getArrayRef();
            Cluster<CorrelationModel> correlationCluster = new Cluster<>("[" + dim + "_" + correlationClusters.size() + "]", group, new CorrelationModel(pcares, centroid));
            correlationClusters.add(correlationCluster);
        } else {
            // partition containing noise
            if (noise == null) {
                noise = clus;
            } else {
                ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs());
                merged.addDBIDs(clus.getIDs());
                noise.setIDs(merged);
            }
        }
    }
    if (noise != null && noise.size() > 0) {
        // get cluster list for this dimension.
        List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(dimensionality);
        EigenPairFilter filter = new FirstNEigenPairFilter(dimensionality);
        SortedEigenPairs epairs = settings.pca.processIds(noise.getIDs(), relation).getEigenPairs();
        int numstrong = filter.filter(epairs.eigenValues());
        PCAFilteredResult pcares = new PCAFilteredResult(epairs, numstrong, 1., 0.);
        double[] centroid = Centroid.make(relation, noise.getIDs()).getArrayRef();
        Cluster<CorrelationModel> correlationCluster = new Cluster<>("[noise]", noise.getIDs(), new CorrelationModel(pcares, centroid));
        correlationClusters.add(correlationCluster);
    }
    // Delete dimensionalities not found.
    for (int i = dimensionality; i > 0; i--) {
        if (!clusterMap.get(i).isEmpty()) {
            break;
        }
        clusterMap.remove(i);
    }
    return clusterMap;
}
Also used: EigenPairFilter (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.filter.EigenPairFilter), FirstNEigenPairFilter (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.filter.FirstNEigenPairFilter), PercentageEigenPairFilter (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.filter.PercentageEigenPairFilter), DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs), ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs), ArrayList (java.util.ArrayList), List (java.util.List), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), CorrelationModel (de.lmu.ifi.dbs.elki.data.model.CorrelationModel), Model (de.lmu.ifi.dbs.elki.data.model.Model), SortedEigenPairs (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.SortedEigenPairs), PCAFilteredResult (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult)
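
The returned structure is a list of cluster lists indexed by correlation dimension, with trailing empty dimensions removed. A caller can walk it as follows (a sketch reusing the variable names from this method; getNameAutomatic() and size() are standard Cluster accessors):

for (int dim = 0; dim < clusterMap.size(); dim++) {
    for (Cluster<CorrelationModel> c : clusterMap.get(dim)) {
        System.out.println("corrdim " + dim + ": " + c.getNameAutomatic() + ", " + c.size() + " objects");
    }
}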

Example 10 with Cluster

Use of de.lmu.ifi.dbs.elki.data.Cluster in project elki by elki-project.

The class ERiC, method run.

/**
 * Performs the ERiC algorithm on the given database.
 *
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<CorrelationModel> run(Database database, Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);
    StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null;
    // Run Generalized DBSCAN
    LOG.beginStep(stepprog, 1, "Preprocessing local correlation dimensionalities and partitioning data");
    // FIXME: how to ensure we are running on the same relation?
    ERiCNeighborPredicate<V>.Instance npred = new ERiCNeighborPredicate<V>(settings).instantiate(database, relation);
    CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database);
    Clustering<Model> copacResult = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run();
    // extract correlation clusters
    LOG.beginStep(stepprog, 2, "Extract correlation clusters");
    List<List<Cluster<CorrelationModel>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality, npred);
    if (LOG.isDebugging()) {
        StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters...");
        for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
            List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
            msg.append("\n\ncorrDim ").append(corrDim);
            for (Cluster<CorrelationModel> cluster : correlationClusters) {
                msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
            // .append(", level: ").append(cluster.getLevel()).append(", index:
            // ").append(cluster.getLevelIndex());
            // msg.append("\n basis " +
            // cluster.getPCA().getWeakEigenvectors().toString(" ", NF) +
            // " ids " + cluster.getIDs().size());
            }
        }
        LOG.debugFine(msg.toString());
    }
    if (LOG.isVerbose()) {
        int clusters = 0;
        for (List<Cluster<CorrelationModel>> correlationClusters : clusterMap) {
            clusters += correlationClusters.size();
        }
        LOG.verbose(clusters + " clusters extracted.");
    }
    // build hierarchy
    LOG.beginStep(stepprog, 3, "Building hierarchy");
    Clustering<CorrelationModel> clustering = new Clustering<>("ERiC clustering", "eric-clustering");
    buildHierarchy(clustering, clusterMap, npred);
    if (LOG.isDebugging()) {
        StringBuilder msg = new StringBuilder("Step 3: Build hierarchy");
        for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
            List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
            for (Cluster<CorrelationModel> cluster : correlationClusters) {
                msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
                // ").append(cluster.getLevelIndex());
                for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) {
                    msg.append("\n   parent ").append(iter.get());
                }
                for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) {
                    msg.append("\n   child ").append(iter.get());
                }
            }
        }
        LOG.debugFine(msg.toString());
    }
    LOG.setCompleted(stepprog);
    for (Cluster<CorrelationModel> rc : clusterMap.get(clusterMap.size() - 1)) {
        clustering.addToplevelCluster(rc);
    }
    return clustering;
}
Also used: ERiCNeighborPredicate (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.ERiCNeighborPredicate), MinPtsCorePredicate (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate), CorePredicate (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate), GeneralizedDBSCAN (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN), DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs), ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), Clustering (de.lmu.ifi.dbs.elki.data.Clustering), CorrelationModel (de.lmu.ifi.dbs.elki.data.model.CorrelationModel), Model (de.lmu.ifi.dbs.elki.data.model.Model), StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress), ArrayList (java.util.ArrayList), List (java.util.List)
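
To close the loop, here is a minimal end-to-end sketch of invoking ERiC and reading the resulting hierarchy. It is not taken from the ELKI sources above: the toy data is made up, and a real run must add the algorithm's required parameters (neighborhood size and thresholds, whose option IDs depend on the ELKI version) to the ListParameterization before instantiating.

import de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.ERiC;
import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.DoubleVector;
import de.lmu.ifi.dbs.elki.data.model.CorrelationModel;
import de.lmu.ifi.dbs.elki.database.Database;
import de.lmu.ifi.dbs.elki.database.StaticArrayDatabase;
import de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection;
import de.lmu.ifi.dbs.elki.utilities.ClassGenericsUtil;
import de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization;

public class ERiCExample {
    public static void main(String[] args) {
        // Toy data lying roughly on a line, so a 1-dimensional correlation
        // cluster can emerge (values are illustrative).
        double[][] data = { { 0., 0. }, { 1., 1. }, { 2., 2. }, { 3., 3.1 } };
        Database db = new StaticArrayDatabase(new ArrayAdapterDatabaseConnection(data), null);
        db.initialize();
        ListParameterization params = new ListParameterization();
        // Add the required ERiC parameters here.
        ERiC<DoubleVector> eric = ClassGenericsUtil.parameterizeOrAbort(ERiC.class, params);
        Clustering<CorrelationModel> clustering = eric.run(db);
        for (Cluster<CorrelationModel> c : clustering.getToplevelClusters()) {
            System.out.println(c.getNameAutomatic() + ": " + c.size() + " objects");
        }
    }
}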

Aggregations

Cluster (de.lmu.ifi.dbs.elki.data.Cluster): 38
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 21
Model (de.lmu.ifi.dbs.elki.data.model.Model): 18
ArrayList (java.util.ArrayList): 14
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 13
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 11
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 11
SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel): 8
Subspace (de.lmu.ifi.dbs.elki.data.Subspace): 7
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 7
ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel): 6
StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress): 6
ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs): 5
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs): 4
HashMap (java.util.HashMap): 4
ByLabelOrAllInOneClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelOrAllInOneClustering): 3
NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector): 3
ProxyDatabase (de.lmu.ifi.dbs.elki.database.ProxyDatabase): 3
DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter): 3
HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs): 3