Example 16 with Model

Use of de.lmu.ifi.dbs.elki.data.model.Model in project elki by elki-project.

The class CASH, method run().

/**
 * Run CASH on the relation.
 *
 * @param database Database
 * @param vrel Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<V> vrel) {
    fulldatabase = preprocess(database, vrel);
    processedIDs = DBIDUtil.newHashSet(fulldatabase.size());
    noiseDim = dimensionality(fulldatabase);
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("CASH Clustering", fulldatabase.size(), LOG) : null;
    Clustering<Model> result = doRun(fulldatabase, progress);
    LOG.ensureCompleted(progress);
    if (LOG.isVerbose()) {
        StringBuilder msg = new StringBuilder(1000);
        for (Cluster<Model> c : result.getAllClusters()) {
            if (c.getModel() instanceof LinearEquationModel) {
                LinearEquationModel s = (LinearEquationModel) c.getModel();
                msg.append("\n Cluster: Dim: " + s.getLes().subspacedim() + " size: " + c.size());
            } else {
                msg.append("\n Cluster: " + c.getModel().getClass().getName() + " size: " + c.size());
            }
        }
        LOG.verbose(msg.toString());
    }
    return result;
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), LinearEquationModel (de.lmu.ifi.dbs.elki.data.model.LinearEquationModel), ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel), Model (de.lmu.ifi.dbs.elki.data.model.Model)
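
For context, here is a minimal sketch of how a caller might consume the Clustering<Model> returned by run above, distinguishing clusters by their model type. The summarize helper is hypothetical, not part of ELKI; it only uses accessors already visible in the snippet (getAllClusters, getModel, getLes, size).

import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.LinearEquationModel;
import de.lmu.ifi.dbs.elki.data.model.Model;

// Hypothetical helper: report each cluster of a CASH result.
public static void summarize(Clustering<Model> result) {
    for (Cluster<Model> c : result.getAllClusters()) {
        Model m = c.getModel();
        if (m instanceof LinearEquationModel) {
            // CASH clusters carry a linear equation system describing their subspace.
            LinearEquationModel lem = (LinearEquationModel) m;
            System.out.println("subspace dim " + lem.getLes().subspacedim() + ", size " + c.size());
        } else {
            System.out.println(m.getClass().getSimpleName() + ", size " + c.size());
        }
    }
}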

Example 17 with Model

Use of de.lmu.ifi.dbs.elki.data.model.Model in project elki by elki-project.

The class ERiC, method extractCorrelationClusters().

/**
 * Extracts the correlation clusters and noise from the COPAC result and
 * returns a mapping of correlation dimension to maps of clusters within this
 * correlation dimension. Each cluster is defined by the basis vectors
 * defining the subspace in which the cluster appears.
 *
 * @param dbscanResult the clustering result of the generalized DBSCAN run
 * @param relation the relation containing the objects
 * @param dimensionality the dimensionality of the feature space
 * @param npred ERiC predicate
 * @return a list of clusters for each dimensionality
 */
private List<List<Cluster<CorrelationModel>>> extractCorrelationClusters(Clustering<Model> dbscanResult, Relation<V> relation, int dimensionality, ERiCNeighborPredicate<V>.Instance npred) {
    // result
    List<List<Cluster<CorrelationModel>>> clusterMap = new ArrayList<>();
    for (int i = 0; i <= dimensionality; i++) {
        clusterMap.add(new ArrayList<Cluster<CorrelationModel>>());
    }
    // noise cluster containing all noise objects over all partitions
    Cluster<Model> noise = null;
    // iterate over correlation dimensions
    for (Cluster<Model> clus : dbscanResult.getAllClusters()) {
        DBIDs group = clus.getIDs();
        int dim = clus.isNoise() ? dimensionality : npred.dimensionality(clus.getIDs().iter());
        if (dim < dimensionality) {
            EigenPairFilter filter = new FirstNEigenPairFilter(dim);
            // get cluster list for this dimension.
            List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(dim);
            SortedEigenPairs epairs = settings.pca.processIds(group, relation).getEigenPairs();
            int numstrong = filter.filter(epairs.eigenValues());
            PCAFilteredResult pcares = new PCAFilteredResult(epairs, numstrong, 1., 0.);
            double[] centroid = Centroid.make(relation, group).getArrayRef();
            Cluster<CorrelationModel> correlationCluster = new Cluster<>("[" + dim + "_" + correlationClusters.size() + "]", group, new CorrelationModel(pcares, centroid));
            correlationClusters.add(correlationCluster);
        } else {
            // partition containing noise
            if (noise == null) {
                noise = clus;
            } else {
                ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs());
                merged.addDBIDs(clus.getIDs());
                noise.setIDs(merged);
            }
        }
    }
    if (noise != null && noise.size() > 0) {
        // get cluster list for this dimension.
        List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(dimensionality);
        EigenPairFilter filter = new FirstNEigenPairFilter(dimensionality);
        SortedEigenPairs epairs = settings.pca.processIds(noise.getIDs(), relation).getEigenPairs();
        int numstrong = filter.filter(epairs.eigenValues());
        PCAFilteredResult pcares = new PCAFilteredResult(epairs, numstrong, 1., 0.);
        double[] centroid = Centroid.make(relation, noise.getIDs()).getArrayRef();
        Cluster<CorrelationModel> correlationCluster = new Cluster<>("[noise]", noise.getIDs(), new CorrelationModel(pcares, centroid));
        correlationClusters.add(correlationCluster);
    }
    // Delete dimensionalities not found.
    for (int i = dimensionality; i > 0; i--) {
        if (!clusterMap.get(i).isEmpty()) {
            break;
        }
        clusterMap.remove(i);
    }
    return clusterMap;
}
Also used: EigenPairFilter (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.filter.EigenPairFilter), FirstNEigenPairFilter (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.filter.FirstNEigenPairFilter), PercentageEigenPairFilter (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.filter.PercentageEigenPairFilter), DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs), ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), CorrelationModel (de.lmu.ifi.dbs.elki.data.model.CorrelationModel), Model (de.lmu.ifi.dbs.elki.data.model.Model), SortedEigenPairs (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.SortedEigenPairs), PCAFilteredResult (de.lmu.ifi.dbs.elki.math.linearalgebra.pca.PCAFilteredResult), ArrayList (java.util.ArrayList), List (java.util.List)
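
The returned map is indexed by correlation dimension: index i holds the clusters of dimension i, and the last list (index = dimensionality) holds the noise cluster, if any. A short, hypothetical consumer may help make that shape concrete; printClusterMap below is an illustration only, using just List and Cluster accessors from the snippet.

import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.model.CorrelationModel;
import java.util.List;

// Hypothetical helper: print the cluster counts per correlation dimension.
public static void printClusterMap(List<List<Cluster<CorrelationModel>>> clusterMap) {
    for (int dim = 0; dim < clusterMap.size(); dim++) {
        List<Cluster<CorrelationModel>> clusters = clusterMap.get(dim);
        System.out.println("corrDim " + dim + ": " + clusters.size() + " clusters");
        for (Cluster<CorrelationModel> c : clusters) {
            System.out.println("  cluster size " + c.size());
        }
    }
}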

Example 18 with Model

Use of de.lmu.ifi.dbs.elki.data.model.Model in project elki by elki-project.

The class ERiC, method run().

/**
 * Performs the ERiC algorithm on the given database.
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<CorrelationModel> run(Database database, Relation<V> relation) {
    final int dimensionality = RelationUtil.dimensionality(relation);
    StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null;
    // Run Generalized DBSCAN
    LOG.beginStep(stepprog, 1, "Preprocessing local correlation dimensionalities and partitioning data");
    // FIXME: how to ensure we are running on the same relation?
    ERiCNeighborPredicate<V>.Instance npred = new ERiCNeighborPredicate<V>(settings).instantiate(database, relation);
    CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database);
    Clustering<Model> copacResult = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run();
    // extract correlation clusters
    LOG.beginStep(stepprog, 2, "Extract correlation clusters");
    List<List<Cluster<CorrelationModel>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality, npred);
    if (LOG.isDebugging()) {
        StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters...");
        for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
            List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
            msg.append("\n\ncorrDim ").append(corrDim);
            for (Cluster<CorrelationModel> cluster : correlationClusters) {
                msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
            // .append(", level: ").append(cluster.getLevel()).append(", index:
            // ").append(cluster.getLevelIndex());
            // msg.append("\n basis " +
            // cluster.getPCA().getWeakEigenvectors().toString(" ", NF) +
            // " ids " + cluster.getIDs().size());
            }
        }
        LOG.debugFine(msg.toString());
    }
    if (LOG.isVerbose()) {
        int clusters = 0;
        for (List<Cluster<CorrelationModel>> correlationClusters : clusterMap) {
            clusters += correlationClusters.size();
        }
        LOG.verbose(clusters + " clusters extracted.");
    }
    // build hierarchy
    LOG.beginStep(stepprog, 3, "Building hierarchy");
    Clustering<CorrelationModel> clustering = new Clustering<>("ERiC clustering", "eric-clustering");
    buildHierarchy(clustering, clusterMap, npred);
    if (LOG.isDebugging()) {
        StringBuilder msg = new StringBuilder("Step 3: Build hierarchy");
        for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
            List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
            for (Cluster<CorrelationModel> cluster : correlationClusters) {
                msg.append("\n  cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
                // ").append(cluster.getLevelIndex());
                for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) {
                    msg.append("\n   parent ").append(iter.get());
                }
                for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) {
                    msg.append("\n   child ").append(iter.get());
                }
            }
        }
        LOG.debugFine(msg.toString());
    }
    LOG.setCompleted(stepprog);
    for (Cluster<CorrelationModel> rc : clusterMap.get(clusterMap.size() - 1)) {
        clustering.addToplevelCluster(rc);
    }
    return clustering;
}
Also used: ERiCNeighborPredicate (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.ERiCNeighborPredicate), MinPtsCorePredicate (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.MinPtsCorePredicate), CorePredicate (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate), GeneralizedDBSCAN (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.GeneralizedDBSCAN), DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs), ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), Clustering (de.lmu.ifi.dbs.elki.data.Clustering), CorrelationModel (de.lmu.ifi.dbs.elki.data.model.CorrelationModel), Model (de.lmu.ifi.dbs.elki.data.model.Model), StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress), ArrayList (java.util.ArrayList), List (java.util.List)
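
Since buildHierarchy links clusters across correlation dimensions, a depth-first walk over the result can be useful. The sketch below is hypothetical; it reuses only calls visible in the debug loop above (getClusterHierarchy, iterChildren, getToplevelClusters) and assumes the It iterator type lives in de.lmu.ifi.dbs.elki.utilities.datastructures.iterator, as in ELKI 0.7.x.

import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.CorrelationModel;
import de.lmu.ifi.dbs.elki.utilities.datastructures.iterator.It;

// Hypothetical helper: depth-first print of the ERiC cluster hierarchy.
public static void printHierarchy(Clustering<CorrelationModel> clustering) {
    for (Cluster<CorrelationModel> top : clustering.getToplevelClusters()) {
        printSubtree(clustering, top, 0);
    }
}

private static void printSubtree(Clustering<CorrelationModel> clustering, Cluster<CorrelationModel> cluster, int depth) {
    StringBuilder line = new StringBuilder();
    for (int i = 0; i < depth; i++) {
        line.append("  ");
    }
    line.append("cluster, ids: ").append(cluster.size());
    System.out.println(line);
    // Recurse into the children of this cluster.
    for (It<Cluster<CorrelationModel>> it = clustering.getClusterHierarchy().iterChildren(cluster); it.valid(); it.advance()) {
        printSubtree(clustering, it.get(), depth + 1);
    }
}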

Example 19 with Model

Use of de.lmu.ifi.dbs.elki.data.model.Model in project elki by elki-project.

The class NaiveAgglomerativeHierarchicalClustering3, method run().

/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public Result run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();
    if (size > 0x10000) {
        throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    if (Linkage.SINGLE.equals(linkage)) {
        LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    }
    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter();
    // Position counter - must agree with computeOffset!
    int pos = 0;
    boolean square = Linkage.WARD.equals(linkage) && !getDistanceFunction().isSquared();
    for (int x = 0; ix.valid(); x++, ix.advance()) {
        iy.seek(0);
        for (int y = 0; y < x; y++, iy.advance()) {
            scratch[pos] = dq.distance(ix, iy);
            // Ward uses variances -- i.e. squared values
            if (square) {
                scratch[pos] *= scratch[pos];
            }
            pos++;
        }
    }
    // Initialize space for result:
    double[] height = new double[size];
    Arrays.fill(height, Double.POSITIVE_INFINITY);
    // Parent node, to track merges
    // have every object point to itself initially
    ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
    // Active clusters, when not trivial.
    Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
    // Repeat until everything merged, except the desired number of clusters:
    final int stop = size - numclusters;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
    for (int i = 0; i < stop; i++) {
        double min = Double.POSITIVE_INFINITY;
        int minx = -1, miny = -1;
        for (int x = 0; x < size; x++) {
            if (height[x] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int xbase = triangleSize(x);
            for (int y = 0; y < x; y++) {
                if (height[y] < Double.POSITIVE_INFINITY) {
                    continue;
                }
                final int idx = xbase + y;
                if (scratch[idx] < min) {
                    min = scratch[idx];
                    minx = x;
                    miny = y;
                }
            }
        }
        assert (minx >= 0 && miny >= 0);
        // Avoid allocating memory, by reusing existing iterators:
        ix.seek(minx);
        iy.seek(miny);
        // Perform merge in data structure: x -> y
        // Since y < x, prefer keeping y, dropping x.
        height[minx] = min;
        parent.set(minx, iy);
        // Merge into cluster
        ModifiableDBIDs cx = clusters.get(minx);
        ModifiableDBIDs cy = clusters.get(miny);
        // cluster sizes, for averaging
        int sizex = 1, sizey = 1;
        if (cy == null) {
            cy = DBIDUtil.newHashSet();
            cy.add(iy);
        } else {
            sizey = cy.size();
        }
        if (cx == null) {
            cy.add(ix);
        } else {
            sizex = cx.size();
            cy.addDBIDs(cx);
            clusters.remove(minx);
        }
        clusters.put(miny, cy);
        // Update distance matrix. Note: miny < minx
        // Implementation note: most will not need sizej, and could save the
        // hashmap lookup.
        final int xbase = triangleSize(minx), ybase = triangleSize(miny);
        // Write to (y, j), with j < y
        for (int j = 0; j < miny; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final DBIDs idsj = clusters.get(j);
            final int sizej = (idsj == null) ? 1 : idsj.size();
            scratch[ybase + j] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[ybase + j], sizej, min);
        }
        // Write to (j, y), with y < j < x
        for (int j = miny + 1; j < minx; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            final DBIDs idsj = clusters.get(j);
            final int sizej = (idsj == null) ? 1 : idsj.size();
            scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[jbase + miny], sizej, min);
        }
        // Write to (j, y), with y < x < j
        for (int j = minx + 1; j < size; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final DBIDs idsj = clusters.get(j);
            final int sizej = (idsj == null) ? 1 : idsj.size();
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Build the clustering result
    final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
    for (int x = 0; x < size; x++) {
        if (height[x] < Double.POSITIVE_INFINITY) {
            DBIDs cids = clusters.get(x);
            if (cids == null) {
                ix.seek(x);
                cids = DBIDUtil.deref(ix);
            }
            Cluster<Model> cluster = new Cluster<>("Cluster", cids);
            dendrogram.addToplevelCluster(cluster);
        }
    }
    return dendrogram;
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), Clustering (de.lmu.ifi.dbs.elki.data.Clustering), Model (de.lmu.ifi.dbs.elki.data.model.Model), AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException), Int2ReferenceOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ReferenceOpenHashMap)
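
The scratch array above packs the lower triangle of the symmetric distance matrix into one dimension: entry (x, y) with y < x lives at triangleSize(x) + y. The following standalone demo (plain Java, no ELKI dependency) illustrates that indexing convention; the triangleSize body shown is the usual x * (x - 1) / 2 count of entries below row x, which is what the code above assumes.

// Standalone illustration of the packed lower-triangular layout.
public class TriangleDemo {
    // Number of matrix entries strictly below row x: x * (x - 1) / 2.
    static int triangleSize(int x) {
        return (x * (x - 1)) >>> 1;
    }

    public static void main(String[] args) {
        final int size = 4;
        double[] scratch = new double[triangleSize(size)];
        // Sequential fill, in the same order as the distance loop above.
        int pos = 0;
        for (int x = 0; x < size; x++) {
            for (int y = 0; y < x; y++) {
                scratch[pos++] = 10 * x + y;
            }
        }
        // Random access must agree with the fill order:
        // prints 31.0, the entry stored for the pair (3, 1).
        System.out.println(scratch[triangleSize(3) + 1]);
    }
}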

Example 20 with Model

Use of de.lmu.ifi.dbs.elki.data.model.Model in project elki by elki-project.

The class DBSCAN, method run().

/**
 * Performs the DBSCAN algorithm on the given relation.
 *
 * @param relation the relation to process
 * @return Clustering result
 */
public Clustering<Model> run(Relation<O> relation) {
    final int size = relation.size();
    if (size < minpts) {
        Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
        result.addToplevelCluster(new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
        return result;
    }
    RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
    resultList = new ArrayList<>();
    noise = DBIDUtil.newHashSet();
    runDBSCAN(relation, rangeQuery);
    double averagen = ncounter / (double) relation.size();
    LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
    if (averagen < 1 + 0.1 * (minpts - 1)) {
        LOG.warning("There are very few neighbors found. Epsilon may be too small.");
    }
    if (averagen > 100 * minpts) {
        LOG.warning("There are very many neighbors found. Epsilon may be too large.");
    }
    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    for (ModifiableDBIDs res : resultList) {
        result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
    }
    result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
    return result;
}
Also used: DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic), ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel), Model (de.lmu.ifi.dbs.elki.data.model.Model), ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs), ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs), Clustering (de.lmu.ifi.dbs.elki.data.Clustering)
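
Note that the returned clustering always ends with a noise cluster, flagged via the boolean noise argument of the Cluster constructor. A minimal, hypothetical consumer that separates clustered points from noise, using only the isNoise and size accessors already seen above:

import de.lmu.ifi.dbs.elki.data.Cluster;
import de.lmu.ifi.dbs.elki.data.Clustering;
import de.lmu.ifi.dbs.elki.data.model.Model;

// Hypothetical helper: tally clustered points versus noise points.
public static void report(Clustering<Model> result) {
    int clustered = 0, noisePoints = 0;
    for (Cluster<Model> c : result.getAllClusters()) {
        if (c.isNoise()) {
            noisePoints += c.size();
        } else {
            clustered += c.size();
        }
    }
    System.out.println(clustered + " clustered points, " + noisePoints + " noise points.");
}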

Aggregations

Model (de.lmu.ifi.dbs.elki.data.model.Model): 60
Database (de.lmu.ifi.dbs.elki.database.Database): 29
Test (org.junit.Test): 24
Clustering (de.lmu.ifi.dbs.elki.data.Clustering): 21
Cluster (de.lmu.ifi.dbs.elki.data.Cluster): 18
AbstractClusterAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractClusterAlgorithmTest): 17
ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel): 13
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs): 12
ELKIBuilder (de.lmu.ifi.dbs.elki.utilities.ELKIBuilder): 11
DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs): 10
ArrayList (java.util.ArrayList): 9
DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector): 8
DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter): 8
AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException): 8
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 7
HashMap (java.util.HashMap): 5
ByLabelClustering (de.lmu.ifi.dbs.elki.algorithm.clustering.trivial.ByLabelClustering): 3
SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel): 3
IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress): 3
CorePredicate (de.lmu.ifi.dbs.elki.algorithm.clustering.gdbscan.CorePredicate): 2