Search in sources :

Example 66 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class KMeansSort method run.

/**
 * Cluster the relation with k-means, using the per-center neighbor table
 * ({@code cnum}) and the center-to-center distances ({@code cdist}) that are
 * refreshed before every assignment pass.
 *
 * @param database Database passed on to the initializer
 * @param relation Data relation to cluster
 * @return Clustering holding one top-level cluster per non-empty center; an
 *         empty clustering when the relation has no objects
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    final int size = relation.size();
    // Nothing to cluster: hand back an empty result.
    if (size <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    // Starting centers come from the configured initializer.
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // One id set per cluster, sized for roughly twice the average share.
    final List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int c = 0; c < k; c++) {
        clusters.add(DBIDUtil.newHashSet((int) (size * 2. / k)));
    }
    // Per-object cluster index; -1 marks "not assigned yet".
    final WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    final double[] varsum = new double[k];
    // Pairwise center distances and, per center, the indexes of the other centers.
    final double[][] cdist = new double[k][k];
    final int[][] cnum = new int[k][k - 1];
    // Progress and statistics objects exist only when the respective logging is on.
    IndefiniteProgress prog = null;
    if (LOG.isVerbose()) {
        prog = new IndefiniteProgress("K-Means iteration", LOG);
    }
    DoubleStatistic varstat = null;
    if (LOG.isStatistics()) {
        varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
    }
    LongStatistic diststat = null;
    if (LOG.isStatistics()) {
        diststat = new LongStatistic(KEY + ".distance-computations");
    }
    // maxiter <= 0 means "no iteration limit".
    int iteration = 0;
    while (maxiter <= 0 || iteration < maxiter) {
        LOG.incrementProcessed(prog);
        recomputeSeperation(means, cdist, cnum, diststat);
        final boolean reassigned = assignToNearestCluster(relation, means, clusters, assignment, varsum, cdist, cnum, diststat);
        logVarstat(varstat, varsum);
        if (LOG.isStatistics()) {
            LOG.statistics(diststat);
        }
        // Converged: the last pass moved no object.
        if (!reassigned) {
            break;
        }
        means = means(clusters, means, relation);
        ++iteration;
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Package every non-empty cluster together with its mean and variance sum.
    final Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    final int numClusters = clusters.size();
    for (int c = 0; c < numClusters; c++) {
        final DBIDs members = clusters.get(c);
        if (members.size() != 0) {
            final KMeansModel model = new KMeansModel(means[c], varsum[c]);
            result.addToplevelCluster(new Cluster<>(members, model));
        }
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 67 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class KMeansSort method assignToNearestCluster.

/**
 * Reassign every object to its nearest mean, skipping distance computations
 * that the center-to-center distances prove unnecessary.
 * <p>
 * For each object the search starts at its previous center {@code ini} and
 * walks the candidate list {@code cnum[ini]}. The walk stops at the first
 * candidate whose distance from {@code ini} reaches {@code mult} times the
 * object's distance to {@code ini}: by the triangle inequality such a
 * center cannot be closer than {@code ini}. The early stop is only valid if
 * {@code cnum[ini]} is ordered by ascending {@code cdist[ini][.]}; that
 * ordering is assumed to be established by the caller (it is not visible
 * in this method).
 * <p>
 * All clusters and variance sums are reset at the start, so they describe
 * only the new assignment on return.
 *
 * @param relation Data
 * @param means Current means
 * @param clusters Current clusters (cleared and refilled)
 * @param assignment Cluster assignment (updated in place)
 * @param varsum Variance sum counter (zeroed and refilled)
 * @param cdist Centroid distances
 * @param cnum Centroid nearest neighbors
 * @param diststat Distance statistics, may be {@code null}
 * @return true when at least one object was reassigned
 */
private boolean assignToNearestCluster(Relation<V> relation, double[][] means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum, double[][] cdist, int[][] cnum, LongStatistic diststat) {
    assert (k == means.length);
    long dists = 0;
    boolean changed = false;
    // Reset all clusters
    Arrays.fill(varsum, 0.);
    for (ModifiableDBIDs cluster : clusters) {
        cluster.clear();
    }
    // Pruning factor: 4 for squared Euclidean distance, 2 for any other distance function.
    double mult = (distanceFunction instanceof SquaredEuclideanDistanceFunction) ? 4 : 2;
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        // Objects without a previous assignment (negative value) start at center 0.
        final int cur = assignment.intValue(iditer), ini = cur >= 0 ? cur : 0;
        // Distance to current mean:
        V fv = relation.get(iditer);
        double mindist = distanceFunction.distance(fv, DoubleVector.wrap(means[ini]));
        ++dists;
        final double threshold = mult * mindist;
        int minIndex = ini;
        for (int i : cnum[ini]) {
            // Both the candidate list and the threshold belong to the initial
            // center, so the bound must be tested against cdist[ini], not
            // against the current best center. Testing cdist[minIndex] could
            // end the scan early, while closer candidates are still ahead.
            if (cdist[ini][i] >= threshold) {
                // All following can only be worse.
                break;
            }
            double dist = distanceFunction.distance(fv, DoubleVector.wrap(means[i]));
            ++dists;
            if (dist < mindist) {
                minIndex = i;
                mindist = dist;
            }
        }
        varsum[minIndex] += mindist;
        clusters.get(minIndex).add(iditer);
        // putInt returns the previous value; a difference means the object moved.
        changed |= assignment.putInt(iditer, minIndex) != minIndex;
    }
    // Increment distance computations counter.
    if (diststat != null) {
        diststat.increment(dists);
    }
    return changed;
}
Also used : SquaredEuclideanDistanceFunction(de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 68 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class OPTICSXi method extractClusters.

/**
 * Extract clusters from a cluster order result.
 * <p>
 * The cluster order is scanned once. Steep-down areas are collected in a
 * list; whenever a steep-up area is found, it is paired with every stored
 * steep-down area (newest first) and each pair that passes the checks below
 * becomes a cluster. Objects are claimed by the first cluster that covers
 * them; objects left unclaimed at the end are placed in one enclosing
 * top-level cluster.
 *
 * @param clusterOrderResult cluster order result
 * @param relation Relation
 * @param ixi Parameter 1 - Xi
 * @param minpts Parameter minPts
 * @return Clustering with the extracted hierarchy; the cluster order result
 *         is attached to it as a child result
 */
private Clustering<OPTICSModel> extractClusters(ClusterOrder clusterOrderResult, Relation<?> relation, double ixi, int minpts) {
    ArrayDBIDs clusterOrder = clusterOrderResult.ids;
    DoubleDataStore reach = clusterOrderResult.reachability;
    // Reusable iterator and variable for random access into the cluster order.
    DBIDArrayIter tmp = clusterOrder.iter();
    DBIDVar tmp2 = DBIDUtil.newVar();
    // Maximum-in-between: largest reachability seen since the last steep area.
    double mib = 0.0;
    // Steep areas are only recorded for the result when keepsteep is set.
    List<SteepArea> salist = keepsteep ? new ArrayList<SteepArea>() : null;
    // Steep-down areas that may still open a cluster.
    List<SteepDownArea> sdaset = new ArrayList<>();
    final Clustering<OPTICSModel> clustering = new Clustering<>("OPTICS Xi-Clusters", "optics");
    // Clusters that do not have a parent yet.
    HashSet<Cluster<OPTICSModel>> curclusters = new HashSet<>();
    // Starts with all ids; an id is removed when a cluster claims it.
    HashSetModifiableDBIDs unclaimedids = DBIDUtil.newHashSet(relation.getDBIDs());
    FiniteProgress scanprog = LOG.isVerbose() ? new FiniteProgress("OPTICS Xi cluster extraction", clusterOrder.size(), LOG) : null;
    // No update clause: each branch below advances the scan position itself.
    for (SteepScanPosition scan = new SteepScanPosition(clusterOrderResult); scan.hasNext(); ) {
        if (scanprog != null) {
            scanprog.setProcessed(scan.index, LOG);
        }
        // Update maximum-inbetween
        mib = MathUtil.max(mib, scan.getReachability());
        // The last point cannot be the start of a steep area.
        if (!scan.next.valid()) {
            break;
        }
        // Xi-steep down area
        if (scan.steepDown(ixi)) {
            // Update mib values with current mib and filter
            updateFilterSDASet(mib, sdaset, ixi);
            final double startval = scan.getReachability();
            mib = 0.;
            int startsteep = scan.index, endsteep = scan.index;
            // Extend the area while it keeps going down; endsteep marks the last xi-steep point.
            for (scan.next(); scan.hasNext(); scan.next()) {
                // still steep - continue.
                if (scan.steepDown(ixi)) {
                    endsteep = scan.index;
                    continue;
                }
                // Always stop looking after minpts "flat" steps.
                if (!scan.steepDown(1.0) || scan.index - endsteep > minpts) {
                    break;
                }
            }
            final SteepDownArea sda = new SteepDownArea(startsteep, endsteep, startval, 0);
            if (LOG.isDebuggingFinest()) {
                LOG.debugFinest("New steep down area: " + sda.toString());
            }
            sdaset.add(sda);
            if (salist != null) {
                salist.add(sda);
            }
            continue;
        }
        // Xi-steep up area
        if (scan.steepUp(ixi)) {
            // Update mib values with current mib and filter
            updateFilterSDASet(mib, sdaset, ixi);
            final SteepUpArea sua;
            // Compute steep-up area
            {
                int startsteep = scan.index, endsteep = scan.index;
                mib = scan.getReachability();
                // Reachability of the successor of the last steep point.
                double esuccr = scan.getNextReachability();
                // Find end of steep-up-area, eventually updating mib again
                while (!Double.isInfinite(esuccr) && scan.hasNext()) {
                    scan.next();
                    // still steep - continue.
                    if (scan.steepUp(ixi)) {
                        endsteep = scan.index;
                        mib = scan.getReachability();
                        esuccr = scan.getNextReachability();
                        continue;
                    }
                    // Stop looking after minpts non-up steps.
                    if (!scan.steepUp(1.0) || scan.index - endsteep > minpts) {
                        break;
                    }
                }
                // The loop above does not advance when the successor reachability is infinite; do it here.
                if (Double.isInfinite(esuccr)) {
                    scan.next();
                }
                sua = new SteepUpArea(startsteep, endsteep, esuccr);
                if (LOG.isDebuggingFinest()) {
                    LOG.debugFinest("New steep up area: " + sua.toString());
                }
                if (salist != null) {
                    salist.add(sua);
                }
            }
            // Validate and compute clusters
            // LOG.debug("SDA size:"+sdaset.size()+" "+sdaset);
            ListIterator<SteepDownArea> sdaiter = sdaset.listIterator(sdaset.size());
            // Iterate backwards for correct hierarchy generation.
            while (sdaiter.hasPrevious()) {
                SteepDownArea sda = sdaiter.previous();
                if (LOG.isDebuggingFinest()) {
                    LOG.debugFinest("Comparing: eU=" + mib + " SDA: " + sda.toString());
                }
                // Condition 3b: end-of-steep-up > maximum-in-between lower
                // NOTE(review): the debug message below prints ">=" although this
                // branch is taken when mib * ixi < sda.getMib() - confirm wording.
                if (mib * ixi < sda.getMib()) {
                    if (LOG.isDebuggingFinest()) {
                        LOG.debugFinest("mib * ixi = " + mib * ixi + " >= sda.getMib() = " + sda.getMib());
                    }
                    continue;
                }
                // By default, clusters cover both the steep up and steep down area
                int cstart = sda.getStartIndex(), cend = MathUtil.min(sua.getEndIndex(), clusterOrder.size() - 1);
                // However, we sometimes have to adjust this (Condition 4):
                {
                    // Case b)
                    // Move the start forward while the next reachability still exceeds the steep-up maximum.
                    if (sda.getMaximum() * ixi >= sua.getMaximum()) {
                        while (// 
                        cstart < cend && reach.doubleValue(tmp.seek(cstart + 1)) > sua.getMaximum()) {
                            cstart++;
                        }
                    } else // Case c)
                    // Move the end backward while the previous reachability still exceeds the steep-down maximum.
                    if (sua.getMaximum() * ixi >= sda.getMaximum()) {
                        while (// 
                        cend > cstart && reach.doubleValue(tmp.seek(cend - 1)) > sda.getMaximum()) {
                            cend--;
                        }
                    }
                // Case a) is the default
                }
                // removes common artifacts from the Xi method
                if (!nocorrect) {
                    // Drop trailing objects whose predecessor is not in [cstart, cend).
                    simplify: while (cend > cstart) {
                        clusterOrderResult.predecessor.assignVar(tmp.seek(cend), tmp2);
                        for (int i = cstart; i < cend; i++) {
                            if (DBIDUtil.equal(tmp2, tmp.seek(i))) {
                                break simplify;
                            }
                        }
                        // Not found.
                        --cend;
                    }
                }
                // Condition 3a: obey minpts
                if (cend - cstart + 1 < minpts) {
                    if (LOG.isDebuggingFinest()) {
                        LOG.debugFinest("MinPts not satisfied.");
                    }
                    continue;
                }
                // Build the cluster
                ModifiableDBIDs dbids = DBIDUtil.newArray();
                for (int idx = cstart; idx <= cend; idx++) {
                    tmp.seek(idx);
                    // Collect only unclaimed IDs.
                    if (unclaimedids.remove(tmp)) {
                        dbids.add(tmp);
                    }
                }
                if (LOG.isDebuggingFine()) {
                    LOG.debugFine("Found cluster with " + dbids.size() + " new objects, length " + (cend - cstart + 1));
                }
                OPTICSModel model = new OPTICSModel(cstart, cend);
                Cluster<OPTICSModel> cluster = new Cluster<>("Cluster_" + cstart + "_" + cend, dbids, model);
                // Build the hierarchy
                {
                    // Parentless clusters whose index range lies inside the new one become its children.
                    Iterator<Cluster<OPTICSModel>> iter = curclusters.iterator();
                    while (iter.hasNext()) {
                        Cluster<OPTICSModel> clus = iter.next();
                        OPTICSModel omodel = clus.getModel();
                        if (model.getStartIndex() <= omodel.getStartIndex() && omodel.getEndIndex() <= model.getEndIndex()) {
                            clustering.addChildCluster(cluster, clus);
                            iter.remove();
                        }
                    }
                }
                curclusters.add(cluster);
            }
            continue;
        }
        // Flat - advance anyway.
        scan.next();
    }
    if (scanprog != null) {
        scanprog.setProcessed(clusterOrder.size(), LOG);
    }
    if (!unclaimedids.isEmpty()) {
        // Leftover objects go into one enclosing cluster. It is flagged as noise
        // when the last object of the cluster order has infinite reachability.
        boolean noise = reach.doubleValue(tmp.seek(clusterOrder.size() - 1)) >= Double.POSITIVE_INFINITY;
        Cluster<OPTICSModel> allcluster = new Cluster<>(noise ? "Noise" : "Cluster", unclaimedids, noise, new OPTICSModel(0, clusterOrder.size() - 1));
        for (Cluster<OPTICSModel> cluster : curclusters) {
            clustering.addChildCluster(allcluster, cluster);
        }
        clustering.addToplevelCluster(allcluster);
    } else {
        // Everything was claimed: the remaining parentless clusters are the top level.
        for (Cluster<OPTICSModel> cluster : curclusters) {
            clustering.addToplevelCluster(cluster);
        }
    }
    clustering.addChildResult(clusterOrderResult);
    // Attach the recorded steep areas (only collected when keepsteep is set).
    if (salist != null) {
        clusterOrderResult.addChildResult(new SteepAreaResult(salist));
    }
    return clustering;
}
Also used : OPTICSModel(de.lmu.ifi.dbs.elki.data.model.OPTICSModel) ArrayList(java.util.ArrayList) DoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) ListIterator(java.util.ListIterator) Iterator(java.util.Iterator) HashSet(java.util.HashSet) DBIDVar(de.lmu.ifi.dbs.elki.database.ids.DBIDVar) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 69 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class KMeansCompare method assignToNearestCluster.

/**
 * Assign every object to its nearest mean, using the center-to-center
 * distances to skip candidates that cannot beat the current best center.
 * <p>
 * Clusters and variance sums are reset first, so on return they reflect
 * only the new assignment.
 *
 * @param relation Data
 * @param means Current means
 * @param clusters Current clusters (cleared and refilled)
 * @param assignment Cluster assignment (updated in place)
 * @param varsum Variance sum counter (zeroed and refilled)
 * @param cdist Centroid distances
 * @param diststat Distance statistics, may be {@code null}
 * @return true when at least one object changed its cluster
 */
private boolean assignToNearestCluster(Relation<V> relation, double[][] means, List<ModifiableDBIDs> clusters, WritableIntegerDataStore assignment, double[] varsum, double[][] cdist, LongStatistic diststat) {
    assert (k == means.length);
    // Start from empty clusters and zero variance sums.
    Arrays.fill(varsum, 0.);
    for (ModifiableDBIDs members : clusters) {
        members.clear();
    }
    // Pruning factor: 4 for squared Euclidean distance, 2 for any other distance function.
    final double factor;
    if (distanceFunction instanceof SquaredEuclideanDistanceFunction) {
        factor = 4;
    } else {
        factor = 2;
    }
    long computed = 0;
    boolean anyMoved = false;
    DBIDIter it = relation.iterDBIDs();
    while (it.valid()) {
        // Unassigned objects (negative stored value) start at center 0.
        final int previous = assignment.intValue(it);
        final int start = previous < 0 ? 0 : previous;
        final V vec = relation.get(it);
        // Distance to the starting center fixes the pruning bound for this object.
        double best = distanceFunction.distance(vec, DoubleVector.wrap(means[start]));
        ++computed;
        final double bound = factor * best;
        int bestIdx = start;
        for (int c = 0; c < k; c++) {
            // Only evaluate centers other than the start whose distance from the
            // current best center is not at or beyond the bound.
            if (c != start && !(cdist[bestIdx][c] >= bound)) {
                final double d = distanceFunction.distance(vec, DoubleVector.wrap(means[c]));
                ++computed;
                if (d < best) {
                    bestIdx = c;
                    best = d;
                }
            }
        }
        varsum[bestIdx] += best;
        clusters.get(bestIdx).add(it);
        // putInt returns the previously stored index.
        if (assignment.putInt(it, bestIdx) != bestIdx) {
            anyMoved = true;
        }
        it.advance();
    }
    // Report the number of distance computations of this pass.
    if (diststat != null) {
        diststat.increment(computed);
    }
    return anyMoved;
}
Also used : SquaredEuclideanDistanceFunction(de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 70 with ModifiableDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs in project elki by elki-project.

the class KMeansCompare method run.

/**
 * Cluster the relation with k-means, refreshing the center-to-center
 * distances ({@code cdist}) before every assignment pass so that the
 * assignment step can skip unnecessary distance computations.
 *
 * @param database Database passed on to the initializer
 * @param relation Data relation to cluster
 * @return Clustering holding one top-level cluster per non-empty center; an
 *         empty clustering when the relation has no objects
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    final int size = relation.size();
    // An empty relation yields an empty clustering.
    if (size <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    // Starting centers come from the configured initializer.
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // One id set per cluster, sized for roughly twice the average share.
    final List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int c = 0; c < k; c++) {
        clusters.add(DBIDUtil.newHashSet((int) (size * 2. / k)));
    }
    // Per-object cluster index; -1 marks "not assigned yet".
    final WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    final double[] varsum = new double[k];
    // Pairwise center distances, recomputed in every iteration.
    final double[][] cdist = new double[k][k];
    // Progress and statistics objects exist only when the respective logging is on.
    IndefiniteProgress prog = null;
    if (LOG.isVerbose()) {
        prog = new IndefiniteProgress("K-Means iteration", LOG);
    }
    DoubleStatistic varstat = null;
    if (LOG.isStatistics()) {
        varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
    }
    LongStatistic diststat = null;
    if (LOG.isStatistics()) {
        diststat = new LongStatistic(KEY + ".distance-computations");
    }
    // maxiter <= 0 means "no iteration limit".
    int iteration = 0;
    while (maxiter <= 0 || iteration < maxiter) {
        LOG.incrementProcessed(prog);
        recomputeSeperation(means, cdist, diststat);
        final boolean reassigned = assignToNearestCluster(relation, means, clusters, assignment, varsum, cdist, diststat);
        logVarstat(varstat, varsum);
        if (LOG.isStatistics()) {
            LOG.statistics(diststat);
        }
        // Converged: the last pass moved no object.
        if (!reassigned) {
            break;
        }
        means = means(clusters, means, relation);
        ++iteration;
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Package every non-empty cluster together with its mean and variance sum.
    final Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    final int numClusters = clusters.size();
    for (int c = 0; c < numClusters; c++) {
        final DBIDs members = clusters.get(c);
        if (members.size() != 0) {
            final KMeansModel model = new KMeansModel(means[c], varsum[c]);
            result.addToplevelCluster(new Cluster<>(members, model));
        }
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Aggregations

ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)80 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)44 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)30 ArrayList (java.util.ArrayList)30 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)28 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)18 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)15 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)14 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)14 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)12 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)12 Model (de.lmu.ifi.dbs.elki.data.model.Model)11 DBID (de.lmu.ifi.dbs.elki.database.ids.DBID)11 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)10 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)10 Cluster (de.lmu.ifi.dbs.elki.data.Cluster)9 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)9 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)8 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)8 ClusterModel (de.lmu.ifi.dbs.elki.data.model.ClusterModel)7