
Example 6 with MedoidModel

use of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki by elki-project.

the class KMedoidsEM method run.

/**
 * Run k-medoids
 *
 * @param database Database
 * @param relation relation to use
 * @return result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    }
    DistanceQuery<V> distQ = null;
    // Only enforce a distance matrix for PAM initialization, which is slow.
    if (initializer instanceof PAMInitialMeans) {
        distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
    } else {
        distQ = database.getDistanceQuery(relation, getDistanceFunction());
    }
    // Choose initial medoids
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
    DBIDArrayMIter miter = medoids.iter();
    double[] mdists = new double[k];
    // Setup cluster assignment store
    List<ModifiableDBIDs> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        HashSetModifiableDBIDs set = DBIDUtil.newHashSet(relation.size() / k);
        // Add medoids.
        set.add(miter.seek(i));
        clusters.add(set);
    }
    // Initial assignment to nearest medoids
    // TODO: reuse this information, from the build phase, when possible?
    double tc = assignToNearestCluster(miter, mdists, clusters, distQ);
    if (LOG.isStatistics()) {
        LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
    }
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
    // Swap phase
    int iteration = 0;
    DBIDVar best = DBIDUtil.newVar();
    while (true) {
        boolean changed = false;
        // Try to swap the medoid with a better cluster member:
        int i = 0;
        for (miter.seek(0); miter.valid(); miter.advance(), i++) {
            best.unset();
            double bestm = mdists[i];
            for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
                if (DBIDUtil.equal(miter, iter)) {
                    continue;
                }
                double sum = 0;
                for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
                    sum += distQ.distance(iter, iter2);
                }
                if (sum < bestm) {
                    best.set(iter);
                    bestm = sum;
                }
            }
            if (best.isSet() && !DBIDUtil.equal(miter, best)) {
                changed = true;
                assert (clusters.get(i).contains(best));
                medoids.set(i, best);
                mdists[i] = bestm;
            }
        }
        // Reassign
        if (!changed) {
            break;
        }
        double nc = assignToNearestCluster(miter, mdists, clusters, distQ);
        ++iteration;
        if (LOG.isStatistics()) {
            LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
        }
        LOG.incrementProcessed(prog);
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
    for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
        result.addToplevelCluster(new Cluster<>(clusters.get(it.getOffset()), new MedoidModel(DBIDUtil.deref(it))));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) PAMInitialMeans(de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)
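The swap phase above can be read in isolation: within each cluster, the member whose summed distance to all other members is smallest becomes the new medoid, and the outer loop repeats until no medoid changes. A minimal standalone sketch of that selection criterion, using a plain distance matrix instead of ELKI's DistanceQuery (class and method names are illustrative, not part of ELKI):

import java.util.List;

public class MedoidSwapSketch {
    /**
     * Return the index (into the full data set) of the cluster member that
     * minimizes the sum of distances to all other cluster members, i.e. the
     * best medoid candidate for this cluster.
     *
     * @param dist    symmetric pairwise distance matrix over all points
     * @param cluster indices of the points currently assigned to one cluster
     */
    static int bestMedoid(double[][] dist, List<Integer> cluster) {
        int best = -1;
        double bestCost = Double.POSITIVE_INFINITY;
        for (int candidate : cluster) {
            double sum = 0;
            for (int other : cluster) {
                sum += dist[candidate][other]; // distance to itself is 0
            }
            if (sum < bestCost) {
                bestCost = sum;
                best = candidate;
            }
        }
        return best;
    }
}

Plugging the returned cost back in place of mdists[i] and reassigning points to their nearest medoid reproduces the iteration structure of the run method above.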

Example 7 with MedoidModel

use of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki by elki-project.

the class AffinityPropagationClusteringAlgorithm method run.

/**
 * Perform affinity propagation clustering.
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering result
 */
public Clustering<MedoidModel> run(Database db, Relation<O> relation) {
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();
    int[] assignment = new int[size];
    double[][] s = initialization.getSimilarityMatrix(db, relation, ids);
    double[][] r = new double[size][size];
    double[][] a = new double[size][size];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Affinity Propagation Iteration", LOG) : null;
    MutableProgress aprog = LOG.isVerbose() ? new MutableProgress("Stable assignments", size + 1, LOG) : null;
    int inactive = 0;
    for (int iteration = 0; iteration < maxiter && inactive < convergence; iteration++) {
        // Update responsibility matrix:
        for (int i = 0; i < size; i++) {
            double[] ai = a[i], ri = r[i], si = s[i];
            // Find the two largest values (as initially maxk == i)
            double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY;
            int maxk = -1;
            for (int k = 0; k < size; k++) {
                double val = ai[k] + si[k];
                if (val > max1) {
                    max2 = max1;
                    max1 = val;
                    maxk = k;
                } else if (val > max2) {
                    max2 = val;
                }
            }
            // With the maximum value known, update r:
            for (int k = 0; k < size; k++) {
                double val = si[k] - ((k != maxk) ? max1 : max2);
                ri[k] = ri[k] * lambda + val * (1. - lambda);
            }
        }
        // Update availability matrix
        for (int k = 0; k < size; k++) {
            // Compute sum of max(0, r_ik) for all i.
            // For r_kk, don't apply the max.
            double colposum = 0.;
            for (int i = 0; i < size; i++) {
                if (i == k || r[i][k] > 0.) {
                    colposum += r[i][k];
                }
            }
            for (int i = 0; i < size; i++) {
                double val = colposum;
                // Adjust column sum by the one extra term.
                if (i == k || r[i][k] > 0.) {
                    val -= r[i][k];
                }
                if (i != k && val > 0.) {
                    // min
                    val = 0.;
                }
                a[i][k] = a[i][k] * lambda + val * (1 - lambda);
            }
        }
        int changed = 0;
        for (int i = 0; i < size; i++) {
            double[] ai = a[i], ri = r[i];
            double max = Double.NEGATIVE_INFINITY;
            int maxj = -1;
            for (int j = 0; j < size; j++) {
                double v = ai[j] + ri[j];
                if (v > max || (i == j && v >= max)) {
                    max = v;
                    maxj = j;
                }
            }
            if (assignment[i] != maxj) {
                changed += 1;
                assignment[i] = maxj;
            }
        }
        inactive = (changed > 0) ? 0 : (inactive + 1);
        LOG.incrementProcessed(prog);
        if (aprog != null) {
            aprog.setProcessed(size - changed, LOG);
        }
    }
    if (aprog != null) {
        aprog.setProcessed(aprog.getTotal(), LOG);
    }
    LOG.setCompleted(prog);
    // Cluster map, by lead object
    Int2ObjectOpenHashMap<ModifiableDBIDs> map = new Int2ObjectOpenHashMap<>();
    DBIDArrayIter i1 = ids.iter();
    for (int i = 0; i1.valid(); i1.advance(), i++) {
        int c = assignment[i];
        // Add to cluster members:
        ModifiableDBIDs cids = map.get(c);
        if (cids == null) {
            cids = DBIDUtil.newArray();
            map.put(c, cids);
        }
        cids.add(i1);
    }
    // If we stopped early, the cluster lead might be in a different cluster.
    for (ObjectIterator<Int2ObjectOpenHashMap.Entry<ModifiableDBIDs>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
        Int2ObjectOpenHashMap.Entry<ModifiableDBIDs> entry = iter.next();
        final int key = entry.getIntKey();
        int targetkey = key;
        ModifiableDBIDs tids = null;
        // Chase arrows:
        while (tids == null && assignment[targetkey] != targetkey) {
            targetkey = assignment[targetkey];
            tids = map.get(targetkey);
        }
        if (tids != null && targetkey != key) {
            tids.addDBIDs(entry.getValue());
            iter.remove();
        }
    }
    Clustering<MedoidModel> clustering = new Clustering<>("Affinity Propagation Clustering", "ap-clustering");
    ModifiableDBIDs noise = DBIDUtil.newArray();
    for (ObjectIterator<Int2ObjectOpenHashMap.Entry<ModifiableDBIDs>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
        Int2ObjectOpenHashMap.Entry<ModifiableDBIDs> entry = iter.next();
        i1.seek(entry.getIntKey());
        if (entry.getValue().size() > 1) {
            MedoidModel mod = new MedoidModel(DBIDUtil.deref(i1));
            clustering.addToplevelCluster(new Cluster<>(entry.getValue(), mod));
        } else {
            noise.add(i1);
        }
    }
    if (noise.size() > 0) {
        MedoidModel mod = new MedoidModel(DBIDUtil.deref(noise.iter()));
        clustering.addToplevelCluster(new Cluster<>(noise, true, mod));
    }
    return clustering;
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) MutableProgress(de.lmu.ifi.dbs.elki.logging.progress.MutableProgress) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)
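The two update blocks above are the standard affinity propagation messages with damping factor lambda: responsibilities r(i,k) = s(i,k) - max over k' != k of (a(i,k') + s(i,k')), and availabilities a(i,k) = min(0, r(k,k) + sum of positive r(i',k) over i' not in {i,k}), with the self-availability a(k,k) summing the positive responsibilities in column k. A compact standalone sketch of one damped iteration on a plain similarity matrix (illustrative only, no ELKI types):

public class AffinityPropagationSketch {
    /** One damped update of responsibilities r and availabilities a for similarities s. */
    static void iterate(double[][] s, double[][] r, double[][] a, double lambda) {
        final int n = s.length;
        // Responsibilities: r(i,k) = s(i,k) - max_{k' != k} (a(i,k') + s(i,k'))
        for (int i = 0; i < n; i++) {
            double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY;
            int argmax = -1;
            for (int k = 0; k < n; k++) {
                double v = a[i][k] + s[i][k];
                if (v > max1) {
                    max2 = max1;
                    max1 = v;
                    argmax = k;
                } else if (v > max2) {
                    max2 = v;
                }
            }
            for (int k = 0; k < n; k++) {
                double val = s[i][k] - (k != argmax ? max1 : max2);
                r[i][k] = lambda * r[i][k] + (1 - lambda) * val;
            }
        }
        // Availabilities: a(i,k) = min(0, r(k,k) + sum_{i' not in {i,k}} max(0, r(i',k)))
        // and a(k,k) = sum_{i' != k} max(0, r(i',k))
        for (int k = 0; k < n; k++) {
            double colposum = 0;
            for (int i = 0; i < n; i++) {
                colposum += (i == k) ? r[i][k] : Math.max(0, r[i][k]);
            }
            for (int i = 0; i < n; i++) {
                double val = colposum - ((i == k) ? r[i][k] : Math.max(0, r[i][k]));
                if (i != k && val > 0) {
                    val = 0; // min(0, ...)
                }
                a[i][k] = lambda * a[i][k] + (1 - lambda) * val;
            }
        }
    }
}

Iterating until the exemplar choice argmax over k of (a(i,k) + r(i,k)) stops changing, as the convergence counter above does, yields the clustering.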

Example 8 with MedoidModel

use of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki by elki-project.

the class KMedoidsPAM method run.

/**
 * Run k-medoids
 *
 * @param database Database
 * @param relation relation to use
 * @return result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("PAM Clustering", "pam-clustering");
    }
    if (k > 0x7FFF) {
        throw new NotImplementedException("PAM supports at most " + 0x7FFF + " clusters.");
    }
    DistanceQuery<V> distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
    DBIDs ids = relation.getDBIDs();
    // Choose initial medoids
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, ids, distQ));
    if (medoids.size() != k) {
        throw new AbortException("Initializer " + initializer.toString() + " did not return " + k + " means, but " + medoids.size());
    }
    // Setup cluster assignment store
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1);
    run(distQ, ids, medoids, assignment);
    ArrayModifiableDBIDs[] clusters = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, assignment, k);
    // Wrap result
    Clustering<MedoidModel> result = new Clustering<>("PAM Clustering", "pam-clustering");
    for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
        result.addToplevelCluster(new Cluster<>(clusters[it.getOffset()], new MedoidModel(DBIDUtil.deref(it))));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) NotImplementedException(de.lmu.ifi.dbs.elki.utilities.exceptions.NotImplementedException) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)
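The actual BUILD and SWAP optimization happens in the run(distQ, ids, medoids, assignment) helper, which is not shown here; it leaves one medoid offset per point in the assignment store that partitionsFromIntegerLabels then turns into clusters. As a rough sketch of what that per-point labeling and the total deviation PAM minimizes look like on a plain distance matrix (hypothetical helper, not the ELKI method):

public class NearestMedoidAssignmentSketch {
    /**
     * Assign every point to its nearest medoid and return the total deviation
     * (the objective PAM minimizes). Labels are written as medoid offsets.
     */
    static double assign(double[][] dist, int[] medoids, int[] label) {
        double cost = 0;
        for (int p = 0; p < dist.length; p++) {
            int bestOffset = 0;
            double bestDist = dist[p][medoids[0]];
            for (int m = 1; m < medoids.length; m++) {
                double d = dist[p][medoids[m]];
                if (d < bestDist) {
                    bestDist = d;
                    bestOffset = m;
                }
            }
            label[p] = bestOffset;
            cost += bestDist;
        }
        return cost;
    }
}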

Example 9 with MedoidModel

use of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki by elki-project.

the class CLARA method run.

@Override
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("CLARA Clustering", "clara-clustering");
    }
    DBIDs ids = relation.getDBIDs();
    DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
    double best = Double.POSITIVE_INFINITY;
    ArrayModifiableDBIDs bestmedoids = null;
    WritableIntegerDataStore bestclusters = null;
    Random rnd = random.getSingleThreadedRandom();
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing random samples", numsamples, LOG) : null;
    for (int j = 0; j < numsamples; j++) {
        DBIDs rids = DBIDUtil.randomSample(ids, sampling, rnd);
        // FIXME: precompute and use a distance matrix for this sample!
        // Choose initial medoids
        ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, rids, distQ));
        // Setup cluster assignment store
        WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1);
        new /* PAM */ Instance(distQ, rids, assignment).run(medoids, maxiter);
        double score = assignRemainingToNearestCluster(medoids, ids, rids, assignment, distQ);
        if (score < best) {
            best = score;
            bestmedoids = medoids;
            bestclusters = assignment;
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    ArrayModifiableDBIDs[] clusters = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, bestclusters, k);
    // Wrap result
    Clustering<MedoidModel> result = new Clustering<>("CLARA Clustering", "clara-clustering");
    for (DBIDArrayIter it = bestmedoids.iter(); it.valid(); it.advance()) {
        MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
        result.addToplevelCluster(new Cluster<>(clusters[it.getOffset()], model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) Random(java.util.Random)
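CLARA's outer loop above is the whole trick: run PAM on a small random sample (the FIXME notes that a precomputed distance matrix for the sample is still missing), score the sampled medoids by assigning all points to their nearest medoid, and keep the best-scoring medoid set. A standalone sketch of that loop, with the per-sample PAM step passed in as a hypothetical stand-in function (not the ELKI Instance class):

import java.util.Random;
import java.util.function.Function;

public class ClaraSketch {
    /**
     * CLARA outer loop: run PAM on several random samples and keep the medoid
     * set that scores best when all points are assigned to their nearest medoid.
     *
     * @param dist        full pairwise distance matrix
     * @param pamOnSample hypothetical stand-in for PAM on a sample: maps sampled
     *                    point indices to the k chosen medoid indices
     */
    static int[] clara(double[][] dist, int numSamples, int sampleSize,
                       Function<int[], int[]> pamOnSample, Random rnd) {
        int n = dist.length;
        double bestScore = Double.POSITIVE_INFINITY;
        int[] bestMedoids = null;
        for (int s = 0; s < numSamples; s++) {
            // Draw a random sample of point indices (kept simple here).
            int[] sample = new int[sampleSize];
            for (int i = 0; i < sampleSize; i++) {
                sample[i] = rnd.nextInt(n);
            }
            int[] medoids = pamOnSample.apply(sample);
            // Score the medoids on the *full* data set, not just the sample.
            double score = 0;
            for (int p = 0; p < n; p++) {
                double best = Double.POSITIVE_INFINITY;
                for (int m : medoids) {
                    best = Math.min(best, dist[p][m]);
                }
                score += best;
            }
            if (score < bestScore) {
                bestScore = score;
                bestMedoids = medoids;
            }
        }
        return bestMedoids;
    }
}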

Example 10 with MedoidModel

use of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki by elki-project.

the class CLARANS method run.

public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("CLARANS Clustering", "clarans-clustering");
    }
    if (k * 2 >= relation.size()) {
        // Random sampling of non-medoids will be slow for huge k
        LOG.warning("A very large k was chosen. This implementation is not optimized for this case.");
    }
    DBIDs ids = relation.getDBIDs();
    DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
    final boolean metric = getDistanceFunction().isMetric();
    // Number of retries, relative rate, or absolute count:
    final int retries = (int) Math.ceil(maxneighbor < 1 ? maxneighbor * ids.size() : maxneighbor);
    Random rnd = random.getSingleThreadedRandom();
    // Might copy!
    DBIDArrayIter cand = DBIDUtil.ensureArray(ids).iter();
    // Setup cluster assignment store
    Assignment best = new Assignment(distQ, ids, DBIDUtil.newArray(k));
    Assignment curr = new Assignment(distQ, ids, DBIDUtil.newArray(k));
    Assignment scratch = new Assignment(distQ, ids, DBIDUtil.newArray(k));
    // 1. initialize
    double bestscore = Double.POSITIVE_INFINITY;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("CLARANS sampling restarts", numlocal, LOG) : null;
    for (int i = 0; i < numlocal; i++) {
        // 2. choose random initial medoids
        // TODO: should we always use uniform sampling, to be closer to the paper?
        curr.medoids.clear();
        curr.medoids.addDBIDs(DBIDUtil.randomSample(ids, k, random));
        // Cost of initial solution:
        double total = curr.assignToNearestCluster();
        // 3. Set j to 1.
        int j = 1;
        step: while (j < retries) {
            // 4 part a. choose a random non-medoid (~ neighbor in G):
            for (int r = 0; ; r++) {
                // Random point
                cand.seek(rnd.nextInt(ids.size()));
                if (curr.nearest.doubleValue(cand) > 0) {
                    // Good: not a medoid.
                    break;
                }
                // We may have many duplicate points
                if (metric && curr.second.doubleValue(cand) == 0) {
                    // Cannot yield an improvement if we are metric.
                    ++j;
                    continue step;
                } else if (!metric && !curr.medoids.contains(cand)) {
                    // Probably not a good candidate, but try nevertheless
                    break;
                }
                if (r >= 1000) {
                    throw new AbortException("Failed to choose a non-medoid in 1000 attempts. Choose k << N.");
                }
            // else: this must be the medoid.
            }
            // 4 part b. choose a random medoid to replace:
            final int otherm = rnd.nextInt(k);
            // 5. check lower cost
            double cost = curr.computeCostDifferential(cand, otherm, scratch);
            if (!(cost < 0)) {
                // 6. try again
                ++j;
                continue;
            }
            // cost is negative!
            total += cost;
            // Swap:
            Assignment tmp = curr;
            curr = scratch;
            scratch = tmp;
            j = 1;
        }
        // New best:
        if (total < bestscore) {
            // Swap:
            Assignment tmp = curr;
            curr = best;
            best = tmp;
            bestscore = total;
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    ArrayModifiableDBIDs[] clusters = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, best.assignment, k);
    // Wrap result
    Clustering<MedoidModel> result = new Clustering<>("CLARANS Clustering", "clarans-clustering");
    for (DBIDArrayIter it = best.medoids.iter(); it.valid(); it.advance()) {
        MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
        result.addToplevelCluster(new Cluster<>(clusters[it.getOffset()], model));
    }
    return result;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) MedoidModel(de.lmu.ifi.dbs.elki.data.model.MedoidModel) Random(java.util.Random) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)
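The acceptance test in step 5 relies on computeCostDifferential, which prices a single (medoid, non-medoid) swap without redoing the full assignment by using each point's cached distance to its nearest and second-nearest medoid. A standalone sketch of that differential on a plain distance matrix (illustrative only; the real Assignment class also updates the cached assignments in scratch):

public class ClaransSwapSketch {
    /**
     * Cost change of replacing the medoid with offset removeOffset by candidate
     * point cand. nearest[p] and second[p] hold each point's distance to its
     * nearest and second-nearest current medoid, nearestOffset[p] the offset of
     * the former.
     */
    static double costDifferential(double[][] dist, int cand, int removeOffset,
                                   double[] nearest, double[] second, int[] nearestOffset) {
        double delta = 0;
        for (int p = 0; p < dist.length; p++) {
            double dc = dist[p][cand];
            if (nearestOffset[p] == removeOffset) {
                // Point loses its medoid: it moves to cand or to its second-nearest medoid.
                delta += Math.min(dc, second[p]) - nearest[p];
            } else if (dc < nearest[p]) {
                // Point keeps its medoid unless cand is closer.
                delta += dc - nearest[p];
            }
        }
        return delta;
    }
}

A negative return value corresponds to the cost < 0 branch above, where the swap is accepted and the retry counter j is reset.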

Aggregations

MedoidModel (de.lmu.ifi.dbs.elki.data.model.MedoidModel) 14
AbstractClusterAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.clustering.AbstractClusterAlgorithmTest) 9
Database (de.lmu.ifi.dbs.elki.database.Database) 9
Test (org.junit.Test) 9
DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector) 7
ELKIBuilder (de.lmu.ifi.dbs.elki.utilities.ELKIBuilder) 7
Clustering (de.lmu.ifi.dbs.elki.data.Clustering) 5
WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) 2
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) 2
IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) 2
StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) 2
AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException) 2
Random (java.util.Random) 2
PAMInitialMeans (de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans) 1
ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) 1
DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) 1
ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) 1
MutableProgress (de.lmu.ifi.dbs.elki.logging.progress.MutableProgress) 1
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) 1
LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) 1