Usage of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki (elki-project): class KMedoidsEM, method run.
/**
 * Run the k-medoids EM-style clustering algorithm.
 *
 * Alternates between (a) assigning every object to its nearest medoid and
 * (b) replacing each medoid by the cluster member that minimizes the sum of
 * distances to all members of its cluster, until no medoid changes anymore.
 *
 * @param database Database
 * @param relation relation to use
 * @return result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
// Degenerate case: nothing to cluster.
if (relation.size() <= 0) {
return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
}
DistanceQuery<V> distQ = null;
// Only enforce a distance matrix for PAM initialization, which is slow.
if (initializer instanceof PAMInitialMeans) {
distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
} else {
distQ = database.getDistanceQuery(relation, getDistanceFunction());
}
// Choose initial medoids
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
}
ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
DBIDArrayMIter miter = medoids.iter();
// mdists[i] holds the current cost (distance sum) of medoid i within its cluster,
// maintained by assignToNearestCluster and the swap phase below.
double[] mdists = new double[k];
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
HashSetModifiableDBIDs set = DBIDUtil.newHashSet(relation.size() / k);
// Add medoids.
set.add(miter.seek(i));
clusters.add(set);
}
// Initial assignment to nearest medoids
// TODO: reuse this information, from the build phase, when possible?
double tc = assignToNearestCluster(miter, mdists, clusters, distQ);
if (LOG.isStatistics()) {
LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
}
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
// Swap phase
int iteration = 0;
DBIDVar best = DBIDUtil.newVar();
while (true) {
boolean changed = false;
// Try to swap the medoid with a better cluster member:
// i tracks the medoid offset in parallel with miter, indexing mdists/clusters.
int i = 0;
for (miter.seek(0); miter.valid(); miter.advance(), i++) {
best.unset();
// Cost of keeping the current medoid; a swap must beat this.
double bestm = mdists[i];
for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
if (DBIDUtil.equal(miter, iter)) {
continue;
}
// Candidate cost: sum of distances from this member to all cluster members.
double sum = 0;
for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
sum += distQ.distance(iter, iter2);
}
if (sum < bestm) {
best.set(iter);
bestm = sum;
}
}
if (best.isSet() && !DBIDUtil.equal(miter, best)) {
changed = true;
assert (clusters.get(i).contains(best));
// Replace medoid i in-place; miter stays valid because medoids is an array.
medoids.set(i, best);
mdists[i] = bestm;
}
}
// Reassign
if (!changed) {
break;
}
double nc = assignToNearestCluster(miter, mdists, clusters, distQ);
++iteration;
if (LOG.isStatistics()) {
LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
}
LOG.incrementProcessed(prog);
}
LOG.setCompleted(prog);
if (LOG.isStatistics()) {
LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
}
// Wrap result
Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
result.addToplevelCluster(new Cluster<>(clusters.get(it.getOffset()), new MedoidModel(DBIDUtil.deref(it))));
}
return result;
}
Usage of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki (elki-project): class AffinityPropagationClusteringAlgorithm, method run.
/**
 * Perform affinity propagation clustering.
 *
 * Iteratively exchanges responsibility and availability messages (damped by
 * {@code lambda}) until the exemplar assignment has been stable for
 * {@code convergence} iterations, or {@code maxiter} iterations were done.
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering result (one cluster per exemplar; singletons become noise)
 */
public Clustering<MedoidModel> run(Database db, Relation<O> relation) {
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();
  // assignment[i] = index of the current exemplar for object i.
  int[] assignment = new int[size];
  double[][] s = initialization.getSimilarityMatrix(db, relation, ids);
  double[][] r = new double[size][size]; // responsibility messages
  double[][] a = new double[size][size]; // availability messages
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("Affinity Propagation Iteration", LOG) : null;
  MutableProgress aprog = LOG.isVerbose() ? new MutableProgress("Stable assignments", size + 1, LOG) : null;
  // Number of consecutive iterations without assignment changes.
  int inactive = 0;
  for (int iteration = 0; iteration < maxiter && inactive < convergence; iteration++) {
    // Update responsibility matrix:
    for (int i = 0; i < size; i++) {
      double[] ai = a[i], ri = r[i], si = s[i];
      // Find the two largest values (as initially maxk == i)
      double max1 = Double.NEGATIVE_INFINITY, max2 = Double.NEGATIVE_INFINITY;
      int maxk = -1;
      for (int k = 0; k < size; k++) {
        double val = ai[k] + si[k];
        if (val > max1) {
          max2 = max1;
          max1 = val;
          maxk = k;
        } else if (val > max2) {
          max2 = val;
        }
      }
      // With the maximum value known, update r (using max2 at the argmax itself):
      for (int k = 0; k < size; k++) {
        double val = si[k] - ((k != maxk) ? max1 : max2);
        ri[k] = ri[k] * lambda + val * (1. - lambda);
      }
    }
    // Update availability matrix
    for (int k = 0; k < size; k++) {
      // Compute sum of max(0, r_ik) for all i.
      // For r_kk, don't apply the max.
      double colposum = 0.;
      for (int i = 0; i < size; i++) {
        if (i == k || r[i][k] > 0.) {
          colposum += r[i][k];
        }
      }
      for (int i = 0; i < size; i++) {
        double val = colposum;
        // Adjust column sum by the one extra term.
        if (i == k || r[i][k] > 0.) {
          val -= r[i][k];
        }
        if (i != k && val > 0.) {
          // min
          val = 0.;
        }
        a[i][k] = a[i][k] * lambda + val * (1 - lambda);
      }
    }
    // Recompute the exemplar assignment and count changes:
    int changed = 0;
    for (int i = 0; i < size; i++) {
      double[] ai = a[i], ri = r[i];
      double max = Double.NEGATIVE_INFINITY;
      int maxj = -1;
      for (int j = 0; j < size; j++) {
        double v = ai[j] + ri[j];
        // Ties are broken in favor of self-assignment (i == j).
        if (v > max || (i == j && v >= max)) {
          max = v;
          maxj = j;
        }
      }
      if (assignment[i] != maxj) {
        changed += 1;
        assignment[i] = maxj;
      }
    }
    inactive = (changed > 0) ? 0 : (inactive + 1);
    LOG.incrementProcessed(prog);
    if (aprog != null) {
      aprog.setProcessed(size - changed, LOG);
    }
  }
  if (aprog != null) {
    aprog.setProcessed(aprog.getTotal(), LOG);
  }
  LOG.setCompleted(prog);
  // Cluster map, by lead object
  Int2ObjectOpenHashMap<ModifiableDBIDs> map = new Int2ObjectOpenHashMap<>();
  DBIDArrayIter i1 = ids.iter();
  for (int i = 0; i1.valid(); i1.advance(), i++) {
    int c = assignment[i];
    // Add to cluster members:
    ModifiableDBIDs cids = map.get(c);
    if (cids == null) {
      cids = DBIDUtil.newArray();
      map.put(c, cids);
    }
    cids.add(i1);
  }
  // If we stopped early, the cluster lead might be in a different cluster.
  for (ObjectIterator<Int2ObjectOpenHashMap.Entry<ModifiableDBIDs>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
    Int2ObjectOpenHashMap.Entry<ModifiableDBIDs> entry = iter.next();
    final int key = entry.getIntKey();
    int targetkey = key;
    ModifiableDBIDs tids = null;
    // Chase arrows until we find an existing cluster or a self-assigned exemplar.
    // BUG FIX: the condition previously tested "ids == null", which is never
    // true (ids is assigned above), so this merge loop was dead code.
    while (tids == null && assignment[targetkey] != targetkey) {
      targetkey = assignment[targetkey];
      tids = map.get(targetkey);
    }
    if (tids != null && targetkey != key) {
      tids.addDBIDs(entry.getValue());
      iter.remove();
    }
  }
  Clustering<MedoidModel> clustering = new Clustering<>("Affinity Propagation Clustering", "ap-clustering");
  ModifiableDBIDs noise = DBIDUtil.newArray();
  for (ObjectIterator<Int2ObjectOpenHashMap.Entry<ModifiableDBIDs>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
    Int2ObjectOpenHashMap.Entry<ModifiableDBIDs> entry = iter.next();
    i1.seek(entry.getIntKey());
    if (entry.getValue().size() > 1) {
      MedoidModel mod = new MedoidModel(DBIDUtil.deref(i1));
      clustering.addToplevelCluster(new Cluster<>(entry.getValue(), mod));
    } else {
      // Singleton clusters are collected as noise.
      noise.add(i1);
    }
  }
  if (noise.size() > 0) {
    MedoidModel mod = new MedoidModel(DBIDUtil.deref(noise.iter()));
    clustering.addToplevelCluster(new Cluster<>(noise, true, mod));
  }
  return clustering;
}
Usage of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki (elki-project): class KMedoidsPAM, method run.
/**
 * Run the PAM k-medoids clustering algorithm.
 *
 * Chooses initial medoids, optimizes them with the PAM swap phase, and
 * wraps the resulting partitions into a clustering result.
 *
 * @param database Database
 * @param relation relation to use
 * @return result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
  // Nothing to do on an empty relation.
  if (relation.size() <= 0) {
    return new Clustering<>("PAM Clustering", "pam-clustering");
  }
  // Guard the implementation limit on the number of clusters.
  if (k > 0x7FFF) {
    throw new NotImplementedException("PAM supports at most " + 0x7FFF + " clusters.");
  }
  // PAM performs many repeated distance lookups; use a precomputed matrix.
  final DistanceQuery<V> dq = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
  final DBIDs dbids = relation.getDBIDs();
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  // Seed the medoids, and verify the initializer delivered exactly k of them.
  final ArrayModifiableDBIDs meds = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, dbids, dq));
  if (meds.size() != k) {
    throw new AbortException("Initializer " + initializer.toString() + " did not return " + k + " means, but " + meds.size());
  }
  // Per-object cluster index, filled in by the swap phase below.
  final WritableIntegerDataStore labels = DataStoreUtil.makeIntegerStorage(dbids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1);
  run(dq, dbids, meds, labels);
  // Materialize the partitions and attach a medoid model to each cluster.
  final ArrayModifiableDBIDs[] parts = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(dbids, labels, k);
  final Clustering<MedoidModel> result = new Clustering<>("PAM Clustering", "pam-clustering");
  for (DBIDArrayIter med = meds.iter(); med.valid(); med.advance()) {
    result.addToplevelCluster(new Cluster<>(parts[med.getOffset()], new MedoidModel(DBIDUtil.deref(med))));
  }
  return result;
}
Usage of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki (elki-project): class CLARA, method run.
@Override
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
  // Empty input yields an empty clustering.
  if (relation.size() <= 0) {
    return new Clustering<>("CLARA Clustering", "clara-clustering");
  }
  final DBIDs dbids = relation.getDBIDs();
  final DistanceQuery<V> dq = database.getDistanceQuery(relation, getDistanceFunction());
  final Random rand = random.getSingleThreadedRandom();
  // Track the best sample solution seen so far:
  double bestScore = Double.POSITIVE_INFINITY;
  ArrayModifiableDBIDs bestMedoids = null;
  WritableIntegerDataStore bestLabels = null;
  final FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing random samples", numsamples, LOG) : null;
  for (int sample = 0; sample < numsamples; sample++) {
    // Draw a random subset, run PAM on it, then extend to the full data set.
    final DBIDs sids = DBIDUtil.randomSample(dbids, sampling, rand);
    // FIXME: precompute and use a distance matrix for this sample!
    final ArrayModifiableDBIDs meds = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, sids, dq));
    final WritableIntegerDataStore labels = DataStoreUtil.makeIntegerStorage(dbids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1);
    // Optimize the medoids on the sample via the PAM swap heuristic:
    new Instance(dq, sids, labels).run(meds, maxiter);
    // Cost of assigning all remaining objects to their nearest medoid:
    final double score = assignRemainingToNearestCluster(meds, dbids, sids, labels, dq);
    if (score < bestScore) {
      bestScore = score;
      bestMedoids = meds;
      bestLabels = labels;
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  // Convert the best labeling into explicit cluster objects.
  final ArrayModifiableDBIDs[] parts = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(dbids, bestLabels, k);
  final Clustering<MedoidModel> result = new Clustering<>("CLARA Clustering", "clara-clustering");
  for (DBIDArrayIter med = bestMedoids.iter(); med.valid(); med.advance()) {
    result.addToplevelCluster(new Cluster<>(parts[med.getOffset()], new MedoidModel(DBIDUtil.deref(med))));
  }
  return result;
}
Usage of de.lmu.ifi.dbs.elki.data.model.MedoidModel in project elki (elki-project): class CLARANS, method run.
/**
 * Run the CLARANS clustering algorithm.
 *
 * Performs {@code numlocal} randomized restarts; within each restart, random
 * medoid/non-medoid swaps are tried until {@code retries} consecutive
 * attempts fail to improve the current solution. The best solution over all
 * restarts is returned.
 *
 * @param database Database
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("CLARANS Clustering", "clarans-clustering");
  }
  if (k * 2 >= relation.size()) {
    // Random sampling of non-medoids will be slow for huge k
    LOG.warning("A very large k was chosen. This implementation is not optimized for this case.");
  }
  DBIDs ids = relation.getDBIDs();
  DistanceQuery<V> distQ = database.getDistanceQuery(relation, getDistanceFunction());
  final boolean metric = getDistanceFunction().isMetric();
  // Number of retries, relative rate, or absolute count:
  final int retries = (int) Math.ceil(maxneighbor < 1 ? maxneighbor * ids.size() : maxneighbor);
  Random rnd = random.getSingleThreadedRandom();
  // Might copy!
  DBIDArrayIter cand = DBIDUtil.ensureArray(ids).iter();
  // Setup cluster assignment store
  Assignment best = new Assignment(distQ, ids, DBIDUtil.newArray(k));
  Assignment curr = new Assignment(distQ, ids, DBIDUtil.newArray(k));
  Assignment scratch = new Assignment(distQ, ids, DBIDUtil.newArray(k));
  // 1. initialize
  double bestscore = Double.POSITIVE_INFINITY;
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("CLARANS sampling restarts", numlocal, LOG) : null;
  for (int i = 0; i < numlocal; i++) {
    // 2. choose random initial medoids
    // TODO: should we always use uniform sampling, to be closer to the paper?
    curr.medoids.clear();
    // BUG FIX: sample with the single-threaded generator obtained above (as
    // done everywhere else in this method), not the RandomFactory field; a
    // fixed-seed factory would otherwise draw the identical initial sample
    // in every restart, defeating the purpose of numlocal restarts.
    curr.medoids.addDBIDs(DBIDUtil.randomSample(ids, k, rnd));
    // Cost of initial solution:
    double total = curr.assignToNearestCluster();
    // 3. Set j to 1.
    int j = 1;
    step: while (j < retries) {
      // 4 part a. choose a random non-medoid (~ neighbor in G):
      for (int r = 0; ; r++) {
        // Random point
        cand.seek(rnd.nextInt(ids.size()));
        if (curr.nearest.doubleValue(cand) > 0) {
          // Good: not a medoid.
          break;
        }
        // We may have many duplicate points
        if (metric && curr.second.doubleValue(cand) == 0) {
          // Cannot yield an improvement if we are metric.
          ++j;
          continue step;
        } else if (!metric && !curr.medoids.contains(cand)) {
          // Probably not a good candidate, but try nevertheless
          break;
        }
        if (r >= 1000) {
          throw new AbortException("Failed to choose a non-medoid in 1000 attempts. Choose k << N.");
        }
        // else: this must be the medoid.
      }
      // 4 part b. choose a random medoid to replace:
      final int otherm = rnd.nextInt(k);
      // 5. check lower cost
      double cost = curr.computeCostDifferential(cand, otherm, scratch);
      if (!(cost < 0)) {
        // 6. try again
        ++j;
        continue;
      }
      // cost is negative!
      total += cost;
      // Accept the improvement by swapping curr and scratch:
      Assignment tmp = curr;
      curr = scratch;
      scratch = tmp;
      // Reset the retry counter after an accepted swap.
      j = 1;
    }
    // New best:
    if (total < bestscore) {
      // Swap:
      Assignment tmp = curr;
      curr = best;
      best = tmp;
      bestscore = total;
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  ArrayModifiableDBIDs[] clusters = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, best.assignment, k);
  // Wrap result
  Clustering<MedoidModel> result = new Clustering<>("CLARANS Clustering", "clarans-clustering");
  for (DBIDArrayIter it = best.medoids.iter(); it.valid(); it.advance()) {
    MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
    result.addToplevelCluster(new Cluster<>(clusters[it.getOffset()], model));
  }
  return result;
}
Aggregations