Use of de.lmu.ifi.dbs.elki.algorithm.clustering.kmeans.initialization.PAMInitialMeans in project elki by elki-project.
The class KMedoidsEM, method run.
/**
 * Run the k-medoids "EM"-style iteration on a relation.
 *
 * After choosing initial medoids with the configured initializer, this
 * alternates two steps until no medoid changes anymore: (1) within every
 * cluster, replace the medoid by the cluster member whose summed distance to
 * all members of that cluster is smallest (if it improves on the currently
 * stored value), and (2) reassign the objects via
 * {@code assignToNearestCluster}.
 *
 * @param database Database, used to obtain the distance query
 * @param relation Relation containing the objects to cluster
 * @return Clustering with one top-level cluster per medoid; an empty
 *         clustering if the relation contains no objects
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
  // Nothing to cluster: return an empty result right away.
  if (relation.size() <= 0) {
    return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
  }
  // A precomputed distance matrix is only enforced for the PAM
  // initialization, which is slow; otherwise use a regular distance query.
  final DistanceQuery<V> distQ;
  if (initializer instanceof PAMInitialMeans) {
    distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
  } else {
    distQ = database.getDistanceQuery(relation, getDistanceFunction());
  }
  // Report which initialization strategy is in use.
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  // Initial medoids, an iterator over them, and one distance sum per medoid.
  ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
  DBIDArrayMIter miter = medoids.iter();
  double[] mdists = new double[k];
  // One member set per cluster, each seeded with its own medoid.
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int c = 0; c < k; c++) {
    HashSetModifiableDBIDs members = DBIDUtil.newHashSet(relation.size() / k);
    members.add(miter.seek(c));
    clusters.add(members);
  }
  // Initial assignment to the nearest medoids. The helper is defined
  // elsewhere; its return value is reported below as the iteration cost.
  // TODO: reuse this information, from the build phase, when possible?
  double tc = assignToNearestCluster(miter, mdists, clusters, distQ);
  if (LOG.isStatistics()) {
    LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
  }
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
  // Refinement: repeat "pick best medoid per cluster" + "reassign" until
  // a full pass leaves all medoids untouched.
  int iteration = 0;
  DBIDVar best = DBIDUtil.newVar();
  boolean changed;
  do {
    changed = false;
    int c = 0;
    for (miter.seek(0); miter.valid(); miter.advance(), c++) {
      best.unset();
      // Value to beat: the distance sum currently stored for this medoid.
      double bestCost = mdists[c];
      final ModifiableDBIDs members = clusters.get(c);
      for (DBIDIter cand = members.iter(); cand.valid(); cand.advance()) {
        // The current medoid itself is not a replacement candidate.
        if (!DBIDUtil.equal(miter, cand)) {
          // Sum of distances from the candidate to every cluster member.
          double cost = 0;
          for (DBIDIter other = members.iter(); other.valid(); other.advance()) {
            cost += distQ.distance(cand, other);
          }
          if (cost < bestCost) {
            best.set(cand);
            bestCost = cost;
          }
        }
      }
      // Install the better candidate as the new medoid of this cluster.
      if (best.isSet() && !DBIDUtil.equal(miter, best)) {
        changed = true;
        assert (members.contains(best));
        medoids.set(c, best);
        mdists[c] = bestCost;
      }
    }
    // Only reassign (and count an iteration) when some medoid was replaced.
    if (changed) {
      double nc = assignToNearestCluster(miter, mdists, clusters, distQ);
      ++iteration;
      if (LOG.isStatistics()) {
        LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
      }
      LOG.incrementProcessed(prog);
    }
  } while (changed);
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap result: the cluster at position i belongs to the medoid at offset i.
  Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
  for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
    ModifiableDBIDs ids = clusters.get(it.getOffset());
    MedoidModel model = new MedoidModel(DBIDUtil.deref(it));
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  return result;
}
Aggregations