use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class SameSizeKMeansAlgorithm method run.
/**
 * Cluster the relation with k-means while honoring cluster size constraints.
 *
 * Steps: choose initial means, set up one member set per cluster and the
 * per-object meta data, compute an initial assignment, recompute the means
 * once, and finally refine the result with k-means like iterations.
 *
 * @param database Database
 * @param relation relation to use
 * @return result, one top-level cluster (with a mean model) per member set
 */
@Override
public Clustering<MeanModel> run(Database database, Relation<V> relation) {
  // Objects to cluster
  final DBIDs ids = relation.getDBIDs();
  // Starting centers, as picked by the configured initializer
  double[][] centers = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // One member set per cluster; the size hint is evaluated per set, so that
  // no division happens at all when no set is requested.
  final List<ModifiableDBIDs> clusters = new ArrayList<>();
  while (clusters.size() < k) {
    clusters.add(DBIDUtil.newHashSet(relation.size() / k + 2));
  }
  // Meta data storage
  final WritableDataStore<Meta> metas = initializeMeta(relation, centers);
  // Initial assignment of the objects
  final ArrayModifiableDBIDs tids = initialAssignment(clusters, metas, ids);
  // Centers of the initial assignment
  centers = means(clusters, centers, relation);
  // k-means like refinement
  centers = refineResult(relation, centers, clusters, metas, tids);
  // Package the member sets together with their centers
  final Clustering<MeanModel> result = new Clustering<>("k-Means Samesize Clustering", "kmeans-samesize-clustering");
  int idx = 0;
  for (ModifiableDBIDs members : clusters) {
    result.addToplevelCluster(new Cluster<>(members, new MeanModel(centers[idx])));
    idx++;
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class SNNClustering method run.
/**
 * Run the shared-nearest-neighbor (SNN) clustering.
 *
 * If the relation holds fewer than {@code minpts} objects, every object is
 * reported as noise. Otherwise each not yet processed object is handed to
 * {@code expandCluster}. The noise set is always appended as the last
 * top-level cluster, even when it is empty.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 */
public Clustering<Model> run(Database database, Relation<O> relation) {
  final SimilarityQuery<O> snnInstance = similarityFunction.instantiate(relation);
  // Progress loggers are only created in verbose mode
  final boolean verbose = LOG.isVerbose();
  FiniteProgress objprog = null;
  IndefiniteProgress clusprog = null;
  if (verbose) {
    objprog = new FiniteProgress("SNNClustering", relation.size(), LOG);
    clusprog = new IndefiniteProgress("Number of clusters", LOG);
  }
  // Reset the state kept in fields
  resultList = new ArrayList<>();
  noise = DBIDUtil.newHashSet();
  processedIDs = DBIDUtil.newHashSet(relation.size());
  if (relation.size() < minpts) {
    // Too few objects: everything is noise
    for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
      noise.add(id);
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(noise.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
    }
  } else {
    for (DBIDIter id = relation.iterDBIDs(); id.valid(); id.advance()) {
      if (!processedIDs.contains(id)) {
        expandCluster(snnInstance, id, objprog, clusprog);
        // Everything processed and nothing is noise: done early
        if (processedIDs.size() == relation.size() && noise.size() == 0) {
          break;
        }
      }
      if (objprog != null && clusprog != null) {
        objprog.setProcessed(processedIDs.size(), LOG);
        clusprog.setProcessed(resultList.size(), LOG);
      }
    }
  }
  // Finish progress logging
  LOG.ensureCompleted(objprog);
  LOG.setCompleted(clusprog);
  // Build the result: the collected clusters first, the noise set last
  final Clustering<Model> result = new Clustering<>("Shared-Nearest-Neighbor Clustering", "snn-clustering");
  for (ModifiableDBIDs members : resultList) {
    result.addToplevelCluster(new Cluster<Model>(members, ClusterModel.CLUSTER));
  }
  result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class NaiveMeanShiftClustering method run.
/**
 * Perform mean-shift clustering on the given relation.
 *
 * For every object, the position is repeatedly replaced by the kernel-weighted
 * centroid of the objects within {@code bandwidth} of it. The shifting of an
 * object ends when
 * <ul>
 * <li>too few neighbors are found (the object becomes noise),</li>
 * <li>the new position is close to an already known cluster center (the object
 * joins that cluster),</li>
 * <li>the shift distance is NaN (a warning is logged; the object is then
 * neither put into a cluster nor into the noise set), or</li>
 * <li>the shift falls below the threshold or {@code MAXITER} is reached (a new
 * cluster is opened at the new position).</li>
 * </ul>
 *
 * @param database Database
 * @param relation Data relation
 * @return Clustering result; a noise cluster is added only if noise exists
 */
public Clustering<MeanModel> run(Database database, Relation<V> relation) {
  final DistanceQuery<V> distq = database.getDistanceQuery(relation, getDistanceFunction());
  final RangeQuery<V> rangeq = database.getRangeQuery(distq);
  final NumberVector.Factory<V> factory = RelationUtil.getNumberVectorFactory(relation);
  final int dim = RelationUtil.dimensionality(relation);
  // Shifts smaller than this count as converged
  final double threshold = bandwidth * 1E-10;
  // Known clusters (center, members) and the noise objects
  final ArrayList<Pair<V, ModifiableDBIDs>> found = new ArrayList<>();
  final ModifiableDBIDs noise = DBIDUtil.newArray();
  final FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Mean-shift clustering", relation.size(), LOG) : null;
  for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
    // Start at the object itself
    V current = relation.get(iter);
    shifting: for (int round = 1; round <= MAXITER; round++) {
      final DoubleDBIDList neighbors = rangeq.getRangeForObject(current, bandwidth);
      final int count = neighbors.size();
      // Need more than one neighbor in the first round, at least one later on
      if (count <= 1 && (count < 1 || round <= 1)) {
        noise.add(iter);
        break shifting;
      }
      // Kernel-weighted centroid of the neighborhood
      final Centroid weighted = new Centroid(dim);
      for (DoubleDBIDListIter nit = neighbors.iter(); nit.valid(); nit.advance()) {
        final double w = kernel.density(nit.doubleValue() / bandwidth);
        weighted.put(relation.get(nit), w);
      }
      final V shifted = factory.newNumberVector(weighted.getArrayRef());
      // TODO: detect 0 weight!

      // Find the closest known cluster center
      Pair<V, ModifiableDBIDs> nearest = null;
      double nearestDist = Double.POSITIVE_INFINITY;
      for (Pair<V, ModifiableDBIDs> candidate : found) {
        final double d = distq.distance(shifted, candidate.first);
        if (d < nearestDist) {
          nearestDist = d;
          nearest = candidate;
        }
      }
      // How far did we move in this round?
      final double delta = distq.distance(current, shifted);
      // Both tests are false while nearestDist is still infinite, so
      // "nearest" is never null inside this branch.
      if (nearestDist < 10 * threshold || nearestDist * 2 < delta) {
        nearest.second.add(iter);
        break shifting;
      }
      final boolean lastRound = (round == MAXITER);
      if (lastRound) {
        LOG.warning("No convergence after " + MAXITER + " iterations. Distance: " + delta);
      }
      if (Double.isNaN(delta)) {
        LOG.warning("Encountered NaN distance. Invalid center vector? " + shifted.toString());
        break shifting;
      }
      if (lastRound || delta < threshold) {
        if (LOG.isDebuggingFine()) {
          LOG.debugFine("New cluster:" + shifted + " delta: " + delta + " threshold: " + threshold + " bestd: " + nearestDist);
        }
        // Open a new cluster containing just this object
        final ArrayModifiableDBIDs members = DBIDUtil.newArray();
        members.add(iter);
        found.add(new Pair<V, ModifiableDBIDs>(shifted, members));
        break shifting;
      }
      current = shifted;
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  // Wrap the clusters, then the noise (if any)
  final ArrayList<Cluster<MeanModel>> cs = new ArrayList<>(found.size());
  for (Pair<V, ModifiableDBIDs> entry : found) {
    cs.add(new Cluster<>(entry.second, new MeanModel(entry.first.toArray())));
  }
  if (noise.size() > 0) {
    cs.add(new Cluster<MeanModel>(noise, true));
  }
  return new Clustering<>("Mean-shift Clustering", "mean-shift-clustering", cs);
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class KMeansElkan method run.
/**
 * Run the k-means iteration of this class on the given relation.
 *
 * Flow visible in this method: choose initial means, create the per-object
 * assignment and bound stores, iterate (assign, stop when nothing changed,
 * recompute means, update bounds) until convergence or {@code maxiter}, then
 * wrap every non-empty cluster into a {@link KMeansModel}.
 *
 * @param database Database, passed on to the initializer
 * @param relation Relation to cluster
 * @return Clustering result; empty if the relation contains no objects
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
// Nothing to cluster: return an empty result right away.
if (relation.size() <= 0) {
return new Clustering<>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
}
double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
// Setup cluster assignment store
List<ModifiableDBIDs> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
// Size hint: twice the average cluster size.
clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
}
// Cluster index per object; -1 is passed as the initial value (presumably
// meaning "not assigned yet" - confirm against the assignment helpers).
WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
// Elkan bounds
// One upper bound per object (starting at +infinity) and one array of k
// lower bounds per object (starting at 0).
WritableDoubleDataStore upper = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Double.POSITIVE_INFINITY);
WritableDataStore<double[]> lower = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, double[].class);
for (DBIDIter it = relation.iterDBIDs(); it.valid(); it.advance()) {
// Filled with 0.
lower.put(it, new double[k]);
}
// Storage for updated means:
final int dim = means[0].length;
double[][] sums = new double[k][dim];
// Cluster separation
double[] sep = new double[k];
// Cluster distances
double[][] cdist = new double[k][k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
LongStatistic rstat = LOG.isStatistics() ? new LongStatistic(this.getClass().getName() + ".reassignments") : null;
int iteration = 0;
// maxiter <= 0 disables the iteration limit.
for (; maxiter <= 0 || iteration < maxiter; iteration++) {
LOG.incrementProcessed(prog);
int changed;
// The first pass uses a dedicated initial assignment; later passes first
// refresh sep and cdist from the current means.
if (iteration == 0) {
changed = initialAssignToNearestCluster(relation, means, sums, clusters, assignment, upper, lower);
} else {
// #1
recomputeSeperation(means, sep, cdist);
changed = assignToNearestCluster(relation, means, sums, clusters, assignment, sep, cdist, upper, lower);
}
if (rstat != null) {
rstat.setLong(changed);
LOG.statistics(rstat);
}
// Stop if no cluster assignment changed.
if (changed == 0) {
break;
}
// Recompute means.
// Scale each sum in place by 1/size; an empty cluster is scaled by 1,
// i.e. left untouched.
for (int i = 0; i < k; i++) {
final int s = clusters.get(i).size();
timesEquals(sums[i], s > 0 ? 1. / s : 1.);
}
// Overwrites sep
// NOTE(review): from here on sep is handed to updateBounds, so it presumably
// holds how far each mean moved rather than the separation - confirm in
// maxMoved.
maxMoved(means, sums, sep);
updateBounds(relation, assignment, upper, lower, sep);
for (int i = 0; i < k; i++) {
final int s = clusters.get(i).size();
// sums[i] currently holds the new mean; store it as the current mean.
System.arraycopy(sums[i], 0, means[i], 0, dim);
// Restore to sum for next iteration
timesEquals(sums[i], s > 0 ? s : 1.);
}
}
LOG.setCompleted(prog);
if (LOG.isStatistics()) {
// Value of the loop counter when the loop ended.
LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
}
// The bound stores are not needed for building the result.
upper.destroy();
lower.destroy();
// Wrap result
double totalvariance = 0.;
Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
for (int i = 0; i < clusters.size(); i++) {
DBIDs ids = clusters.get(i);
// Empty clusters are not reported.
if (ids.size() == 0) {
continue;
}
double[] mean = means[i];
// Stays 0 when varstat is disabled.
double varsum = 0.;
if (varstat) {
// Sum of the distances (as computed by distanceFunction) of the members
// to the cluster mean.
DoubleVector mvec = DoubleVector.wrap(mean);
for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
varsum += distanceFunction.distance(mvec, relation.get(it));
}
totalvariance += varsum;
}
KMeansModel model = new KMeansModel(mean, varsum);
result.addToplevelCluster(new Cluster<>(ids, model));
}
if (LOG.isStatistics() && varstat) {
LOG.statistics(new DoubleStatistic(this.getClass().getName() + ".variance-sum", totalvariance));
}
return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class KMeansHybridLloydMacQueen method run.
/**
 * Run k-means, alternating a MacQueen pass and a Lloyd pass.
 *
 * Every pass counts as one iteration. The iteration limit is checked before
 * each pass (not only before each MacQueen/Lloyd pair), so that at most
 * {@code maxiter} passes are executed, also for odd values of
 * {@code maxiter}. A value of {@code maxiter <= 0} disables the limit. The
 * logged {@code .iterations} statistic is the number of passes that changed
 * the assignment.
 *
 * @param database Database, passed on to the initializer
 * @param relation Relation to cluster
 * @return Clustering result; empty if the relation contains no objects
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  // Nothing to cluster: return an empty result right away.
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Choose initial means
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // Setup cluster assignment store
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    // Size hint: twice the average cluster size.
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
  int iteration = 0;
  while (maxiter <= 0 || iteration < maxiter) {
    // MacQueen
    LOG.incrementProcessed(prog);
    boolean changed = macQueenIterate(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    if (!changed) {
      break;
    }
    ++iteration;
    // Honor the limit between the two passes, too. Previously the limit was
    // only tested once per pair, which ran maxiter + 1 passes for odd maxiter.
    // NOTE(review): assumes macQueenIterate leaves means consistent with
    // clusters, as it only returns a flag - confirm.
    if (maxiter > 0 && iteration >= maxiter) {
      break;
    }
    // Lloyd
    LOG.incrementProcessed(prog);
    changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    // Stop if no cluster assignment changed.
    if (!changed) {
      break;
    }
    // Recompute means.
    means = means(clusters, means, relation);
    ++iteration;
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int i = 0; i < clusters.size(); i++) {
    DBIDs ids = clusters.get(i);
    // Empty clusters are not reported.
    if (ids.size() == 0) {
      continue;
    }
    KMeansModel model = new KMeansModel(means[i], varsum[i]);
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  return result;
}
Aggregations