Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in the project elki by elki-project.
Class NaiveAgglomerativeHierarchicalClustering4, method run.
/**
* Run the algorithm.
*
* First computes all pairwise distances into a flat array that stores the
* lower triangle of the distance matrix row by row. Then performs
* {@code size - 1} merge steps: each step scans the array for the closest
* pair of still-active objects (x, y) with y &lt; x, records the merge of x
* into y, and recomputes the distances of y to all other active objects
* using the configured linkage.
*
* @param db Database
* @param relation Relation
* @return Clustering hierarchy
* @throws AbortException if the relation contains more than 0x10000 objects
*/
public PointerHierarchyRepresentationResult run(Database db, Relation<O> relation) {
DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
final int size = ids.size();
// Refuse inputs that are too large for the int-indexed triangular array.
if (size > 0x10000) {
throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
}
// Informational hint only; the computation below proceeds regardless.
if (Linkage.SINGLE.equals(linkage)) {
LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
}
// Compute the initial (lower triangular) distance matrix.
double[] scratch = new double[triangleSize(size)];
// Three iterators over the same array of ids. They are repositioned with
// seek() throughout the method instead of allocating new iterators.
DBIDArrayIter ix = ids.iter(), iy = ids.iter(), ij = ids.iter();
// Position counter: entries are written row by row as (x, y) with y < x.
// This must agree with the triangleSize(x) + y addressing used by the
// merge loop below (triangleSize is defined outside this method).
int pos = 0;
// Square the distances only for Ward linkage, and only when the distance
// function does not already report squared values.
boolean square = Linkage.WARD.equals(linkage) && !getDistanceFunction().isSquared();
for (int x = 0; ix.valid(); x++, ix.advance()) {
iy.seek(0);
for (int y = 0; y < x; y++, iy.advance()) {
scratch[pos] = dq.distance(ix, iy);
// Ward uses variances -- i.e. squared values
if (square) {
scratch[pos] *= scratch[pos];
}
pos++;
}
}
// Initialize space for result:
WritableDBIDDataStore parent = DataStoreUtil.makeDBIDStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
WritableDoubleDataStore height = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
WritableIntegerDataStore csize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
// Initially every object is its own parent, has size 1 and height
// +infinity. A height of +infinity marks an object that has not yet been
// merged into another one; the loops below skip objects with finite height.
for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
parent.put(it, it);
height.put(it, Double.POSITIVE_INFINITY);
csize.put(it, 1);
}
// Perform size - 1 merges, one per iteration of the outer loop.
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", size - 1, LOG) : null;
for (int i = 1; i < size; i++) {
// Find the smallest distance among all pairs of active objects.
double min = Double.POSITIVE_INFINITY;
int minx = -1, miny = -1;
for (ix.seek(0); ix.valid(); ix.advance()) {
// Skip objects that were already merged away.
if (height.doubleValue(ix) < Double.POSITIVE_INFINITY) {
continue;
}
// Start index of row x within the flat array.
final int xbase = triangleSize(ix.getOffset());
for (iy.seek(0); iy.getOffset() < ix.getOffset(); iy.advance()) {
if (height.doubleValue(iy) < Double.POSITIVE_INFINITY) {
continue;
}
final int idx = xbase + iy.getOffset();
// "<=" means that on ties the pair visited last is chosen.
if (scratch[idx] <= min) {
min = scratch[idx];
minx = ix.getOffset();
miny = iy.getOffset();
}
}
}
// Only checked when assertions are enabled.
assert (minx >= 0 && miny >= 0);
// Avoid allocating memory, by reusing existing iterators:
ix.seek(minx);
iy.seek(miny);
// Perform merge in data structure: x -> y
// Since y < x, prefer keeping y, dropping x.
int sizex = csize.intValue(ix), sizey = csize.intValue(iy);
// Giving x a finite height also marks it as inactive for later iterations.
height.put(ix, min);
parent.put(ix, iy);
csize.put(iy, sizex + sizey);
// Update distance matrix. Note: miny < minx
// Row x is only read from here on; the merged distances are written to
// the entries of y. The three loops differ only in where the entries
// involving j are located in the lower triangle.
final int xbase = triangleSize(minx), ybase = triangleSize(miny);
// Write to (y, j), with j < y
for (ij.seek(0); ij.getOffset() < miny; ij.advance()) {
if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
continue;
}
final int sizej = csize.intValue(ij);
// Arguments: size of x, distance (x, j), size of y, distance (y, j),
// size of j, and the distance at which x and y were merged.
scratch[ybase + ij.getOffset()] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[ybase + ij.getOffset()], sizej, min);
}
// Write to (j, y), with y < j < x
for (ij.seek(miny + 1); ij.getOffset() < minx; ij.advance()) {
if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
continue;
}
final int jbase = triangleSize(ij.getOffset());
final int sizej = csize.intValue(ij);
scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + ij.getOffset()], sizey, scratch[jbase + miny], sizej, min);
}
// Write to (j, y), with y < x < j
for (ij.seek(minx + 1); ij.valid(); ij.advance()) {
if (height.doubleValue(ij) < Double.POSITIVE_INFINITY) {
continue;
}
final int jbase = triangleSize(ij.getOffset());
final int sizej = csize.intValue(ij);
scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
}
LOG.incrementProcessed(prog);
}
LOG.ensureCompleted(prog);
// The last argument passes on whether the distance function reports
// squared values.
return new PointerHierarchyRepresentationResult(ids, parent, height, dq.getDistanceFunction().isSquared());
}
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in the project elki by elki-project.
Class StratifiedCrossValidation, method initialize.
/**
 * Initializes the stratified fold assignment for the given data bundle.
 *
 * Every row is put into a bucket according to its class label. The rows of
 * each bucket are then dealt out to the folds round-robin, so that each class
 * is spread as evenly as possible over the folds. The number of rows assigned
 * to each fold is recorded in {@code sizes}.
 *
 * @param bundle Data bundle to partition
 * @throws AbortException if a row has no class label, or a label that is not
 *         contained in the label list
 */
@Override
public void initialize(MultipleObjectsBundle bundle) {
  super.initialize(bundle);
  fold = 0;

  // One bucket of row indexes per known class label.
  final int numClasses = this.labels.size();
  IntArrayList[] classBuckets = new IntArrayList[numClasses];
  for (int i = 0; i < numClasses; i++) {
    classBuckets[i] = new IntArrayList();
  }

  for (int i = 0, l = bundle.dataLength(); i < l; ++i) {
    ClassLabel label = (ClassLabel) bundle.data(i, labelcol);
    if (label == null) {
      throw new AbortException("Unlabeled instances currently not supported.");
    }
    // NOTE(review): binarySearch requires the label list to be sorted;
    // presumably super.initialize() takes care of this - confirm.
    int classIndex = Collections.binarySearch(labels, label);
    if (classIndex < 0) {
      throw new AbortException("Label not in label list: " + label);
    }
    classBuckets[classIndex].add(i);
  }

  // TODO: shuffle the class buckets?
  sizes = new int[nfold];
  assignment = new int[bundle.dataLength()];
  for (IntArrayList bucket : classBuckets) {
    for (int i = 0; i < bucket.size(); i++) {
      final int target = i % nfold;
      assignment[bucket.getInt(i)] = target;
      // Keep the per-fold counts in sync with the assignment; previously
      // the array was allocated but never filled, leaving all sizes at 0.
      sizes[target]++;
    }
  }
}
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in the project elki by elki-project.
Class BestOfMultipleKMeans, method run.
/**
 * Runs the inner k-means algorithm {@code trials} times and keeps the
 * clustering that the quality measure rates best.
 *
 * @param database Database to run on
 * @param relation Relation to cluster
 * @return Best clustering found, or {@code null} if no candidate was ever
 *         accepted by the quality measure (for example when {@code trials}
 *         is not positive)
 * @throws AbortException if the distance function of the inner algorithm is
 *         not a {@code NumberVectorDistanceFunction}
 */
@Override
public Clustering<M> run(Database database, Relation<V> relation) {
  // Test for exactly the type that is cast to below. The previous check
  // (PrimitiveDistanceFunction) let distance functions through that then
  // failed the cast with a ClassCastException.
  final Object distance = innerkMeans.getDistanceFunction();
  if (!(distance instanceof NumberVectorDistanceFunction)) {
    throw new AbortException("K-Means results can only be evaluated for number vector distance functions, got: " + distance.getClass());
  }
  @SuppressWarnings("unchecked")
  final NumberVectorDistanceFunction<? super NumberVector> df = (NumberVectorDistanceFunction<? super NumberVector>) distance;

  Clustering<M> bestResult = null;
  // NaN marks "no candidate accepted yet"; whether the first candidate is
  // taken is decided by qualityMeasure.isBetter().
  double bestCost = Double.NaN;
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("K-means iterations", trials, LOG) : null;
  for (int i = 0; i < trials; i++) {
    Clustering<M> currentCandidate = innerkMeans.run(database, relation);
    double currentCost = qualityMeasure.quality(currentCandidate, df, relation);
    if (LOG.isVerbose()) {
      LOG.verbose("Cost of candidate " + i + ": " + currentCost);
    }
    if (qualityMeasure.isBetter(currentCost, bestCost)) {
      bestResult = currentCandidate;
      bestCost = currentCost;
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  return bestResult;
}
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in the project elki by elki-project.
Class CLARANS, method run.
/**
 * Run CLARANS clustering.
 *
 * Performs {@code numlocal} restarts. Each restart begins with {@code k}
 * randomly sampled medoids and then repeatedly picks a random non-medoid and
 * a random medoid; the exchange is accepted whenever it lowers the total
 * cost, which also resets the failure counter. A restart ends once the
 * failure counter reaches the retry limit. The cheapest solution over all
 * restarts is returned.
 *
 * @param database Database to query distances from
 * @param relation Relation to cluster
 * @return Clustering with one top-level cluster per medoid; an empty
 *         clustering if the relation is empty
 * @throws AbortException if no non-medoid could be sampled in 1000 attempts
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("CLARANS Clustering", "clarans-clustering");
  }
  if (k * 2 >= relation.size()) {
    // Sampling a random non-medoid gets slow when most points are medoids.
    LOG.warning("A very large k was chosen. This implementation is not optimized for this case.");
  }
  DBIDs ids = relation.getDBIDs();
  DistanceQuery<V> distanceQuery = database.getDistanceQuery(relation, getDistanceFunction());
  final boolean isMetric = getDistanceFunction().isMetric();
  // Retry limit: values below 1 are a rate relative to the data set size,
  // anything else is an absolute count.
  final int retries;
  if (maxneighbor < 1) {
    retries = (int) Math.ceil(maxneighbor * ids.size());
  } else {
    retries = (int) Math.ceil(maxneighbor);
  }
  Random rng = random.getSingleThreadedRandom();
  // ensureArray might copy the ids.
  DBIDArrayIter candidate = DBIDUtil.ensureArray(ids).iter();
  // Three assignment buffers that are rotated by swapping references.
  Assignment incumbent = new Assignment(distanceQuery, ids, DBIDUtil.newArray(k));
  Assignment current = new Assignment(distanceQuery, ids, DBIDUtil.newArray(k));
  Assignment trial = new Assignment(distanceQuery, ids, DBIDUtil.newArray(k));

  // 1. initialize
  double incumbentCost = Double.POSITIVE_INFINITY;
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("CLARANS sampling restarts", numlocal, LOG) : null;
  for (int restart = 0; restart < numlocal; restart++) {
    // 2. choose random initial medoids
    // TODO: should we always use uniform sampling, to be closer to the paper?
    current.medoids.clear();
    current.medoids.addDBIDs(DBIDUtil.randomSample(ids, k, random));
    // Cost of the initial solution:
    double currentCost = current.assignToNearestCluster();
    // 3. Set the failure counter to 1.
    int failures = 1;
    search: while (failures < retries) {
      // 4 part a. choose a random non-medoid (~ neighbor in G):
      int attempts = 0;
      while (true) {
        candidate.seek(rng.nextInt(ids.size()));
        if (current.nearest.doubleValue(candidate) > 0) {
          // Positive distance to the nearest medoid: not a medoid.
          break;
        }
        // Distance 0: a medoid, or a duplicate of one.
        if (isMetric) {
          if (current.second.doubleValue(candidate) == 0) {
            // With a metric, this swap cannot yield an improvement.
            ++failures;
            continue search;
          }
        } else if (!current.medoids.contains(candidate)) {
          // Probably not a good candidate, but try nevertheless.
          break;
        }
        if (attempts >= 1000) {
          throw new AbortException("Failed to choose a non-medoid in 1000 attempts. Choose k << N.");
        }
        // Otherwise this must be a medoid; sample again.
        attempts++;
      }
      // 4 part b. choose a random medoid to replace:
      final int replaced = rng.nextInt(k);
      // 5. check for lower cost
      final double delta = current.computeCostDifferential(candidate, replaced, trial);
      if (delta < 0) {
        // Improvement: adopt the trial solution and reset the counter.
        currentCost += delta;
        Assignment previous = current;
        current = trial;
        trial = previous;
        failures = 1;
      } else {
        // 6. no improvement (or NaN): try again
        ++failures;
      }
    }
    // Keep the result of this restart if it beats the best one so far.
    if (currentCost < incumbentCost) {
      Assignment previous = current;
      current = incumbent;
      incumbent = previous;
      incumbentCost = currentCost;
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);

  ArrayModifiableDBIDs[] partitions = ClusteringAlgorithmUtil.partitionsFromIntegerLabels(ids, incumbent.assignment, k);
  // Wrap result: one cluster per medoid, matched by medoid position.
  Clustering<MedoidModel> clustering = new Clustering<>("CLARANS Clustering", "clarans-clustering");
  DBIDArrayIter medoid = incumbent.medoids.iter();
  while (medoid.valid()) {
    MedoidModel model = new MedoidModel(DBIDUtil.deref(medoid));
    clustering.addToplevelCluster(new Cluster<>(partitions[medoid.getOffset()], model));
    medoid.advance();
  }
  return clustering;
}
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in the project elki by elki-project.
Class PAMInitialMeans, method chooseInitialMedoids.
/**
 * Chooses {@code k} initial medoids greedily.
 *
 * The first medoid is the object with the smallest sum of distances to all
 * objects. Every further medoid is the not-yet-chosen object that minimizes
 * the sum, over all objects, of the smaller of "distance to the candidate"
 * and "distance to the closest medoid chosen so far".
 *
 * @param k Number of medoids to choose
 * @param ids Candidate objects
 * @param distQ Distance query
 * @return The chosen medoids, in the order they were selected
 * @throws AbortException if no candidate with a usable (finite) criterion
 *         value exists, e.g. for an empty data set or when too many
 *         distances are infinite
 */
@Override
public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
  ArrayModifiableDBIDs medids = DBIDUtil.newArray(k);
  DBIDVar bestid = DBIDUtil.newVar();
  // Three temporary stores whose roles are rotated by swapping references:
  // mindist - distance of each object to its closest chosen medoid
  // bestd - the same, for the best candidate of the current round
  // tempd - the same, for the candidate currently being evaluated
  WritableDoubleDataStore mindist = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
  WritableDoubleDataStore bestd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
  WritableDoubleDataStore tempd = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
  try {
    // First medoid is chosen by having the smallest distance sum to all others.
    {
      double best = Double.POSITIVE_INFINITY;
      FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Choosing initial mean", ids.size(), LOG) : null;
      for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        double sum = 0;
        for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
          final double d = distQ.distance(iter, iter2);
          sum += d;
          tempd.putDouble(iter2, d);
        }
        if (sum < best) {
          best = sum;
          bestid.set(iter);
          // Swap mindist and tempd:
          WritableDoubleDataStore temp = mindist;
          mindist = tempd;
          tempd = temp;
        }
        LOG.incrementProcessed(prog);
      }
      LOG.ensureCompleted(prog);
      // Same guard as for the subsequent medoids: never add an unset
      // variable (empty data set, or no finite distance sum at all).
      if (!bestid.isSet()) {
        throw new AbortException("No initial medoid found: the data set is empty, or all distance sums are infinite or NaN.");
      }
      medids.add(bestid);
    }
    // Subsequent medoids optimize the full criterion.
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Choosing initial centers", k, LOG) : null;
    // First one was just chosen.
    LOG.incrementProcessed(prog);
    for (int i = 1; i < k; i++) {
      double best = Double.POSITIVE_INFINITY;
      bestid.unset();
      for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
        if (medids.contains(iter)) {
          continue;
        }
        double sum = 0.;
        for (DBIDIter iter2 = ids.iter(); iter2.valid(); iter2.advance()) {
          final double v = MathUtil.min(distQ.distance(iter, iter2), mindist.doubleValue(iter2));
          sum += v;
          tempd.putDouble(iter2, v);
        }
        if (sum < best) {
          best = sum;
          bestid.set(iter);
          // Swap bestd and tempd:
          WritableDoubleDataStore temp = bestd;
          bestd = tempd;
          tempd = temp;
        }
      }
      if (!bestid.isSet()) {
        throw new AbortException("No median found that improves the criterion function?!? Too many infinite distances.");
      }
      medids.add(bestid);
      // Swap bestd and mindist:
      WritableDoubleDataStore temp = bestd;
      bestd = mindist;
      mindist = temp;
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    return medids;
  } finally {
    // Release the temporary stores on the error paths, too.
    mindist.destroy();
    bestd.destroy();
    tempd.destroy();
  }
}
Aggregations