Use of de.lmu.ifi.dbs.elki.database.ids.DBIDMIter in project elki by elki-project.
The class FastutilIntOpenHashSetModifiableDBIDs, method retainAll.
@Override
public boolean retainAll(DBIDs set) {
  boolean modified = false;
  for (DBIDMIter it = iter(); it.valid(); it.advance()) {
    if (!set.contains(it)) {
      it.remove();
      modified = true;
    }
  }
  return modified;
}
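This is the core DBIDMIter idiom: scan with valid()/advance() and delete the current id through remove() instead of mutating the collection from outside the loop. A minimal sketch of the same retain-the-intersection pattern as a free-standing helper follows; the class and method names are made up for illustration and are not part of ELKI.

import de.lmu.ifi.dbs.elki.database.ids.DBIDMIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs;

public class RetainAllExample {
  // Keep only those ids of 'set' that also occur in 'keep'; returns true if
  // anything was removed. Removal goes through the DBIDMIter, which is the
  // supported way to modify the id collection while iterating over it.
  static boolean retainAll(HashSetModifiableDBIDs set, DBIDs keep) {
    boolean modified = false;
    for (DBIDMIter it = set.iter(); it.valid(); it.advance()) {
      if (!keep.contains(it)) {
        it.remove();
        modified = true;
      }
    }
    return modified;
  }
}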
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDMIter in project elki by elki-project.
The class CanopyPreClustering, method run.
/**
 * Run the algorithm.
 *
 * @param database Database
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<PrototypeModel<O>> run(Database database, Relation<O> relation) {
  if (!(t1 >= t2)) {
    throw new AbortException("T1 must be at least as large as T2.");
  }
  DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
  ModifiableDBIDs ids = DBIDUtil.newHashSet(relation.getDBIDs());
  ArrayList<Cluster<PrototypeModel<O>>> clusters = new ArrayList<>();
  final int size = relation.size();
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Canopy clustering", size, LOG) : null;
  DBIDVar first = DBIDUtil.newVar();
  while (!ids.isEmpty()) {
    // Remove first element:
    ids.pop(first);
    // Start a new cluster:
    ModifiableDBIDs cids = DBIDUtil.newArray();
    cids.add(first);
    // Compare to remaining objects:
    for (DBIDMIter iter = ids.iter(); iter.valid(); iter.advance()) {
      double dist = dq.distance(first, iter);
      // Inclusion threshold:
      if (dist > t1) {
        continue;
      }
      cids.add(iter);
      // Removal threshold:
      if (dist <= t2) {
        iter.remove();
      }
    }
    // TODO: remember the central object using a CanopyModel?
    // Construct cluster:
    clusters.add(new Cluster<>(cids, new SimplePrototypeModel<>(relation.get(first))));
    if (prog != null) {
      prog.setProcessed(size - ids.size(), LOG);
    }
  }
  LOG.ensureCompleted(prog);
  return new Clustering<>("Canopy clustering", "canopy-clustering", clusters);
}
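Besides the DBIDMIter removal in the inner loop, note how each canopy seed is drawn: the working id set is drained with pop() into a reusable DBIDVar, so the reference stays usable after the id has been removed from the set. A small sketch of that drain loop, for an arbitrary input DBIDs (the class and method names are illustrative only):

import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDVar;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;

public class SeedDrainExample {
  // Visit every id exactly once as a "seed", removing it from the working set,
  // mirroring the while-loop of CanopyPreClustering.run().
  static void drainSeeds(DBIDs all) {
    ModifiableDBIDs ids = DBIDUtil.newHashSet(all);
    DBIDVar seed = DBIDUtil.newVar();
    while (!ids.isEmpty()) {
      // Remove some element from 'ids' and store it in 'seed':
      ids.pop(seed);
      // ... compare 'seed' against the remaining ids here,
      // removing close neighbors through a DBIDMIter ...
    }
  }
}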
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDMIter in project elki by elki-project.
The class SameSizeKMeansAlgorithm, method refineResult.
/**
 * Perform k-means style iterations to improve the clustering result.
 *
 * @param relation Data relation
 * @param means Means list
 * @param clusters Cluster list
 * @param metas Metadata storage
 * @param tids DBIDs array
 * @return final means
 */
protected double[][] refineResult(Relation<V> relation, double[][] means, List<ModifiableDBIDs> clusters, final WritableDataStore<Meta> metas, ArrayModifiableDBIDs tids) {
  NumberVectorDistanceFunction<? super V> df = getDistanceFunction();
  // Our desired cluster size:
  // rounded down
  final int minsize = tids.size() / k;
  // rounded up
  final int maxsize = (tids.size() + k - 1) / k;
  // Comparator: sort by largest gain by transfer
  final Comparator<DBIDRef> comp = new Comparator<DBIDRef>() {
    @Override
    public int compare(DBIDRef o1, DBIDRef o2) {
      Meta c1 = metas.get(o1), c2 = metas.get(o2);
      return Double.compare(c1.priority(), c2.priority());
    }
  };
  // List for sorting cluster preferences
  final int[] preferences = MathUtil.sequence(0, k);
  // Comparator for this list.
  final PreferenceComparator pcomp = new PreferenceComparator();
  // Initialize transfer lists:
  ArrayModifiableDBIDs[] transfers = new ArrayModifiableDBIDs[k];
  for (int i = 0; i < k; i++) {
    transfers[i] = DBIDUtil.newArray();
  }
  DBIDArrayIter id = tids.iter();
  for (int iter = 0; maxiter <= 0 || iter < maxiter; iter++) {
    updateDistances(relation, means, metas, df);
    tids.sort(comp);
    // Track if anything has changed
    int active = 0;
    for (id.seek(0); id.valid(); id.advance()) {
      Meta c = metas.get(id);
      IntegerArrayQuickSort.sort(preferences, pcomp.select(c));
      ModifiableDBIDs source = clusters.get(c.primary);
      assert (source.contains(id));
      tloop: for (int i : preferences) {
        if (i == c.primary) {
          // Already assigned here
          continue;
        }
        ModifiableDBIDs dest = clusters.get(i);
        // Can we pair this transfer?
        final double gain = c.gain(i);
        for (DBIDMIter other = transfers[i].iter(); other.valid(); other.advance()) {
          Meta c2 = metas.get(other);
          if (gain + c2.gain(c.primary) > 0) {
            transfer(metas, c2, dest, source, other, c.primary);
            transfer(metas, c, source, dest, id, i);
            active += 2;
            // last, as this invalidates the reference!
            other.remove();
            // We are assigned here now.
            source = dest;
            // Can try another transfer, with next cluster.
            continue tloop;
          }
        }
        // If cluster sizes allow, move a single object.
        if (gain > 0 && (dest.size() < maxsize && source.size() > minsize)) {
          transfer(metas, c, source, dest, id, i);
          active += 1;
          // We are assigned here now.
          source = dest;
          continue tloop;
        }
      }
      // If the object would prefer a different cluster, add it to the outgoing
      // transfer list.
      if (c.primary != preferences[0] && c.dists[c.primary] > c.dists[preferences[0]]) {
        transfers[c.primary].add(id);
      }
    }
    // TODO: try to get more transfers out of the transfer lists done by
    // considering more than one object?
    int pending = 0;
    // Clear transfer lists for next iteration.
    for (int i = 0; i < k; i++) {
      pending += transfers[i].size();
      transfers[i].clear();
    }
    if (LOG.isDebuggingFine()) {
      LOG.debugFine("Iteration #" + iter + ": performed " + active + " transfers skipped " + pending);
    }
    if (active <= 0) {
      break;
    }
    // Recompute means after reassignment
    means = means(clusters, means, relation);
  }
  return means;
}
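Next to the DBIDMIter scan over the transfer lists, the method relies on sorting an ArrayModifiableDBIDs with a Comparator<DBIDRef> (tids.sort(comp)). The following sketch isolates that sorting idiom; the class name and the use of a DoubleDataStore as the sort key are assumptions for illustration, whereas refineResult above sorts by the priority of its Meta records.

import java.util.Comparator;

import de.lmu.ifi.dbs.elki.database.datastore.DoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;

public class SortByScoreExample {
  // Sort the ids in place so that the smallest score comes first, the same
  // pattern as tids.sort(comp) in refineResult().
  static void sortByScore(ArrayModifiableDBIDs ids, final DoubleDataStore scores) {
    ids.sort(new Comparator<DBIDRef>() {
      @Override
      public int compare(DBIDRef o1, DBIDRef o2) {
        return Double.compare(scores.doubleValue(o1), scores.doubleValue(o2));
      }
    });
  }
}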
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDMIter in project elki by elki-project.
The class P3C, method findOutliers.
/**
 * Performs outlier detection by testing the Mahalanobis distance of each
 * point in a cluster against the critical value of the ChiSquared
 * distribution with as many degrees of freedom as the cluster has relevant
 * attributes.
 *
 * @param relation Data relation
 * @param models Cluster models
 * @param clusterCandidates the list of clusters to check.
 * @param noise the set to which to add points deemed outliers.
 */
private void findOutliers(Relation<V> relation, List<MultivariateGaussianModel> models, ArrayList<ClusterCandidate> clusterCandidates, ModifiableDBIDs noise) {
  Iterator<MultivariateGaussianModel> it = models.iterator();
  for (int c = 0; it.hasNext(); c++) {
    MultivariateGaussianModel model = it.next();
    final ClusterCandidate candidate = clusterCandidates.get(c);
    final int dof = BitsUtil.cardinality(candidate.dimensions);
    final double threshold = ChiSquaredDistribution.quantile(1 - alpha, dof);
    for (DBIDMIter iter = candidate.ids.iter(); iter.valid(); iter.advance()) {
      final double distance = model.mahalanobisDistance(relation.get(iter));
      if (distance >= threshold) {
        // Outlier, remove it and add it to the outlier set.
        noise.add(iter);
        iter.remove();
      }
    }
  }
}
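The threshold used here is the critical value of the chi-squared distribution described in the javadoc. A small, self-contained sketch of just that quantile computation follows; the class and method names are illustrative, and the import path is assumed to be the usual ELKI location of ChiSquaredDistribution, with alpha and dof corresponding to the field and variable used in findOutliers above.

import de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution;

public class CriticalValueExample {
  // Critical value of the ChiSquared distribution with 'dof' degrees of
  // freedom at significance level 'alpha': points whose Mahalanobis distance
  // reaches this value are moved to the noise set in findOutliers().
  static double criticalValue(double alpha, int dof) {
    return ChiSquaredDistribution.quantile(1 - alpha, dof);
  }
}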