use of de.lmu.ifi.dbs.elki.database.ids.DBIDVar in project elki by elki-project.
the class CTLuGLSBackwardSearchAlgorithm method singleIteration.
/**
 * Run a single iteration of the GLS-SOD modeling step.
 * <p>
 * Builds the design matrix X (a second-order polynomial of the two
 * coordinates of {@code relationx}), the attribute matrix Y and the
 * neighborhood matrix F, estimates the regression coefficients, and returns
 * the object whose standardized residual row has the largest squared sum.
 *
 * @param relationx Geo relation (asserted to be two-dimensional)
 * @param relationy Attribute relation
 * @return Top outlier and associated score (the square root of the largest
 *         squared row sum of the standardized residual matrix)
 */
private Pair<DBIDVar, Double> singleIteration(Relation<V> relationx, Relation<? extends NumberVector> relationy) {
  final int dim = RelationUtil.dimensionality(relationx);
  final int dimy = RelationUtil.dimensionality(relationy);
  assert (dim == 2);
  KNNQuery<V> knnQuery = QueryUtil.getKNNQuery(relationx, getDistanceFunction(), k + 1);

  // We need stable indexed DBIDs
  ArrayModifiableDBIDs ids = DBIDUtil.newArray(relationx.getDBIDs());
  // Sort, so we can do a binary search below.
  ids.sort();

  // init X, F, Y
  double[][] X = new double[ids.size()][6];
  double[][] F = new double[ids.size()][ids.size()];
  double[][] Y = new double[ids.size()][dimy];

  {
    int i = 0;
    for (DBIDIter id = ids.iter(); id.valid(); id.advance(), i++) {
      // Fill the data matrix: constant, linear, mixed and quadratic terms of
      // the two coordinates.
      {
        V vec = relationx.get(id);
        double la = vec.doubleValue(0);
        double lo = vec.doubleValue(1);
        X[i][0] = 1.0;
        X[i][1] = la;
        X[i][2] = lo;
        X[i][3] = la * lo;
        X[i][4] = la * la;
        X[i][5] = lo * lo;
      }

      // Copy the attribute values of this object into row i of Y.
      {
        final NumberVector vecy = relationy.get(id);
        for (int d = 0; d < dimy; d++) {
          double idy = vecy.doubleValue(d);
          Y[i][d] = idy;
        }
      }

      // Fill the neighborhood matrix F:
      {
        // k + 1 neighbors are requested because the query object itself is
        // skipped when it shows up in its own result.
        KNNList neighbors = knnQuery.getKNNForDBID(id, k + 1);
        ModifiableDBIDs neighborhood = DBIDUtil.newArray(neighbors.size());
        for (DBIDIter neighbor = neighbors.iter(); neighbor.valid(); neighbor.advance()) {
          if (DBIDUtil.equal(id, neighbor)) {
            continue;
          }
          neighborhood.add(neighbor);
        }
        // Weight object itself positively.
        F[i][i] = 1.0;
        // Neighbors share a total weight of -1. This has to be a floating
        // point division: the integer expression -1 / size truncates to 0
        // for every neighborhood with more than one member, and throws an
        // ArithmeticException for an empty one. With an empty neighborhood
        // the (infinite) value below is never written, as the loop is empty.
        final double nweight = -1. / neighborhood.size();
        // We need the index positions of the neighbors in the sorted id
        // array, hence the binary search.
        for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
          int pos = ids.binarySearch(iter);
          assert (pos >= 0);
          F[pos][i] = nweight;
        }
      }
    }
  }

  // Estimate the parameter beta
  // Common term that we can save recomputing.
  double[][] common = times(transposeTimesTranspose(X, F), F);
  double[][] b = times(inverse(times(common, X)), times(common, Y));

  // Estimate sigma_0 and sigma:
  // sigma_sum_square = sigma_0*sigma_0 + sigma*sigma
  double[][] sigmaMat = times(F, minusEquals(times(X, b), times(F, Y)));
  final double sigma_sum_square = normF(sigmaMat) / (relationx.size() - 6 - 1);
  final double norm = 1 / FastMath.sqrt(sigma_sum_square);

  // calculate the absolute values of standard residuals
  double[][] E = timesEquals(times(F, minus(Y, times(X, b))), norm);

  // Find the object with the largest squared residual row sum.
  DBIDVar worstid = DBIDUtil.newVar();
  double worstscore = Double.NEGATIVE_INFINITY;
  int i = 0;
  for (DBIDIter id = ids.iter(); id.valid(); id.advance(), i++) {
    double err = squareSum(getRow(E, i));
    if (err > worstscore) {
      worstscore = err;
      worstid.set(id);
    }
  }
  return new Pair<>(worstid, FastMath.sqrt(worstscore));
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDVar in project elki by elki-project.
the class CanopyPreClustering method run.
/**
 * Perform canopy pre-clustering on the given relation.
 * <p>
 * Repeatedly takes one of the remaining objects as canopy center. Every
 * remaining object that is not farther than T1 away joins the canopy, and
 * objects within T2 are additionally taken out of the candidate pool.
 *
 * @param database Database used to obtain the distance query
 * @param relation Relation to process
 * @return Clustering with one cluster per canopy
 */
public Clustering<PrototypeModel<O>> run(Database database, Relation<O> relation) {
  // The negated form is deliberate: it also rejects NaN thresholds.
  if (!(t1 >= t2)) {
    throw new AbortException("T1 must be at least as large as T2.");
  }

  DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
  ModifiableDBIDs remaining = DBIDUtil.newHashSet(relation.getDBIDs());
  ArrayList<Cluster<PrototypeModel<O>>> canopies = new ArrayList<>();
  final int total = relation.size();

  FiniteProgress prog = null;
  if (LOG.isVerbose()) {
    prog = new FiniteProgress("Canopy clustering", total, LOG);
  }

  DBIDVar center = DBIDUtil.newVar();
  while (!ids_isEmpty(remaining)) {
    // Take some remaining object as center of the next canopy.
    remaining.pop(center);
    ModifiableDBIDs canopy = DBIDUtil.newArray();
    canopy.add(center);

    // Check all other remaining objects against the center.
    for (DBIDMIter cand = remaining.iter(); cand.valid(); cand.advance()) {
      final double d = dq.distance(center, cand);
      // Inclusion threshold (negated comparison keeps NaN handling intact).
      final boolean included = !(d > t1);
      if (included) {
        canopy.add(cand);
        // Removal threshold: close objects may not start or join another
        // canopy.
        if (d <= t2) {
          cand.remove();
        }
      }
    }

    // TODO: remember the central object using a CanopyModel?
    // The prototype of the cluster is the center object itself.
    O prototype = relation.get(center);
    canopies.add(new Cluster<>(canopy, new SimplePrototypeModel<>(prototype)));

    if (prog != null) {
      final int done = total - remaining.size();
      prog.setProcessed(done, LOG);
    }
  }
  LOG.ensureCompleted(prog);
  return new Clustering<>("Canopy clustering", "canopy-clustering", canopies);
}

/**
 * Emptiness test for the candidate pool.
 *
 * @param pool Candidate ids
 * @return {@code true} when no candidate is left
 */
private static boolean ids_isEmpty(ModifiableDBIDs pool) {
  return pool.isEmpty();
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDVar in project elki by elki-project.
the class SNNClustering method expandCluster.
/**
 * DBSCAN-function expandCluster adapted to SNN criterion.
 * <p>
 * Border-Objects become members of the first possible cluster.
 * <p>
 * If the start object has fewer than {@code minpts} SNN neighbors it is
 * recorded as noise. Otherwise the cluster is grown from its neighbors; the
 * resulting cluster is added to the result list when it has at least
 * {@code minpts} members, and its members are recorded as noise otherwise.
 *
 * @param snnInstance shared nearest neighbors
 * @param startObjectID potential seed of a new potential cluster
 * @param objprog the progress object to report about the progress of
 *        clustering (may be {@code null})
 * @param clusprog the progress object to report the number of clusters
 *        found (may be {@code null})
 */
protected void expandCluster(SimilarityQuery<O> snnInstance, DBIDRef startObjectID, FiniteProgress objprog, IndefiniteProgress clusprog) {
  ArrayModifiableDBIDs seeds = findSNNNeighbors(snnInstance, startObjectID);

  // startObject is no core-object
  if (seeds.size() < minpts) {
    noise.add(startObjectID);
    processedIDs.add(startObjectID);
    if (objprog != null) {
      objprog.setProcessed(processedIDs.size(), LOG);
    }
    if (clusprog != null) {
      clusprog.setProcessed(resultList.size(), LOG);
    }
    return;
  }

  // try to expand the cluster
  ModifiableDBIDs currentCluster = DBIDUtil.newArray();
  for (DBIDIter seed = seeds.iter(); seed.valid(); seed.advance()) {
    if (!processedIDs.contains(seed)) {
      currentCluster.add(seed);
      processedIDs.add(seed);
    } else if (noise.contains(seed)) {
      // Former noise becomes a border object of this cluster.
      currentCluster.add(seed);
      noise.remove(seed);
    }
  }

  DBIDVar o = DBIDUtil.newVar();
  while (seeds.size() > 0) {
    seeds.pop(o);
    ArrayModifiableDBIDs neighborhood = findSNNNeighbors(snnInstance, o);

    // Only core objects expand the cluster further.
    if (neighborhood.size() >= minpts) {
      for (DBIDIter iter = neighborhood.iter(); iter.valid(); iter.advance()) {
        boolean inNoise = noise.contains(iter);
        boolean unclassified = !processedIDs.contains(iter);
        if (inNoise || unclassified) {
          // Only objects not seen before are expanded later; former noise
          // objects are merely absorbed.
          if (unclassified) {
            seeds.add(iter);
          }
          currentCluster.add(iter);
          processedIDs.add(iter);
          if (inNoise) {
            noise.remove(iter);
          }
        }
      }
    }

    if (objprog != null) {
      objprog.setProcessed(processedIDs.size(), LOG);
    }
    if (clusprog != null) {
      // Count the cluster in progress as soon as it would be accepted below,
      // i.e. with the same ">= minpts" condition.
      int numClusters = currentCluster.size() >= minpts ? resultList.size() + 1 : resultList.size();
      clusprog.setProcessed(numClusters, LOG);
    }

    // Stop early once every object is processed and nothing is noise.
    if (processedIDs.size() == snnInstance.getRelation().size() && noise.size() == 0) {
      break;
    }
  }

  if (currentCluster.size() >= minpts) {
    resultList.add(currentCluster);
  } else {
    // Too small for a cluster: everything collected so far is noise.
    noise.addDBIDs(currentCluster);
    noise.add(startObjectID);
    processedIDs.add(startObjectID);
  }
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDVar in project elki by elki-project.
the class DBSCAN method expandCluster.
/**
 * DBSCAN-function expandCluster.
 *
 * Starting from one object, collects everything reachable through core
 * objects into a new cluster. Border-Objects become members of the first
 * possible cluster. A start object with fewer than {@code minpts} neighbors
 * is recorded as noise instead.
 *
 * @param relation Database relation to run on
 * @param rangeQuery Range query to use
 * @param startObjectID potential seed of a new potential cluster
 * @param seeds Array to store the current seeds
 * @param objprog Number of objects processed (may be {@code null})
 * @param clusprog Number of clusters found (may be {@code null})
 */
protected void expandCluster(Relation<O> relation, RangeQuery<O> rangeQuery, DBIDRef startObjectID, ArrayModifiableDBIDs seeds, FiniteProgress objprog, IndefiniteProgress clusprog) {
  DoubleDBIDList neighbors = rangeQuery.getRangeForDBID(startObjectID, epsilon);
  final int startCount = neighbors.size();
  ncounter += startCount;

  // Not a core object: record as noise and stop.
  if (startCount < minpts) {
    noise.add(startObjectID);
    processedIDs.add(startObjectID);
    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
    return;
  }

  // The start object founds a new cluster.
  final ModifiableDBIDs members = DBIDUtil.newArray();
  members.add(startObjectID);
  processedIDs.add(startObjectID);

  // The seed array is expected to arrive empty; it is cleared regardless.
  assert (seeds.size() == 0);
  seeds.clear();
  processNeighbors(neighbors.iter(), members, seeds);

  // Work through the seeds; core objects among them contribute their own
  // neighborhoods.
  for (DBIDVar current = DBIDUtil.newVar(); !seeds.isEmpty();) {
    final DoubleDBIDList reachable = rangeQuery.getRangeForDBID(seeds.pop(current), epsilon);
    final int count = reachable.size();
    ncounter += count;
    if (count >= minpts) {
      processNeighbors(reachable.iter(), members, seeds);
    }
    if (objprog != null) {
      objprog.incrementProcessed(LOG);
    }
  }

  resultList.add(members);
  if (clusprog != null) {
    clusprog.setProcessed(resultList.size(), LOG);
  }
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDVar in project elki by elki-project.
the class FarthestSumPointsInitialMeans method chooseInitialMeans.
/**
 * Choose initial means by starting with a random object and then repeatedly
 * picking the object with the largest sum of distances to the means chosen
 * before.
 * <p>
 * When {@code dropfirst} is set, the random starting object only serves to
 * find the first real mean and is discarded afterwards.
 */
@Override
public <T extends NumberVector> double[][] chooseInitialMeans(Database database, Relation<T> relation, int k, NumberVectorDistanceFunction<? super T> distanceFunction) {
  // Get a distance query
  DistanceQuery<T> distQ = database.getDistanceQuery(relation, distanceFunction);
  DBIDs ids = relation.getDBIDs();
  // Accumulated distance sums, initially 0; NaN marks objects already chosen.
  WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);

  // Chose first mean
  List<T> means = new ArrayList<>(k);
  DBIDRef first = DBIDUtil.randomSample(ids, rnd);
  T prevmean = relation.get(first);
  means.add(prevmean);
  if (!dropfirst) {
    // The starting object stays in the result, so it must not remain a
    // candidate: otherwise it can be selected a second time, because its
    // accumulated sum only lacks the (zero) distance to itself.
    store.putDouble(first, Double.NaN);
  }

  // Find farthest object each.
  DBIDVar best = DBIDUtil.newVar(first);
  for (int i = (dropfirst ? 0 : 1); i < k; i++) {
    double maxdist = Double.NEGATIVE_INFINITY;
    for (DBIDIter it = ids.iter(); it.valid(); it.advance()) {
      final double prev = store.doubleValue(it);
      if (prev != prev) {
        // NaN: already chosen!
        continue;
      }
      double dsum = prev + distQ.distance(prevmean, it);
      // Don't store distance to first mean, when it will be dropped below.
      if (i > 0) {
        store.putDouble(it, dsum);
      }
      if (dsum > maxdist) {
        maxdist = dsum;
        best.set(it);
      }
    }
    // Add new mean (and drop the initial mean when desired)
    if (i == 0) {
      // Remove temporary first element.
      means.clear();
    }
    // So it won't be chosen twice.
    store.putDouble(best, Double.NaN);
    prevmean = relation.get(best);
    means.add(prevmean);
  }

  // Explicitly destroy temporary data.
  store.destroy();
  return unboxVectors(means);
}
Aggregations