Use of de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter in project elki by elki-project.
Class EvaluateDBCV, method evaluateClustering.
/**
 * Evaluate a single clustering with the DBCV index.
 *
 * Noise clusters and clusters with fewer than two objects are skipped. For
 * every remaining cluster, the largest relevant edge of a minimum spanning
 * tree over the mutual reachability distances (sparseness) is compared to the
 * smallest mutual reachability distance to any object of another cluster
 * (separation). The per-cluster values are weighted by the cluster size
 * relative to the relation size and summed up. The sum is also registered as
 * a measure on the evaluation result of the clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param cl Clustering
 *
 * @return dbcv DBCV-index
 */
public double evaluateClustering(Database db, Relation<O> rel, Clustering<?> cl) {
  final DistanceQuery<O> dq = rel.getDistanceQuery(distanceFunction);
  List<? extends Cluster<?>> clusters = cl.getAllClusters();
  final int numc = clusters.size();

  // DBCV needs a "dimensionality".
  @SuppressWarnings("unchecked")
  final Relation<? extends SpatialComparable> vrel = (Relation<? extends SpatialComparable>) rel;
  final int dim = RelationUtil.dimensionality(vrel);

  // precompute all core distances
  ArrayDBIDs[] cids = new ArrayDBIDs[numc];
  double[][] coreDists = new double[numc][];
  for (int c = 0; c < numc; c++) {
    Cluster<?> cluster = clusters.get(c);
    // Singletons are considered as Noise, because they have no sparseness
    if (cluster.isNoise() || cluster.size() < 2) {
      coreDists[c] = null;
      continue;
    }
    // Store for use below:
    ArrayDBIDs ids = cids[c] = DBIDUtil.ensureArray(cluster.getIDs());
    double[] clusterCoreDists = coreDists[c] = new double[ids.size()];
    for (DBIDArrayIter it = ids.iter(), it2 = ids.iter(); it.valid(); it.advance()) {
      double currentCoreDist = 0;
      int neighbors = 0;
      for (it2.seek(0); it2.valid(); it2.advance()) {
        if (DBIDUtil.equal(it, it2)) {
          continue;
        }
        double dist = dq.distance(it, it2);
        // A zero distance would cause a division by zero below.
        // We ignore such objects.
        if (dist > 0) {
          currentCoreDist += MathUtil.powi(1. / dist, dim);
          ++neighbors;
        }
      }
      // Average, and undo power. If all other members are at distance 0,
      // there is nothing to average (0/0 would yield NaN and poison the
      // entire index), so we use a core distance of 0 instead.
      clusterCoreDists[it.getOffset()] = neighbors > 0 ? FastMath.pow(currentCoreDist / neighbors, -1. / dim) : 0.;
    }
  }

  // compute density sparseness of all clusters
  int[][] clusterDegrees = new int[numc][];
  double[] clusterDscMax = new double[numc];
  // describes if a cluster contains any internal edges
  boolean[] internalEdges = new boolean[numc];
  for (int c = 0; c < numc; c++) {
    Cluster<?> cluster = clusters.get(c);
    if (cluster.isNoise() || cluster.size() < 2) {
      clusterDegrees[c] = null;
      clusterDscMax[c] = Double.NaN;
      continue;
    }
    double[] clusterCoreDists = coreDists[c];
    ArrayDBIDs ids = cids[c];
    // Density Sparseness of the Cluster
    double dscMax = 0;
    double[][] distances = new double[cluster.size()][cluster.size()];

    // create the symmetric mutual reachability distance matrix for the
    // Minimum Spanning Tree (the diagonal remains 0)
    for (DBIDArrayIter it = ids.iter(), it2 = ids.iter(); it.valid(); it.advance()) {
      double currentCoreDist = clusterCoreDists[it.getOffset()];
      for (it2.seek(it.getOffset() + 1); it2.valid(); it2.advance()) {
        double mutualReachDist = MathUtil.max(currentCoreDist, clusterCoreDists[it2.getOffset()], dq.distance(it, it2));
        distances[it.getOffset()][it2.getOffset()] = mutualReachDist;
        distances[it2.getOffset()][it.getOffset()] = mutualReachDist;
      }
    }

    // generate Minimum Spanning Tree; consecutive entries (2i, 2i+1) are
    // used below as the two end points of one edge
    int[] nodes = PrimsMinimumSpanningTree.processDense(distances);

    // get degree of all nodes in the spanning tree
    int[] degree = new int[cluster.size()];
    for (int i = 0; i < nodes.length; i++) {
      degree[nodes[i]]++;
    }
    // check if cluster contains any internal edges, i.e., edges where both
    // end points have a degree larger than 1
    for (int i = 0; i < nodes.length; i += 2) {
      if (degree[nodes[i]] > 1 && degree[nodes[i + 1]] > 1) {
        internalEdges[c] = true;
      }
    }
    clusterDegrees[c] = degree;

    // find maximum sparseness in the Minimum Spanning Tree
    for (int i = 0; i < nodes.length; i = i + 2) {
      final int n1 = nodes[i], n2 = nodes[i + 1];
      // If a cluster has no internal nodes we consider all edges.
      if (distances[n1][n2] > dscMax && (!internalEdges[c] || (degree[n1] > 1 && degree[n2] > 1))) {
        dscMax = distances[n1][n2];
      }
    }
    clusterDscMax[c] = dscMax;
  }

  // compute density separation of all clusters
  double dbcv = 0;
  for (int c = 0; c < numc; c++) {
    Cluster<?> cluster = clusters.get(c);
    if (cluster.isNoise() || cluster.size() < 2) {
      continue;
    }
    double currentDscMax = clusterDscMax[c];
    double[] clusterCoreDists = coreDists[c];
    int[] currentDegree = clusterDegrees[c];

    // minimal Density Separation of the Cluster
    double dspcMin = Double.POSITIVE_INFINITY;
    for (DBIDArrayIter it = cids[c].iter(); it.valid(); it.advance()) {
      // If the cluster has internal edges, skip nodes of degree below 2
      // (the leaves of the spanning tree).
      if (currentDegree[it.getOffset()] < 2 && internalEdges[c]) {
        continue;
      }
      double currentCoreDist = clusterCoreDists[it.getOffset()];
      for (int oc = 0; oc < numc; oc++) {
        Cluster<?> ocluster = clusters.get(oc);
        if (ocluster.isNoise() || ocluster.size() < 2 || cluster == ocluster) {
          continue;
        }
        int[] oDegree = clusterDegrees[oc];
        double[] oclusterCoreDists = coreDists[oc];
        for (DBIDArrayIter it2 = cids[oc].iter(); it2.valid(); it2.advance()) {
          // Same leaf rule for the other cluster.
          if (oDegree[it2.getOffset()] < 2 && internalEdges[oc]) {
            continue;
          }
          double mutualReachDist = MathUtil.max(currentCoreDist, oclusterCoreDists[it2.getOffset()], dq.distance(it, it2));
          dspcMin = mutualReachDist < dspcMin ? mutualReachDist : dspcMin;
        }
      }
    }

    // compute DBCV: validity of this cluster, weighted by the share of the
    // relation it covers (skipped clusters therefore contribute nothing)
    double vc = (dspcMin - currentDscMax) / MathUtil.max(dspcMin, currentDscMax);
    double weight = cluster.size() / (double) rel.size();
    dbcv += weight * vc;
  }

  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), cl, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  // Each cluster validity is a difference divided by the larger of its two
  // non-negative terms, so the weighted sum lies in [-1, 1]; larger values
  // mean the separation exceeds the sparseness, i.e., larger is better.
  // NOTE(review): assumes the argument order (name, value, min, max,
  // expected, lowerIsBetter) - confirm against MeasurementGroup.
  g.addMeasure("Density Based Clustering Validation", dbcv, -1., 1., 0., false);
  db.getHierarchy().resultChanged(ev);
  return dbcv;
}
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter in project elki by elki-project.
Class EvaluateSilhouette, method evaluateClustering.
/**
 * Evaluate a single clustering.
 *
 * For every object, the average distance to the other members of its own
 * cluster (a) is compared to the smallest average distance to any other
 * cluster (b). Noise clusters and clusters with at most one object are
 * handled according to the configured noise option. The mean over all
 * objects is registered as a measure on the evaluation result of the
 * clustering, and logged as statistics if enabled.
 *
 * @param db Database
 * @param rel Data relation
 * @param dq Distance query
 * @param c Clustering
 * @return Average silhouette
 */
public double evaluateClustering(Database db, Relation<O> rel, DistanceQuery<O> dq, Clustering<?> c) {
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  MeanVariance msil = new MeanVariance();
  int ignorednoise = 0;
  for (Cluster<?> cluster : clusters) {
    // Note: we treat 1-element clusters the same as noise.
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        ignorednoise += cluster.size();
        // Ignore noise elements
        continue;
      case TREAT_NOISE_AS_SINGLETONS:
        // As suggested in Rousseeuw, we use 0 for singletons.
        msil.put(0., cluster.size());
        continue;
      case MERGE_NOISE:
        // Treat as cluster below
        break;
      }
    }
    ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs());
    // temporary storage: partial in-cluster distance sums, filled while
    // processing earlier objects so that each pair is computed only once.
    double[] as = new double[ids.size()];
    DBIDArrayIter it1 = ids.iter(), it2 = ids.iter();
    for (it1.seek(0); it1.valid(); it1.advance()) {
      // a: In-cluster distances
      // Already computed distances
      double a = as[it1.getOffset()];
      for (it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) {
        final double dist = dq.distance(it1, it2);
        a += dist;
        as[it2.getOffset()] += dist;
      }
      // For a one-element cluster (reachable with MERGE_NOISE) this is 0/0,
      // i.e., NaN; this case is handled when storing the value below.
      a /= (ids.size() - 1);
      // b: minimum average distance to other clusters:
      double b = Double.POSITIVE_INFINITY;
      for (Cluster<?> ocluster : clusters) {
        if (ocluster == /* yes, reference identity */ cluster) {
          // Same cluster
          continue;
        }
        if (ocluster.size() <= 1 || ocluster.isNoise()) {
          switch(noiseOption) {
          case IGNORE_NOISE:
            // Ignore noise elements
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            // Treat noise cluster as singletons:
            for (DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) {
              final double dist = dq.distance(it1, it3);
              // Minimum average
              b = dist < b ? dist : b;
            }
            continue;
          case MERGE_NOISE:
            // Treat as cluster below
            break;
          }
        }
        final DBIDs oids = ocluster.getIDs();
        double btmp = 0.;
        for (DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) {
          btmp += dq.distance(it1, it3);
        }
        // Average
        btmp /= oids.size();
        // Minimum average
        b = btmp < b ? btmp : b;
      }
      // One cluster only?
      b = b < Double.POSITIVE_INFINITY ? b : a;
      // Guard against 0/0: if a and b are both 0 (duplicate objects), or a
      // is NaN (one-element cluster), the silhouette is defined as 0 rather
      // than letting a NaN poison the mean.
      final double denom = b > a ? b : a;
      msil.put(denom > 0. ? (b - a) / denom : 0.);
    }
  }
  double penalty = 1.;
  // Only if {@link NoiseHandling#IGNORE_NOISE}:
  if (penalize && ignorednoise > 0) {
    penalty = (rel.size() - ignorednoise) / (double) rel.size();
  }
  final double meansil = penalty * msil.getMean();
  final double stdsil = penalty * msil.getSampleStddev();
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".silhouette.noise-handling", noiseOption.toString()));
    if (ignorednoise > 0) {
      LOG.statistics(new LongStatistic(key + ".silhouette.noise", ignorednoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".silhouette.mean", meansil));
    LOG.statistics(new DoubleStatistic(key + ".silhouette.stddev", stdsil));
  }
  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  g.addMeasure("Silhouette +-" + FormatUtil.NF2.format(stdsil), meansil, -1., 1., 0., false);
  db.getHierarchy().resultChanged(ev);
  return meansil;
}
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter in project elki by elki-project.
Class PrecomputedSimilarityMatrix, method initialize.
/**
 * Precompute the similarities of all pairs (x, y) with y before x in the ID
 * range, and store them row by row in a flat array.
 *
 * Aborts if the relation is not backed by a DBID range, or if it contains
 * more than 65536 objects.
 */
@Override
public void initialize() {
  final DBIDs relationIds = relation.getDBIDs();
  if (!(relationIds instanceof DBIDRange)) {
    throw new AbortException("Similarity matrixes are currently only supported for DBID ranges (as used by static databases) for performance reasons (Patches welcome).");
  }
  ids = (DBIDRange) relationIds;
  size = ids.size();
  if (size > 65536) {
    throw new AbortException("Similarity matrixes currently have a limit of 65536 objects (~16 GB). After this, the array size exceeds the Java integer range, and a different data structure needs to be used.");
  }

  similarityQuery = similarityFunction.instantiate(relation);
  final int entries = triangleSize(size);
  matrix = new double[entries];

  final DBIDArrayIter row = ids.iter(), col = ids.iter();
  final FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Precomputing similarity matrix", entries, LOG) : null;
  int cursor = 0;
  for (row.seek(0); row.valid(); row.advance()) {
    final int rowOffset = row.getOffset();
    // Only columns before the row -- must match {@link #getOffset}!
    for (col.seek(0); col.getOffset() < rowOffset; col.advance()) {
      matrix[cursor++] = similarityQuery.similarity(row, col);
    }
    if (progress != null) {
      // This row contributed exactly rowOffset entries.
      progress.setProcessed(progress.getProcessed() + rowOffset, LOG);
    }
  }
  LOG.ensureCompleted(progress);
}
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter in project elki by elki-project.
Class PrecomputeDistancesAsciiApplication, method run.
/**
 * Compute the distances of all pairs of objects (each pair once, first
 * offset smaller than second offset) and write them to the output stream,
 * one tab-separated line "offset1 offset2 distance" per pair.
 *
 * Aborts if the output cannot be opened or written.
 */
@Override
public void run() {
  database.initialize();
  Relation<O> relation = database.getRelation(distance.getInputTypeRestriction());
  DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance);
  DBIDRange ids = DBIDUtil.assertRange(relation.getDBIDs());
  final int size = ids.size();
  // Total number of pairs: size * (size - 1) / 2
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Precomputing distances", (int) (((size - 1) * (long) size) >>> 1), LOG) : null;
  try (PrintStream fout = openStream(out)) {
    DBIDArrayIter id1 = ids.iter(), id2 = ids.iter();
    for (; id1.valid(); id1.advance()) {
      String idstr1 = Integer.toString(id1.getOffset());
      if (debugExtraCheckSymmetry && distanceQuery.distance(id1, id1) != 0.) {
        LOG.warning("Distance function doesn't satisfy d(0,0) = 0.");
      }
      for (id2.seek(id1.getOffset() + 1); id2.valid(); id2.advance()) {
        double d = distanceQuery.distance(id1, id2);
        if (debugExtraCheckSymmetry) {
          double d2 = distanceQuery.distance(id2, id1);
          if (Math.abs(d - d2) > 0.0000001) {
            LOG.warning("Distance function doesn't appear to be symmetric!");
          }
        }
        fout.append(idstr1).append('\t').append(Integer.toString(id2.getOffset())) //
            .append('\t').append(Double.toString(d)).append('\n');
      }
      if (prog != null) {
        // This row contributed the pairs with all later objects.
        prog.setProcessed(prog.getProcessed() + (size - id1.getOffset() - 1), LOG);
      }
    }
    // PrintStream never throws IOException on write; it only sets an
    // internal error flag. Check it, so that a failed write (e.g., disk
    // full) does not silently leave a truncated file behind.
    if (fout.checkError()) {
      throw new AbortException("Could not write to output file.");
    }
  } catch (IOException e) {
    throw new AbortException("Could not write to output file.", e);
  }
  LOG.ensureCompleted(prog);
}
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter in project elki by elki-project.
Class CacheFloatDistanceInOnDiskMatrix, method run.
/**
 * Compute the distances of all pairs of objects (including each object to
 * itself) and store them as floats in an on-disk upper triangle matrix.
 *
 * Aborts if the matrix cannot be created or a record cannot be written.
 */
@Override
public void run() {
  database.initialize();
  Relation<O> relation = database.getRelation(distance.getInputTypeRestriction());
  DistanceQuery<O> distanceQuery = database.getDistanceQuery(relation, distance);
  DBIDRange ids = DBIDUtil.assertRange(relation.getDBIDs());
  int size = ids.size();
  // Total number of records, diagonal included: size * (size + 1) / 2
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Precomputing distances", (int) (((size + 1) * (long) size) >>> 1), LOG) : null;
  try (OnDiskUpperTriangleMatrix matrix = new OnDiskUpperTriangleMatrix(out, DiskCacheBasedFloatDistanceFunction.FLOAT_CACHE_MAGIC, 0, ByteArrayUtil.SIZE_FLOAT, size)) {
    DBIDArrayIter id1 = ids.iter(), id2 = ids.iter();
    for (; id1.valid(); id1.advance()) {
      // Start at the diagonal: the distance of an object to itself is
      // stored, too.
      for (id2.seek(id1.getOffset()); id2.valid(); id2.advance()) {
        float d = (float) distanceQuery.distance(id1, id2);
        if (debugExtraCheckSymmetry) {
          float d2 = (float) distanceQuery.distance(id2, id1);
          if (Math.abs(d - d2) > 0.0000001) {
            LOG.warning("Distance function doesn't appear to be symmetric!");
          }
        }
        try {
          matrix.getRecordBuffer(id1.getOffset(), id2.getOffset()).putFloat(d);
        } catch (IOException e) {
          throw new AbortException("Error writing distance record " + DBIDUtil.toString(id1) + "," + DBIDUtil.toString(id2) + " to matrix.", e);
        }
      }
      if (prog != null) {
        // This row contributed the diagonal plus all later objects.
        prog.setProcessed(prog.getProcessed() + (size - id1.getOffset()), LOG);
      }
    }
  } catch (IOException e) {
    throw new AbortException("Error precomputing distance matrix.", e);
  }
  // The progress is only created when logging is verbose; calling it
  // unconditionally would throw a NullPointerException otherwise.
  if (prog != null) {
    prog.ensureCompleted(LOG);
  }
}
Aggregations