Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class KMeansMinusMinus, method run.
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
if (relation.size() <= 0) {
return new Clustering<>("k-Means Clustering", "kmeans-clustering");
}
// Choose initial means
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
}
// Initialize the means
double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
// Initialize the heap
final int heapsize = (int) (rate < 1. ? Math.ceil(relation.size() * rate) : rate);
DoubleMinHeap minHeap = new DoubleMinHeap(heapsize);
// Setup cluster assignment store
List<ModifiableDoubleDBIDList> clusters = new ArrayList<>();
for (int i = 0; i < k; i++) {
clusters.add(DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
}
WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
double[] varsum = new double[k];
IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
DoubleStatistic varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
// Otherwise, the vartotal break below will fail!
assert (varstat != null);
int iteration = 0;
double prevvartotal = Double.POSITIVE_INFINITY;
for (; maxiter <= 0 || iteration < maxiter; iteration++) {
minHeap.clear();
for (int i = 0; i < k; i++) {
clusters.get(i).clear();
}
LOG.incrementProcessed(prog);
boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, minHeap, heapsize);
double vartotal = logVarstat(varstat, varsum);
// Stop if no assignment changed, or if the variance sum increased
// compared to the previous value.
if (!changed || vartotal > prevvartotal) {
break;
}
prevvartotal = vartotal;
// Recompute means.
means = meansWithTreshhold(clusters, means, relation, heapsize > 0 ? minHeap.peek() : Double.POSITIVE_INFINITY);
}
// Create the noise cluster, if requested
ModifiableDoubleDBIDList noiseids = null;
if (noiseFlag && heapsize > 0) {
clusters.add(noiseids = DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
double tresh = minHeap.peek();
for (int i = 0; i < k; i++) {
for (DoubleDBIDListMIter it = clusters.get(i).iter(); it.valid(); it.advance()) {
final double dist = it.doubleValue();
// Add to the noise cluster:
if (dist >= tresh) {
noiseids.add(dist, it);
assignment.putInt(it, k);
it.remove();
}
}
}
}
LOG.setCompleted(prog);
if (LOG.isStatistics()) {
LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
}
// Wrap result
Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
for (int i = 0; i < k; i++) {
DBIDs ids = clusters.get(i);
if (ids.size() == 0) {
continue;
}
KMeansModel model = new KMeansModel(means[i], varsum[i]);
result.addToplevelCluster(new Cluster<>(ids, model));
}
// Noise cluster (noiseids is null when heapsize == 0):
if (noiseFlag && noiseids != null) {
KMeansModel model = new KMeansModel(null, 0);
DBIDs ids = noiseids;
if (ids.size() == 0) {
return result;
}
result.addToplevelCluster(new Cluster<>(ids, true, model));
}
return result;
}
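The noise handling above hinges on the bounded min-heap: it retains only the heapsize largest point-to-centroid distances, so its root (minHeap.peek()) is the smallest of those and serves as the noise threshold tresh. A minimal standalone sketch of that selection, using java.util.PriorityQueue instead of ELKI's DoubleMinHeap (class and method names here are illustrative, not ELKI API):

import java.util.PriorityQueue;

public class NoiseThresholdSketch {
  /** Smallest of the r largest values; assumes 0 < r <= distances.length. */
  static double cutoff(double[] distances, int r) {
    // Min-heap holding the r largest distances seen so far.
    PriorityQueue<Double> heap = new PriorityQueue<>(r);
    for (double d : distances) {
      if (heap.size() < r) {
        heap.add(d);
      } else if (d > heap.peek()) {
        heap.poll(); // evict the smallest of the current top r
        heap.add(d);
      }
    }
    return heap.peek(); // the noise threshold
  }

  public static void main(String[] args) {
    double[] dists = { 0.5, 3.2, 1.1, 7.8, 2.4, 9.0 };
    System.out.println(cutoff(dists, 2)); // 7.8: only 7.8 and 9.0 exceed the cutoff
  }
}

KMeansMinusMinus then treats every distance at or above this cutoff as noise, which is what the loop over the clusters above does.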
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class LogClusterSizes, method logClusterSizes.
/**
* Log the cluster sizes of a clustering.
*
* @param c Clustering to analyze
*/
public static <C extends Model> void logClusterSizes(Clustering<C> c) {
if (!LOG.isStatistics()) {
return;
}
final List<Cluster<C>> clusters = c.getAllClusters();
final int numc = clusters.size();
LOG.statistics(new StringStatistic(PREFIX + "name", c.getLongName()));
LOG.statistics(new LongStatistic(PREFIX + "clusters", numc));
Hierarchy<Cluster<C>> h = c.getClusterHierarchy();
int cnum = 0;
for (Cluster<C> clu : clusters) {
final String p = PREFIX + "cluster-" + cnum + ".";
if (clu.getName() != null) {
LOG.statistics(new StringStatistic(p + "name", clu.getName()));
}
LOG.statistics(new LongStatistic(p + "size", clu.size()));
if (clu.isNoise()) {
LOG.statistics(new StringStatistic(p + "noise", "true"));
}
if (h.numChildren(clu) > 0) {
// TODO: this only works if we have cluster names!
StringBuilder buf = new StringBuilder();
for (It<Cluster<C>> it = h.iterChildren(clu); it.valid(); it.advance()) {
if (buf.length() > 0) {
buf.append(", ");
}
buf.append(it.get().getName());
}
LOG.statistics(new StringStatistic(p + "children", buf.toString()));
}
// TODO: also log parents?
++cnum;
}
}
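Each statistic above is a typed key/value pair: StringStatistic for text, LongStatistic for counts, with a shared key prefix per cluster. A minimal sketch of the same prefix-keyed pattern on plain java.util.logging (an illustrative stand-in, not the ELKI statistics classes):

import java.util.logging.Logger;

public class ClusterSizeLogSketch {
  private static final Logger LOG = Logger.getLogger(ClusterSizeLogSketch.class.getName());
  private static final String PREFIX = "clustering.";

  /** One keyed count, mirroring LOG.statistics(new LongStatistic(key, value)). */
  static void statistics(String key, long value) {
    LOG.info(key + " " + value);
  }

  public static void main(String[] args) {
    int[] sizes = { 120, 87, 5 };
    statistics(PREFIX + "clusters", sizes.length);
    for (int i = 0; i < sizes.length; i++) {
      // Per-cluster keys get their own sub-prefix, as in logClusterSizes above.
      statistics(PREFIX + "cluster-" + i + ".size", sizes[i]);
    }
  }
}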
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EvaluateConcordantPairs, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param c Clustering
* @return Gamma index
*/
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
List<? extends Cluster<?>> clusters = c.getAllClusters();
int ignorednoise = 0, withinPairs = 0;
for (Cluster<?> cluster : clusters) {
if (cluster.size() <= 1 || cluster.isNoise()) {
switch(noiseHandling) {
case IGNORE_NOISE:
ignorednoise += cluster.size();
continue;
case TREAT_NOISE_AS_SINGLETONS:
// No concordant distances.
continue;
case MERGE_NOISE:
// Treat like a cluster below.
break;
}
}
withinPairs += (cluster.size() * (cluster.size() - 1)) >>> 1;
if (withinPairs < 0) {
throw new AbortException("Integer overflow - clusters too large to compute pairwise distances.");
}
}
// Materialize within-cluster distances (sorted):
double[] withinDistances = computeWithinDistances(rel, clusters, withinPairs);
int[] withinTies = new int[withinDistances.length];
// Count ties within
countTies(withinDistances, withinTies);
long concordantPairs = 0, discordantPairs = 0, betweenPairs = 0;
// Step two, compute discordant distances:
for (int i = 0; i < clusters.size(); i++) {
Cluster<?> ocluster1 = clusters.get(i);
if ((ocluster1.size() <= 1 || ocluster1.isNoise()) && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
continue;
}
for (int j = i + 1; j < clusters.size(); j++) {
Cluster<?> ocluster2 = clusters.get(j);
if ((ocluster2.size() <= 1 || ocluster2.isNoise()) && noiseHandling.equals(NoiseHandling.IGNORE_NOISE)) {
continue;
}
betweenPairs += ocluster1.size() * (long) ocluster2.size();
for (DBIDIter oit1 = ocluster1.getIDs().iter(); oit1.valid(); oit1.advance()) {
NumberVector obj = rel.get(oit1);
for (DBIDIter oit2 = ocluster2.getIDs().iter(); oit2.valid(); oit2.advance()) {
double dist = distanceFunction.distance(obj, rel.get(oit2));
int p = Arrays.binarySearch(withinDistances, dist);
if (p >= 0) {
// Tied distances:
while (p > 0 && withinDistances[p - 1] >= dist) {
--p;
}
concordantPairs += p;
discordantPairs += withinDistances.length - p - withinTies[p];
continue;
}
p = -p - 1;
concordantPairs += p;
discordantPairs += withinDistances.length - p;
}
}
}
}
// Total number of pairs possible:
final long t = ((rel.size() - ignorednoise) * (long) (rel.size() - ignorednoise - 1)) >>> 1;
final long tt = (t * (t - 1)) >>> 1;
double gamma = (concordantPairs - discordantPairs) / (double) (concordantPairs + discordantPairs);
double tau = computeTau(concordantPairs, discordantPairs, tt, withinDistances.length, betweenPairs);
// Avoid NaN when everything is in a single cluster:
gamma = gamma > 0. ? gamma : 0.;
tau = tau > 0. ? tau : 0.;
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
if (ignorednoise > 0) {
LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
}
LOG.statistics(new DoubleStatistic(key + ".gamma", gamma));
LOG.statistics(new DoubleStatistic(key + ".tau", tau));
}
EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
MeasurementGroup g = ev.findOrCreateGroup("Concordance-based Evaluation");
g.addMeasure("Gamma", gamma, -1., 1., 0., false);
g.addMeasure("Tau", tau, -1., +1., 0., false);
db.getHierarchy().resultChanged(ev);
return gamma;
}
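The core counting step works because withinDistances is sorted: for each between-cluster distance, Arrays.binarySearch yields the number of smaller within-cluster distances (the concordant pairs) in O(log n), and the remainder are discordant. A standalone sketch of that step, ignoring the tie correction handled above (names are illustrative):

import java.util.Arrays;

public class GammaSketch {
  /** Baker-Hubert Gamma from sorted within-cluster and between-cluster distances. */
  static double gamma(double[] withinSorted, double[] between) {
    long concordant = 0, discordant = 0;
    for (double dist : between) {
      int p = Arrays.binarySearch(withinSorted, dist);
      // Insertion point = number of within-distances smaller than dist.
      p = p >= 0 ? p : -p - 1;
      concordant += p;
      discordant += withinSorted.length - p;
    }
    return (concordant - discordant) / (double) (concordant + discordant);
  }

  public static void main(String[] args) {
    double[] within = { 0.5, 1.0, 1.5 }; // must be sorted
    double[] between = { 2.0, 3.0, 0.2 };
    // concordant = 3 + 3 + 0 = 6, discordant = 0 + 0 + 3 = 3, Gamma = 3/9
    System.out.println(gamma(within, between)); // 0.333...
  }
}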
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EvaluateDaviesBouldin, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param c Clustering
* @return DB-index
*/
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
List<? extends Cluster<?>> clusters = c.getAllClusters();
NumberVector[] centroids = new NumberVector[clusters.size()];
int noisecount = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseOption);
double[] withinGroupDistance = withinGroupDistances(rel, clusters, centroids);
Mean daviesBouldin = new Mean();
for (int i = 0; i < clusters.size(); i++) {
final NumberVector centroid = centroids[i];
final double withinGroupDistancei = withinGroupDistance[i];
// maximum within-to-between cluster spread
double max = 0;
for (int j = 0; j < clusters.size(); j++) {
NumberVector ocentroid = centroids[j];
if (ocentroid == centroid) {
continue;
}
// Both are real clusters:
if (centroid != null && ocentroid != null) {
// bD = between group distance
double bD = distanceFunction.distance(centroid, ocentroid);
// d = within-to-between cluster spread
double d = (withinGroupDistancei + withinGroupDistance[j]) / bD;
max = d > max ? d : max;
} else if (noiseOption != NoiseHandling.IGNORE_NOISE) {
if (centroid != null) {
double d = Double.POSITIVE_INFINITY;
// Find the closest element
for (DBIDIter it = clusters.get(j).getIDs().iter(); it.valid(); it.advance()) {
double d2 = distanceFunction.distance(centroid, rel.get(it));
d = d2 < d ? d2 : d;
}
d = withinGroupDistancei / d;
max = d > max ? d : max;
} else if (ocentroid != null) {
double d = Double.POSITIVE_INFINITY;
// Find the closest element
for (DBIDIter it = clusters.get(i).getIDs().iter(); it.valid(); it.advance()) {
double d2 = distanceFunction.distance(rel.get(it), ocentroid);
d = d2 < d ? d2 : d;
}
d = withinGroupDistance[j] / d;
max = d > max ? d : max;
}
// else: (0+0) / d = 0.
}
}
daviesBouldin.put(max);
}
// For a single cluster, we return 2 (result for equidistant points)
final double daviesBouldinMean = daviesBouldin.getCount() > 1 ? daviesBouldin.getMean() : 2.;
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(key + ".db-index.noise-handling", noiseOption.toString()));
if (noisecount > 0) {
LOG.statistics(new LongStatistic(key + ".db-index.ignored", noisecount));
}
LOG.statistics(new DoubleStatistic(key + ".db-index", daviesBouldinMean));
}
EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
g.addMeasure("Davies Bouldin Index", daviesBouldinMean, 0., Double.POSITIVE_INFINITY, 0., true);
db.getHierarchy().resultChanged(ev);
return daviesBouldinMean;
}
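The Davies-Bouldin index averages, per cluster, the worst within-to-between spread max_j (S_i + S_j) / d(c_i, c_j); lower values indicate better separation. A standalone sketch over plain arrays with Euclidean distance and precomputed scatter values (names are illustrative):

public class DaviesBouldinSketch {
  static double euclidean(double[] a, double[] b) {
    double sum = 0;
    for (int d = 0; d < a.length; d++) {
      double diff = a[d] - b[d];
      sum += diff * diff;
    }
    return Math.sqrt(sum);
  }

  /** DB index from centroids and per-cluster mean distances to the centroid. */
  static double daviesBouldin(double[][] centroids, double[] scatter) {
    double sum = 0;
    for (int i = 0; i < centroids.length; i++) {
      double max = 0;
      for (int j = 0; j < centroids.length; j++) {
        if (i == j) {
          continue;
        }
        // Within-to-between spread of the cluster pair (i, j):
        double d = (scatter[i] + scatter[j]) / euclidean(centroids[i], centroids[j]);
        max = Math.max(max, d);
      }
      sum += max;
    }
    return sum / centroids.length; // lower is better
  }

  public static void main(String[] args) {
    double[][] centroids = { { 0, 0 }, { 4, 0 } };
    double[] scatter = { 1.0, 1.0 };
    System.out.println(daviesBouldin(centroids, scatter)); // (1 + 1) / 4 = 0.5
  }
}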
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EvaluateCIndex, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param c Clustering
* @return C-Index
*/
public double evaluateClustering(Database db, Relation<? extends O> rel, DistanceQuery<O> dq, Clustering<?> c) {
List<? extends Cluster<?>> clusters = c.getAllClusters();
// Count ignored noise, and within-cluster distances
int ignorednoise = 0, w = 0;
for (Cluster<?> cluster : clusters) {
if (cluster.size() <= 1 || cluster.isNoise()) {
switch(noiseOption) {
case IGNORE_NOISE:
ignorednoise += cluster.size();
// Ignore
continue;
case TREAT_NOISE_AS_SINGLETONS:
// No within-cluster distances!
continue;
case MERGE_NOISE:
// Treat like a cluster
break;
default:
LOG.warning("Unknown noise handling option: " + noiseOption);
}
}
w += (cluster.size() * (cluster.size() - 1)) >>> 1;
}
// TODO: for small k=2, and balanced clusters, it may be more efficient to
// just build a long array with all distances, and select the quantiles.
// The heaps used below pay off in memory consumption for k > 2
// Yes, maxDists is supposed to be a min heap, and minDists a max heap:
// to maintain the w largest distances, we must be able to replace the
// smallest of the current w largest (and vice versa for the smallest).
DoubleHeap maxDists = new DoubleMinHeap(w);
DoubleHeap minDists = new DoubleMaxHeap(w);
// Sum of within-cluster distances
double theta = 0.;
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Processing clusters for C-Index", clusters.size(), LOG) : null;
for (int i = 0; i < clusters.size(); i++) {
Cluster<?> cluster = clusters.get(i);
if (cluster.size() <= 1 || cluster.isNoise()) {
switch(noiseOption) {
case IGNORE_NOISE:
LOG.incrementProcessed(prog);
// Ignore
continue;
case TREAT_NOISE_AS_SINGLETONS:
processSingleton(cluster, rel, dq, maxDists, minDists, w);
LOG.incrementProcessed(prog);
continue;
case MERGE_NOISE:
// Treat like a cluster, below
break;
}
}
theta += processCluster(cluster, clusters, i, dq, maxDists, minDists, w);
LOG.incrementProcessed(prog);
}
LOG.ensureCompleted(prog);
// Simulate best and worst cases:
// Sum of largest and smallest
double min = 0, max = 0;
assert (minDists.size() == w);
assert (maxDists.size() == w);
for (DoubleHeap.UnsortedIter it = minDists.unsortedIter(); it.valid(); it.advance()) {
min += it.get();
}
for (DoubleHeap.UnsortedIter it = maxDists.unsortedIter(); it.valid(); it.advance()) {
max += it.get();
}
assert (max >= min);
double cIndex = (max > min) ? (theta - min) / (max - min) : 1.;
if (LOG.isStatistics()) {
LOG.statistics(new StringStatistic(key + ".c-index.noise-handling", noiseOption.toString()));
if (ignorednoise > 0) {
LOG.statistics(new LongStatistic(key + ".c-index.ignored", ignorednoise));
}
LOG.statistics(new DoubleStatistic(key + ".c-index", cIndex));
}
EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
g.addMeasure("C-Index", cIndex, 0., 1., 0., true);
db.getHierarchy().resultChanged(ev);
return cIndex;
}
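With theta the sum of the w within-cluster distances, the C-index rescales it between the smallest and largest sums that any w pairwise distances could attain: C = (theta - min) / (max - min), so 0 is ideal. A standalone sketch using one fully sorted distance array instead of the two heaps, the simpler variant the TODO above mentions for small inputs (names are illustrative):

import java.util.Arrays;

public class CIndexSketch {
  /** C-index from the within-cluster distance sum, its count, and all pairwise distances. */
  static double cIndex(double theta, int w, double[] allPairDistances) {
    double[] sorted = allPairDistances.clone();
    Arrays.sort(sorted);
    double min = 0, max = 0;
    for (int i = 0; i < w; i++) {
      min += sorted[i];                     // sum of the w smallest distances
      max += sorted[sorted.length - 1 - i]; // sum of the w largest distances
    }
    return max > min ? (theta - min) / (max - min) : 1.;
  }

  public static void main(String[] args) {
    double[] all = { 0.5, 1.0, 1.5, 4.0, 4.5, 5.0 };
    // Two within-cluster distances summing to 1.5, the best possible:
    System.out.println(cIndex(1.5, 2, all)); // (1.5 - 1.5) / (9.5 - 1.5) = 0.0
  }
}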