Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EvaluateVarianceRatioCriteria, method evaluateClustering:
/**
 * Evaluate a single clustering.
 *
 * @param db Database
 * @param rel Data relation
 * @param c Clustering
 * @return Variance Ratio Criteria
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
  // FIXME: allow using a precomputed distance matrix!
  final SquaredEuclideanDistanceFunction df = SquaredEuclideanDistanceFunction.STATIC;
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  double vrc = 0.;
  int ignorednoise = 0;
  if (clusters.size() > 1) {
    NumberVector[] centroids = new NumberVector[clusters.size()];
    ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseOption);
    // Build global centroid and cluster count:
    final int dim = RelationUtil.dimensionality(rel);
    Centroid overallCentroid = new Centroid(dim);
    int clustercount = globalCentroid(overallCentroid, rel, clusters, centroids, noiseOption);
    // a: Distance to own centroid
    // b: Distance to overall centroid
    double a = 0, b = 0;
    Iterator<? extends Cluster<?>> ci = clusters.iterator();
    for (int i = 0; ci.hasNext(); i++) {
      Cluster<?> cluster = ci.next();
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch (noiseOption) {
        case IGNORE_NOISE:
          // Ignored
          continue;
        case TREAT_NOISE_AS_SINGLETONS:
          // Singletons: a = 0 by definition.
          for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
            b += df.distance(overallCentroid, rel.get(it));
          }
          // Continue with the NEXT cluster.
          continue;
        case MERGE_NOISE:
          // Treat like a regular cluster below:
          break;
        }
      }
      for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
        NumberVector vec = rel.get(it);
        a += df.distance(centroids[i], vec);
        b += df.distance(overallCentroid, vec);
      }
    }
    vrc = ((b - a) / a) * ((rel.size() - clustercount) / (clustercount - 1.));
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
      vrc *= (rel.size() - ignorednoise) / (double) rel.size();
    }
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".vrc.noise-handling", noiseOption.toString()));
    if (ignorednoise > 0) {
      LOG.statistics(new LongStatistic(key + ".vrc.ignored", ignorednoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".vrc", vrc));
  }
  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  g.addMeasure("Variance Ratio Criteria", vrc, 0., 1., 0., false);
  return vrc;
}
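In the computation above, a accumulates the squared distances of points to their own cluster centroid (within-cluster scatter) and b the squared distances to the overall centroid (total scatter), so b - a is the between-cluster scatter. Writing n = rel.size() and k = clustercount, the assignment to vrc is exactly the Calinski-Harabasz variance ratio criterion:

\mathrm{VRC} = \frac{(b - a)\,/\,(k - 1)}{a\,/\,(n - k)} = \frac{b - a}{a} \cdot \frac{n - k}{k - 1}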
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class APRIORI, method buildFrequentOneItemsets:
/**
 * Build the 1-itemsets.
 *
 * @param relation Data relation
 * @param dim Maximum dimensionality
 * @param needed Minimum support needed
 * @return 1-itemsets
 */
protected List<OneItemset> buildFrequentOneItemsets(final Relation<? extends SparseFeatureVector<?>> relation, final int dim, final int needed) {
  // TODO: use TIntList and prefill appropriately to avoid knowing "dim"
  // beforehand?
  int[] counts = new int[dim];
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    SparseFeatureVector<?> bv = relation.get(iditer);
    for (int it = bv.iter(); bv.iterValid(it); it = bv.iterAdvance(it)) {
      counts[bv.iterDim(it)]++;
    }
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(STAT + "1-items.candidates", dim));
  }
  // Generate initial candidates of length 1.
  List<OneItemset> frequent = new ArrayList<>(dim);
  for (int i = 0; i < dim; i++) {
    if (counts[i] >= needed) {
      frequent.add(new OneItemset(i, counts[i]));
    }
  }
  return frequent;
}
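The counting step is easiest to see on a toy input. Below is a self-contained sketch of the same idea, using a plain boolean transaction matrix instead of ELKI's SparseFeatureVector relation; the class and variable names are hypothetical, not ELKI API:

public class OneItemsetCountingSketch {
  public static void main(String[] args) {
    // Three transactions over dim = 4 items; minimum support needed = 2.
    boolean[][] transactions = {
        { true, true, false, false },
        { true, false, true, false },
        { true, true, false, true },
    };
    final int dim = 4, needed = 2;
    // One pass over the data, one increment per item occurrence:
    int[] counts = new int[dim];
    for (boolean[] t : transactions) {
      for (int d = 0; d < dim; d++) {
        if (t[d]) {
          counts[d]++;
        }
      }
    }
    // Keep only the items that reach the minimum support.
    for (int d = 0; d < dim; d++) {
      if (counts[d] >= needed) {
        System.out.println("frequent 1-itemset {" + d + "} with support " + counts[d]);
      }
    }
    // Prints items 0 (support 3) and 1 (support 2); items 2 and 3 are pruned.
  }
}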
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EM, method run:
/**
 * Performs the EM clustering algorithm on the given database.
 *
 * Finally, a hard clustering is provided, in which each cluster is assigned
 * the points that have the highest probability of belonging to it. The
 * complete probability vector over all models, however, remains associated
 * with each database object.
 *
 * @param database Database
 * @param relation Relation
 * @return Result
 */
public Clustering<M> run(Database database, Relation<V> relation) {
  if (relation.size() == 0) {
    throw new IllegalArgumentException("database empty: must contain elements");
  }
  // initial models
  List<? extends EMClusterModel<M>> models = mfactory.buildInitialModels(database, relation, k, SquaredEuclideanDistanceFunction.STATIC);
  WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  double loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  DoubleStatistic likestat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".loglikelihood") : null;
  if (LOG.isStatistics()) {
    LOG.statistics(likestat.setDouble(loglikelihood));
  }
  // Iterate until the log likelihood no longer changes (or maxiter is hit).
  int it = 0, lastimprovement = 0;
  // For detecting instabilities.
  double bestloglikelihood = loglikelihood;
  for (++it; it < maxiter || maxiter < 0; it++) {
    final double oldloglikelihood = loglikelihood;
    recomputeCovarianceMatrices(relation, probClusterIGivenX, models, prior);
    // reassign probabilities
    loglikelihood = assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    if (LOG.isStatistics()) {
      LOG.statistics(likestat.setDouble(loglikelihood));
    }
    if (loglikelihood - bestloglikelihood > delta) {
      lastimprovement = it;
      bestloglikelihood = loglikelihood;
    }
    if (Math.abs(loglikelihood - oldloglikelihood) <= delta || lastimprovement < it >> 1) {
      break;
    }
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", it));
  }
  // fill result with clusters and models
  List<ModifiableDBIDs> hardClusters = new ArrayList<>(k);
  for (int i = 0; i < k; i++) {
    hardClusters.add(DBIDUtil.newArray());
  }
  // provide a hard clustering
  for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
    hardClusters.get(argmax(probClusterIGivenX.get(iditer))).add(iditer);
  }
  Clustering<M> result = new Clustering<>("EM Clustering", "em-clustering");
  // provide models within the result
  for (int i = 0; i < k; i++) {
    result.addToplevelCluster(new Cluster<>(hardClusters.get(i), models.get(i).finalizeCluster()));
  }
  if (isSoft()) {
    result.addChildResult(new MaterializedRelation<>("cluster assignments", "em-soft-score", SOFT_TYPE, probClusterIGivenX, relation.getDBIDs()));
  } else {
    probClusterIGivenX.destroy();
  }
  return result;
}
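The hard-clustering step above selects, for every point, the index of the largest entry in its probability vector. The argmax call is resolved via a static import in the original class; a minimal equivalent, shown here only to make the step self-contained, is:

// Index of the largest entry; for EM, the most probable cluster of a point.
private static int argmax(double[] v) {
  int best = 0;
  for (int i = 1; i < v.length; i++) {
    if (v[i] > v[best]) {
      best = i;
    }
  }
  return best;
}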
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class KMedoidsEM, method run:
/**
 * Run k-medoids
 *
 * @param database Database
 * @param relation relation to use
 * @return result
 */
public Clustering<MedoidModel> run(Database database, Relation<V> relation) {
  if (relation.size() <= 0) {
    return new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
  }
  DistanceQuery<V> distQ = null;
  // Only enforce a distance matrix for PAM initialization, which is slow.
  if (initializer instanceof PAMInitialMeans) {
    distQ = DatabaseUtil.precomputedDistanceQuery(database, relation, getDistanceFunction(), LOG);
  } else {
    distQ = database.getDistanceQuery(relation, getDistanceFunction());
  }
  // Choose initial medoids
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  ArrayModifiableDBIDs medoids = DBIDUtil.newArray(initializer.chooseInitialMedoids(k, relation.getDBIDs(), distQ));
  DBIDArrayMIter miter = medoids.iter();
  double[] mdists = new double[k];
  // Setup cluster assignment store
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    HashSetModifiableDBIDs set = DBIDUtil.newHashSet(relation.size() / k);
    // Add medoids.
    set.add(miter.seek(i));
    clusters.add(set);
  }
  // Initial assignment to nearest medoids
  // TODO: reuse this information, from the build phase, when possible?
  double tc = assignToNearestCluster(miter, mdists, clusters, distQ);
  if (LOG.isStatistics()) {
    LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + 0 + ".cost", tc));
  }
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medoids EM iteration", LOG) : null;
  // Swap phase
  int iteration = 0;
  DBIDVar best = DBIDUtil.newVar();
  while (true) {
    boolean changed = false;
    // Try to swap the medoid with a better cluster member:
    int i = 0;
    for (miter.seek(0); miter.valid(); miter.advance(), i++) {
      best.unset();
      double bestm = mdists[i];
      for (DBIDIter iter = clusters.get(i).iter(); iter.valid(); iter.advance()) {
        if (DBIDUtil.equal(miter, iter)) {
          continue;
        }
        double sum = 0;
        for (DBIDIter iter2 = clusters.get(i).iter(); iter2.valid(); iter2.advance()) {
          sum += distQ.distance(iter, iter2);
        }
        if (sum < bestm) {
          best.set(iter);
          bestm = sum;
        }
      }
      if (best.isSet() && !DBIDUtil.equal(miter, best)) {
        changed = true;
        assert (clusters.get(i).contains(best));
        medoids.set(i, best);
        mdists[i] = bestm;
      }
    }
    // Reassign
    if (!changed) {
      break;
    }
    double nc = assignToNearestCluster(miter, mdists, clusters, distQ);
    ++iteration;
    if (LOG.isStatistics()) {
      LOG.statistics(new DoubleStatistic(KEY + ".iteration-" + iteration + ".cost", nc));
    }
    LOG.incrementProcessed(prog);
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap result
  Clustering<MedoidModel> result = new Clustering<>("k-Medoids Clustering", "kmedoids-clustering");
  for (DBIDArrayIter it = medoids.iter(); it.valid(); it.advance()) {
    result.addToplevelCluster(new Cluster<>(clusters.get(it.getOffset()), new MedoidModel(DBIDUtil.deref(it))));
  }
  return result;
}
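The heart of the swap phase is the medoid-update rule: within each cluster, the medoid is replaced by the member with the smallest summed distance to all other members, if that improves on the current cost. A standalone sketch of just this rule, using a plain distance matrix in place of ELKI's DistanceQuery (all names hypothetical):

// Returns the member with the smallest summed distance to all members of the
// cluster, or the current medoid if no candidate improves on its cost.
static int bestMedoid(int[] members, double[][] dist, int medoid, double medoidCost) {
  int best = medoid;
  double bestCost = medoidCost;
  for (int cand : members) {
    if (cand == medoid) {
      continue;
    }
    double sum = 0;
    for (int other : members) {
      sum += dist[cand][other];
    }
    if (sum < bestCost) {
      best = cand;
      bestCost = sum;
    }
  }
  return best;
}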
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class AbstractMTree, method logStatistics:
@Override
public void logStatistics() {
  super.logStatistics();
  Logging log = getLogger();
  if (log.isStatistics()) {
    log.statistics(new LongStatistic(this.getClass().getName() + ".height", getHeight()));
    statistics.logStatistics();
  }
}
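All five examples share the same idiom: guard with isStatistics() so the statistic object is only built when statistics logging is enabled, then emit a LongStatistic under a hierarchical key. Reduced to its essentials (MyAlgorithm and its iteration counter are hypothetical; Logging and LongStatistic are the ELKI classes used above):

import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic;

public class MyAlgorithm {
  private static final Logging LOG = Logging.getLogger(MyAlgorithm.class);

  public void run() {
    long iterations = 0;
    // ... main loop of the algorithm, incrementing iterations ...
    // The guard avoids constructing the statistic when logging is disabled:
    if (LOG.isStatistics()) {
      LOG.statistics(new LongStatistic(MyAlgorithm.class.getName() + ".iterations", iterations));
    }
  }
}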