Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EvaluatePBMIndex, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param c Clustering
* @return PBM
*/
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  NumberVector[] centroids = new NumberVector[clusters.size()];
  int ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseHandling);
  // Build global centroid and cluster count:
  final int dim = RelationUtil.dimensionality(rel);
  Centroid overallCentroid = new Centroid(dim);
  EvaluateVarianceRatioCriteria.globalCentroid(overallCentroid, rel, clusters, centroids, noiseHandling);
  // Maximum distance between centroids:
  double max = 0;
  for (int i = 0; i < centroids.length; i++) {
    if (centroids[i] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) {
      continue;
    }
    for (int j = i + 1; j < centroids.length; j++) {
      if (centroids[j] == null && noiseHandling != NoiseHandling.TREAT_NOISE_AS_SINGLETONS) {
        continue;
      }
      if (centroids[i] == null && centroids[j] == null) {
        // Need to compute pairwise distances of noise clusters.
        for (DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) {
          for (DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) {
            double dist = distanceFunction.distance(rel.get(iti), rel.get(itj));
            max = dist > max ? dist : max;
          }
        }
      } else if (centroids[i] == null) {
        for (DBIDIter iti = clusters.get(i).getIDs().iter(); iti.valid(); iti.advance()) {
          double dist = distanceFunction.distance(rel.get(iti), centroids[j]);
          max = dist > max ? dist : max;
        }
      } else if (centroids[j] == null) {
        for (DBIDIter itj = clusters.get(j).getIDs().iter(); itj.valid(); itj.advance()) {
          double dist = distanceFunction.distance(centroids[i], rel.get(itj));
          max = dist > max ? dist : max;
        }
      } else {
        double dist = distanceFunction.distance(centroids[i], centroids[j]);
        max = dist > max ? dist : max;
      }
    }
  }
  // a: Distance to own centroid
  // b: Distance to overall centroid
  double a = 0, b = 0;
  Iterator<? extends Cluster<?>> ci = clusters.iterator();
  for (int i = 0; ci.hasNext(); i++) {
    Cluster<?> cluster = ci.next();
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseHandling) {
      case IGNORE_NOISE:
        // Ignored
        continue;
      case TREAT_NOISE_AS_SINGLETONS:
        // Singletons: a = 0 by definition.
        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
          b += SquaredEuclideanDistanceFunction.STATIC.distance(overallCentroid, rel.get(it));
        }
        // Continue with the NEXT cluster.
        continue;
      case MERGE_NOISE:
        // Treat like a regular cluster below:
        break;
      }
    }
    for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
      NumberVector obj = rel.get(it);
      a += distanceFunction.distance(centroids[i], obj);
      b += distanceFunction.distance(overallCentroid, obj);
    }
  }
  final double pbm = FastMath.pow((1. / centroids.length) * (b / a) * max, 2.);
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".pbm.noise-handling", noiseHandling.toString()));
    if (ignorednoise > 0) {
      LOG.statistics(new LongStatistic(key + ".pbm.ignored", ignorednoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".pbm", pbm));
  }
  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  g.addMeasure("PBM-Index", pbm, 0., Double.POSITIVE_INFINITY, 0., false);
  db.getHierarchy().resultChanged(ev);
  return pbm;
}
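For reference, the measure computed above is the PBM index: PBM = ((1/K) * (E_1/E_K) * D_K)^2, where E_1 is the sum of distances to the global centroid (b in the code), E_K the sum of distances to each point's own cluster centroid (a), and D_K the maximum distance between cluster centers (max); larger values indicate better clusterings. Below is a minimal, self-contained sketch of the same formula on plain double arrays, assuming Euclidean distance and no noise handling; pbm and dist are hypothetical helpers for illustration, not ELKI API:

// Hypothetical sketch: PBM index for data[n][d] with labels in 0..k-1,
// assuming every cluster is non-empty and Euclidean distance.
static double pbm(double[][] data, int[] label, int k) {
  final int d = data[0].length;
  double[][] cen = new double[k][d]; // per-cluster centroids
  double[] gc = new double[d]; // global centroid
  int[] cnt = new int[k];
  for (int i = 0; i < data.length; i++) {
    cnt[label[i]]++;
    for (int j = 0; j < d; j++) {
      cen[label[i]][j] += data[i][j];
      gc[j] += data[i][j];
    }
  }
  for (int c = 0; c < k; c++) {
    for (int j = 0; j < d; j++) {
      cen[c][j] /= cnt[c];
    }
  }
  for (int j = 0; j < d; j++) {
    gc[j] /= data.length;
  }
  double a = 0, b = 0, max = 0; // a: E_K, b: E_1, max: D_K
  for (int i = 0; i < data.length; i++) {
    a += dist(data[i], cen[label[i]]);
    b += dist(data[i], gc);
  }
  for (int i = 0; i < k; i++) {
    for (int j = i + 1; j < k; j++) {
      double dij = dist(cen[i], cen[j]);
      max = dij > max ? dij : max;
    }
  }
  return Math.pow((1. / k) * (b / a) * max, 2);
}

static double dist(double[] x, double[] y) {
  double s = 0;
  for (int i = 0; i < x.length; i++) {
    s += (x[i] - y[i]) * (x[i] - y[i]);
  }
  return Math.sqrt(s);
}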
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EvaluateSilhouette, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param dq Distance query
* @param c Clustering
* @return Average silhouette
*/
public double evaluateClustering(Database db, Relation<O> rel, DistanceQuery<O> dq, Clustering<?> c) {
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  MeanVariance msil = new MeanVariance();
  int ignorednoise = 0;
  for (Cluster<?> cluster : clusters) {
    // Note: we treat 1-element clusters the same as noise.
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        ignorednoise += cluster.size();
        // Ignore noise elements
        continue;
      case TREAT_NOISE_AS_SINGLETONS:
        // As suggested in Rousseeuw, we use 0 for singletons.
        msil.put(0., cluster.size());
        continue;
      case MERGE_NOISE:
        // Treat as cluster below
        break;
      }
    }
    ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs());
    // Temporary storage.
    double[] as = new double[ids.size()];
    DBIDArrayIter it1 = ids.iter(), it2 = ids.iter();
    for (it1.seek(0); it1.valid(); it1.advance()) {
      // a: In-cluster distances; start from the already computed ones.
      double a = as[it1.getOffset()];
      for (it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) {
        final double dist = dq.distance(it1, it2);
        a += dist;
        as[it2.getOffset()] += dist;
      }
      a /= (ids.size() - 1);
      // b: minimum average distance to other clusters:
      double b = Double.POSITIVE_INFINITY;
      for (Cluster<?> ocluster : clusters) {
        if (ocluster == cluster) { // yes, reference identity
          // Same cluster
          continue;
        }
        if (ocluster.size() <= 1 || ocluster.isNoise()) {
          switch(noiseOption) {
          case IGNORE_NOISE:
            // Ignore noise elements
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            // Treat noise cluster as singletons:
            for (DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) {
              final double dist = dq.distance(it1, it3);
              // Minimum average
              b = dist < b ? dist : b;
            }
            continue;
          case MERGE_NOISE:
            // Treat as cluster below
            break;
          }
        }
        final DBIDs oids = ocluster.getIDs();
        double btmp = 0.;
        for (DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) {
          btmp += dq.distance(it1, it3);
        }
        // Average
        btmp /= oids.size();
        // Minimum average
        b = btmp < b ? btmp : b;
      }
      // One cluster only?
      b = b < Double.POSITIVE_INFINITY ? b : a;
      msil.put((b - a) / (b > a ? b : a));
    }
  }
  double penalty = 1.;
  // Only if {@link NoiseHandling#IGNORE_NOISE}:
  if (penalize && ignorednoise > 0) {
    penalty = (rel.size() - ignorednoise) / (double) rel.size();
  }
  final double meansil = penalty * msil.getMean();
  final double stdsil = penalty * msil.getSampleStddev();
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".silhouette.noise-handling", noiseOption.toString()));
    if (ignorednoise > 0) {
      LOG.statistics(new LongStatistic(key + ".silhouette.noise", ignorednoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".silhouette.mean", meansil));
    LOG.statistics(new DoubleStatistic(key + ".silhouette.stddev", stdsil));
  }
  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  g.addMeasure("Silhouette +-" + FormatUtil.NF2.format(stdsil), meansil, -1., 1., 0., false);
  db.getHierarchy().resultChanged(ev);
  return meansil;
}
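The per-point score accumulated above is the standard silhouette width s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the mean distance of i to its own cluster and b(i) the smallest mean distance to any other cluster; the as[] array reuses each symmetric in-cluster distance so only the upper triangle is computed. Here is a minimal sketch of the same computation from a precomputed distance matrix, without noise handling; meanSilhouette is a hypothetical helper, not ELKI API:

// Hypothetical sketch: mean silhouette from a symmetric distance matrix
// dm[n][n] and labels in 0..k-1; singletons contribute s(i) = 0 as in
// Rousseeuw's original proposal.
static double meanSilhouette(double[][] dm, int[] label, int k) {
  final int n = dm.length;
  double sum = 0;
  for (int i = 0; i < n; i++) {
    double[] avg = new double[k]; // summed distance of i to each cluster
    int[] cnt = new int[k];
    for (int j = 0; j < n; j++) {
      if (j != i) {
        avg[label[j]] += dm[i][j];
        cnt[label[j]]++;
      }
    }
    if (cnt[label[i]] == 0) {
      continue; // singleton: s(i) = 0, adds nothing to the sum
    }
    double a = avg[label[i]] / cnt[label[i]];
    double b = Double.POSITIVE_INFINITY;
    for (int c = 0; c < k; c++) {
      if (c != label[i] && cnt[c] > 0) {
        double bc = avg[c] / cnt[c];
        b = bc < b ? bc : b;
      }
    }
    b = b < Double.POSITIVE_INFINITY ? b : a; // one cluster only
    sum += (b - a) / (b > a ? b : a);
  }
  return sum / n;
}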
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class EvaluateSimplifiedSilhouette, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param c Clustering
* @return Mean simplified silhouette
*/
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  NumberVector[] centroids = new NumberVector[clusters.size()];
  int ignorednoise = centroids(rel, clusters, centroids, noiseOption);
  MeanVariance mssil = new MeanVariance();
  Iterator<? extends Cluster<?>> ci = clusters.iterator();
  for (int i = 0; ci.hasNext(); i++) {
    Cluster<?> cluster = ci.next();
    if (cluster.size() <= 1) {
      // As suggested in Rousseeuw, we use 0 for singletons.
      mssil.put(0., cluster.size());
      continue;
    }
    if (cluster.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        // Ignore elements
        continue;
      case TREAT_NOISE_AS_SINGLETONS:
        // As suggested in Rousseeuw, we use 0 for singletons.
        mssil.put(0., cluster.size());
        continue;
      case MERGE_NOISE:
        // Treat as cluster below
        break;
      }
    }
    // Cluster center:
    final NumberVector center = centroids[i];
    assert (center != null);
    for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
      NumberVector obj = rel.get(it);
      // a: Distance to own centroid
      double a = distance.distance(center, obj);
      // b: Distance to other clusters' centroids:
      double min = Double.POSITIVE_INFINITY;
      Iterator<? extends Cluster<?>> cj = clusters.iterator();
      for (int j = 0; cj.hasNext(); j++) {
        Cluster<?> ocluster = cj.next();
        if (i == j) {
          continue;
        }
        NumberVector other = centroids[j];
        if (other == null) {
          // Noise!
          switch(noiseOption) {
          case IGNORE_NOISE:
            continue;
          case TREAT_NOISE_AS_SINGLETONS:
            // Treat each object like a centroid!
            for (DBIDIter it2 = ocluster.getIDs().iter(); it2.valid(); it2.advance()) {
              double dist = distance.distance(rel.get(it2), obj);
              min = dist < min ? dist : min;
            }
            continue;
          case MERGE_NOISE:
            // Treat as cluster below, but should not be reachable.
            break;
          }
        }
        // Clusters: use centroid.
        double dist = distance.distance(other, obj);
        min = dist < min ? dist : min;
      }
      // One 'real' cluster only?
      min = min < Double.POSITIVE_INFINITY ? min : a;
      mssil.put((min - a) / (min > a ? min : a));
    }
  }
  double penalty = 1.;
  // Only if {@link NoiseHandling#IGNORE_NOISE}:
  if (penalize && ignorednoise > 0) {
    penalty = (rel.size() - ignorednoise) / (double) rel.size();
  }
  final double meanssil = penalty * mssil.getMean();
  final double stdssil = penalty * mssil.getSampleStddev();
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".simplified-silhouette.noise-handling", noiseOption.toString()));
    if (ignorednoise > 0) {
      LOG.statistics(new LongStatistic(key + ".simplified-silhouette.ignored", ignorednoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.mean", meanssil));
    LOG.statistics(new DoubleStatistic(key + ".simplified-silhouette.stddev", stdssil));
  }
  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  g.addMeasure("Simp. Silhouette +-" + FormatUtil.NF2.format(stdssil), meanssil, -1., 1., 0., false);
  db.getHierarchy().resultChanged(ev);
  return meanssil;
}
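The simplified silhouette replaces the mean pairwise distances of the exact silhouette with distances to cluster centroids, reducing the cost per point from O(n) to O(k). The core per-object step, sketched under the same assumptions as the PBM example above (Euclidean distance, reusing the hypothetical dist helper):

// Hypothetical sketch: simplified silhouette of one object given its own
// centroid and the centroids of all other clusters.
static double simplifiedSilhouette(double[] obj, double[] own, double[][] others) {
  double a = dist(obj, own); // distance to the own centroid
  double min = Double.POSITIVE_INFINITY;
  for (double[] c : others) {
    double d = dist(obj, c); // distance to a foreign centroid
    min = d < min ? d : min;
  }
  min = min < Double.POSITIVE_INFINITY ? min : a; // one cluster only
  return (min - a) / (min > a ? min : a);
}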
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class RandomProjectedNeighborsAndDensities, method computeSetsBounds.
/**
* Create random projections, project points and put points into sets of size
* about minSplitSize/2
*
* @param points points to process
* @param minSplitSize minimum size for which a point set is further
* partitioned (roughly corresponds to minPts in OPTICS)
* @param ptList Points that are to be projected
*/
public void computeSetsBounds(Relation<V> points, int minSplitSize, DBIDs ptList) {
  this.minSplitSize = minSplitSize;
  final int size = points.size();
  final int dim = RelationUtil.dimensionality(points);
  this.points = points;
  // Perform O(log N + log dim) splits of the entire point set's projections
  int nPointSetSplits = (int) (logOProjectionConst * MathUtil.log2(size * dim + 1));
  // Perform O(log N + log dim) projections of the point set onto a random line
  int nProject1d = (int) (logOProjectionConst * MathUtil.log2(size * dim + 1));
  LOG.statistics(new LongStatistic(PREFIX + ".partition-size", nPointSetSplits));
  LOG.statistics(new LongStatistic(PREFIX + ".num-projections", nProject1d));
  splitsets = new ArrayList<>();
  // Perform projections of points
  projectedPoints = new DoubleDataStore[nProject1d];
  DoubleDataStore[] tmpPro = new DoubleDataStore[nProject1d];
  Random rand = rnd.getSingleThreadedRandom();
  FiniteProgress projp = LOG.isVerbose() ? new FiniteProgress("Random projections", nProject1d, LOG) : null;
  for (int j = 0; j < nProject1d; j++) {
    double[] currRp = new double[dim];
    double sum = 0;
    for (int i = 0; i < dim; i++) {
      double fl = rand.nextDouble() - 0.5;
      currRp[i] = fl;
      sum += fl * fl;
    }
    sum = FastMath.sqrt(sum);
    for (int i = 0; i < dim; i++) {
      currRp[i] /= sum;
    }
    WritableDoubleDataStore currPro = DataStoreUtil.makeDoubleStorage(ptList, DataStoreFactory.HINT_HOT);
    for (DBIDIter it = ptList.iter(); it.valid(); it.advance()) {
      NumberVector vecPt = points.get(it);
      // Dot product:
      double sum2 = 0;
      for (int i = 0; i < dim; i++) {
        sum2 += currRp[i] * vecPt.doubleValue(i);
      }
      currPro.put(it, sum2);
    }
    projectedPoints[j] = currPro;
    LOG.incrementProcessed(projp);
  }
  LOG.ensureCompleted(projp);
  // Log the number of scalar projections performed.
  long numprod = nProject1d * (long) ptList.size();
  LOG.statistics(new LongStatistic(PREFIX + ".num-scalar-products", numprod));
  // Split the entire point set, reusing the projections by shuffling them
  IntArrayList proind = new IntArrayList(nProject1d);
  for (int j = 0; j < nProject1d; j++) {
    proind.add(j);
  }
  FiniteProgress splitp = LOG.isVerbose() ? new FiniteProgress("Splitting data", nPointSetSplits, LOG) : null;
  for (int avgP = 0; avgP < nPointSetSplits; avgP++) {
    // Shuffle projections
    for (int i = 0; i < nProject1d; i++) {
      tmpPro[i] = projectedPoints[i];
    }
    // Shuffle axes (Fisher-Yates)
    for (int i = 1; i < nProject1d; i++) {
      final int j = rand.nextInt(i);
      // Swap i, j
      proind.set(i, proind.set(j, proind.getInt(i)));
    }
    IntIterator it = proind.iterator();
    int i = 0;
    while (it.hasNext()) {
      int cind = it.nextInt();
      projectedPoints[cind] = tmpPro[i];
      i++;
    }
    // Split point set
    splitupNoSort(DBIDUtil.newArray(ptList), 0, size, 0, rand);
    LOG.incrementProcessed(splitp);
  }
  LOG.ensureCompleted(splitp);
}
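Each of the nProject1d passes above draws a direction uniformly from [-0.5, 0.5)^d, normalizes it to unit length, and stores the dot product of every point with that direction. A minimal stand-alone sketch of one such pass on plain arrays; project1d is a hypothetical helper, not the ELKI API:

import java.util.Random;

// Hypothetical sketch: project data[n][d] onto one random unit vector.
static double[] project1d(double[][] data, Random rand) {
  final int dim = data[0].length;
  double[] r = new double[dim];
  double norm = 0;
  for (int i = 0; i < dim; i++) {
    r[i] = rand.nextDouble() - 0.5; // uniform in [-0.5, 0.5)
    norm += r[i] * r[i];
  }
  norm = Math.sqrt(norm); // length, to normalize to a unit vector
  double[] proj = new double[data.length];
  for (int p = 0; p < data.length; p++) {
    double dot = 0;
    for (int i = 0; i < dim; i++) {
      dot += (r[i] / norm) * data[p][i];
    }
    proj[p] = dot; // 1-d coordinate of point p along the random line
  }
  return proj;
}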
Use of de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic in project elki by elki-project.
The class APRIORI, method buildFrequentTwoItemsets.
/**
* Build the 2-itemsets.
*
* @param oneitems Frequent 1-itemsets
* @param relation Data relation
* @param dim Maximum dimensionality
* @param needed Minimum support needed
* @param ids Objects to process
* @param survivors Output: objects that had at least two 1-frequent items.
* @return Frequent 2-itemsets
*/
protected List<SparseItemset> buildFrequentTwoItemsets(List<OneItemset> oneitems, final Relation<BitVector> relation, final int dim, final int needed, DBIDs ids, ArrayModifiableDBIDs survivors) {
  int f1 = 0;
  long[] mask = BitsUtil.zero(dim);
  for (OneItemset supported : oneitems) {
    BitsUtil.setI(mask, supported.item);
    f1++;
  }
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(STAT + "2-items.candidates", f1 * (long) (f1 - 1)));
  }
  // We quite aggressively size the map, assuming that almost each combination
  // is present somewhere. If this won't fit into memory, we're likely running
  // OOM somewhere later anyway!
  Long2IntOpenHashMap map = new Long2IntOpenHashMap((f1 * (f1 - 1)) >>> 1);
  final long[] scratch = BitsUtil.zero(dim);
  for (DBIDIter iditer = ids.iter(); iditer.valid(); iditer.advance()) {
    BitsUtil.setI(scratch, mask);
    relation.get(iditer).andOnto(scratch);
    int lives = 0;
    for (int i = BitsUtil.nextSetBit(scratch, 0); i >= 0; i = BitsUtil.nextSetBit(scratch, i + 1)) {
      for (int j = BitsUtil.nextSetBit(scratch, i + 1); j >= 0; j = BitsUtil.nextSetBit(scratch, j + 1)) {
        long key = (((long) i) << 32) | j;
        map.put(key, 1 + map.get(key));
        ++lives;
      }
    }
    if (lives > 2) {
      survivors.add(iditer);
    }
  }
  // Generate candidates of length 2.
  List<SparseItemset> frequent = new ArrayList<>(f1 * (int) FastMath.sqrt(f1));
  for (ObjectIterator<Long2IntMap.Entry> iter = map.long2IntEntrySet().fastIterator(); iter.hasNext();) {
    Long2IntMap.Entry entry = iter.next();
    if (entry.getIntValue() >= needed) {
      int ii = (int) (entry.getLongKey() >>> 32);
      int ij = (int) (entry.getLongKey() & -1L);
      frequent.add(new SparseItemset(new int[] { ii, ij }, entry.getIntValue()));
    }
  }
  // The hashmap may produce them out of order.
  Collections.sort(frequent);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(STAT + "2-items.frequent", frequent.size()));
  }
  return frequent;
}
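The counting map works because each candidate pair (i, j) with i < j is packed into a single primitive long key: i occupies the high 32 bits and j the low 32 bits, so a Long2IntOpenHashMap can count co-occurrences without boxing. Note that the & -1L mask on extraction is effectively a no-op, since the cast to int already truncates to the low 32 bits. A hypothetical round-trip sketch of this packing, for illustration only:

// Pack an item pair (i < j, both non-negative) into one long map key.
static long pack(int i, int j) {
  return (((long) i) << 32) | j; // i in the high word, j in the low word
}

static int unpackI(long key) {
  return (int) (key >>> 32); // recover i from the high 32 bits
}

static int unpackJ(long key) {
  return (int) key; // the int cast truncates to the low 32 bits
}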