Use of de.lmu.ifi.dbs.elki.data.model.MeanModel in project elki (elki-project): class KMeansBisectingTest, method testKMeansBisectingFMeasure.
/**
 * Run KMeansBisecting with fixed parameters (k = 2) and compare f-measure to
 * golden standard.
 */
@Test
public void testKMeansBisectingFMeasure() {
  Database db = makeSimpleDatabase(UNITTEST + "bisecting-test.csv", 300);
  // Configure bisecting k-means: k=2, fixed seed, best of 5 Lloyd runs
  // judged by within-cluster variance.
  KMeansBisecting<DoubleVector, MeanModel> kmeans = new ELKIBuilder<KMeansBisecting<DoubleVector, MeanModel>>(KMeansBisecting.class) //
      .with(KMeans.K_ID, 2) //
      .with(KMeans.SEED_ID, 0) //
      .with(BestOfMultipleKMeans.Parameterizer.TRIALS_ID, 5) //
      .with(BestOfMultipleKMeans.Parameterizer.KMEANS_ID, KMeansLloyd.class) //
      .with(BestOfMultipleKMeans.Parameterizer.QUALITYMEASURE_ID, WithinClusterVarianceQualityMeasure.class) //
      .build();
  // Run bisecting k-means on the database and compare to the expected f-measure.
  Clustering<MeanModel> result = kmeans.run(db);
  testFMeasure(db, result, 0.7408);
}
Use of de.lmu.ifi.dbs.elki.data.model.MeanModel in project elki (elki-project): class BIRCHLeafClustering, method run.
/**
 * Run the clustering algorithm: build a CF-tree, reassign every point to the
 * leaf it now falls into, and emit one cluster (with its centroid as mean
 * model) per non-empty leaf.
 *
 * @param relation Input data
 * @return Clustering with one top-level cluster per CF-tree leaf
 */
public Clustering<MeanModel> run(Relation<NumberVector> relation) {
  final int dim = RelationUtil.dimensionality(relation);
  CFTree tree = cffactory.newTree(relation.getDBIDs(), relation);
  // The CFTree does not store points. We have to reassign them (and the
  // quality is better than if we used the initial assignment, because centers
  // move in particular in the beginning, so we always had many outliers).
  Map<ClusteringFeature, ModifiableDBIDs> idmap = new HashMap<>(tree.leaves);
  for (DBIDIter iter = relation.iterDBIDs(); iter.valid(); iter.advance()) {
    ClusteringFeature cf = tree.findLeaf(relation.get(iter));
    ModifiableDBIDs ids = idmap.get(cf);
    if (ids == null) {
      // Presize the ID list with the leaf's point count as capacity hint.
      idmap.put(cf, ids = DBIDUtil.newArray(cf.n));
    }
    ids.add(iter);
  }
  Clustering<MeanModel> result = new Clustering<>("BIRCH-leaves", "BIRCH leaves");
  for (Map.Entry<ClusteringFeature, ModifiableDBIDs> ent : idmap.entrySet()) {
    ClusteringFeature leaf = ent.getKey();
    // Materialize the leaf centroid as the cluster's mean model.
    double[] center = new double[dim];
    for (int i = 0; i < dim; i++) {
      center[i] = leaf.centroid(i);
    }
    result.addToplevelCluster(new Cluster<>(ent.getValue(), new MeanModel(center)));
  }
  return result;
}
Use of de.lmu.ifi.dbs.elki.data.model.MeanModel in project elki (elki-project): class KMediansLloyd, method run.
@Override
public Clustering<MeanModel> run(Database database, Relation<V> relation) {
  // Trivial case: nothing to cluster.
  if (relation.size() <= 0) {
    return new Clustering<>("k-Medians Clustering", "kmedians-clustering");
  }
  // Pick the initial medians via the configured initialization strategy.
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  double[][] medians = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // Per-cluster member sets, presized assuming roughly balanced clusters.
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] distsum = new double[k];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Medians iteration", LOG) : null;
  // Main loop: reassign points to the nearest median, then recompute medians,
  // until no assignment changes or the iteration limit (if any) is reached.
  int iteration = 0;
  while (maxiter <= 0 || iteration < maxiter) {
    LOG.incrementProcessed(prog);
    if (!assignToNearestCluster(relation, medians, clusters, assignment, distsum)) {
      break; // Converged: no cluster assignment changed.
    }
    medians = medians(clusters, medians, relation);
    iteration++;
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap the per-cluster ID sets and their median vectors into the result.
  Clustering<MeanModel> result = new Clustering<>("k-Medians Clustering", "kmedians-clustering");
  for (int i = 0; i < clusters.size(); i++) {
    result.addToplevelCluster(new Cluster<>(clusters.get(i), new MeanModel(medians[i])));
  }
  return result;
}
Use of de.lmu.ifi.dbs.elki.data.model.MeanModel in project elki (elki-project): class CBLOF, method computeCBLOFs.
/**
 * Compute the CBLOF scores for all the data.
 *
 * @param relation Data to process
 * @param distance The distance function
 * @param cblofs CBLOF scores
 * @param cblofMinMax Minimum/maximum score tracker
 * @param largeClusters Large clusters output
 * @param smallClusters Small clusters output
 */
private void computeCBLOFs(Relation<O> relation, NumberVectorDistanceFunction<? super O> distance, WritableDoubleDataStore cblofs, DoubleMinMax cblofMinMax, List<? extends Cluster<MeanModel>> largeClusters, List<? extends Cluster<MeanModel>> smallClusters) {
  // Collect the centers of all large clusters, scoring their members on the way.
  List<NumberVector> largeClusterMeans = new ArrayList<>(largeClusters.size());
  for (Cluster<MeanModel> cluster : largeClusters) {
    NumberVector center = ModelUtil.getPrototypeOrCentroid(cluster.getModel(), relation, cluster.getIDs());
    largeClusterMeans.add(center);
    // Members of a large cluster are scored against their own cluster center.
    for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
      double score = computeLargeClusterCBLOF(relation.get(it), distance, center, cluster);
      storeCBLOFScore(cblofs, cblofMinMax, score, it);
    }
  }
  // Members of small clusters are scored against the large-cluster centers.
  for (Cluster<MeanModel> cluster : smallClusters) {
    for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
      double score = computeSmallClusterCBLOF(relation.get(it), distance, largeClusterMeans, cluster);
      storeCBLOFScore(cblofs, cblofMinMax, score, it);
    }
  }
}
Use of de.lmu.ifi.dbs.elki.data.model.MeanModel in project elki (elki-project): class CBLOF, method run.
/**
 * Runs the CBLOF algorithm on the given database.
 *
 * @param database Database to query
 * @param relation Data to process
 * @return CBLOF outlier result
 */
public OutlierResult run(Database database, Relation<O> relation) {
  StepProgress stepprog = LOG.isVerbose() ? new StepProgress("CBLOF", 3) : null;
  DBIDs ids = relation.getDBIDs();
  // Step 1: obtain a base clustering (with mean models) of the data.
  LOG.beginStep(stepprog, 1, "Computing clustering.");
  Clustering<MeanModel> clustering = clusteringAlgorithm.run(database);
  // Step 2: order clusters by decreasing size, then find the boundary index
  // separating the "large" clusters from the "small" ones.
  LOG.beginStep(stepprog, 2, "Computing boundary between large and small clusters.");
  List<? extends Cluster<MeanModel>> clusters = clustering.getAllClusters();
  Comparator<Cluster<MeanModel>> bySizeDescending = new Comparator<Cluster<MeanModel>>() {
    @Override
    public int compare(Cluster<MeanModel> o1, Cluster<MeanModel> o2) {
      // Descending order by cluster size.
      return Integer.compare(o2.size(), o1.size());
    }
  };
  Collections.sort(clusters, bySizeDescending);
  int clusterBoundary = getClusterBoundary(relation, clusters);
  List<? extends Cluster<MeanModel>> largeClusters = clusters.subList(0, clusterBoundary + 1);
  List<? extends Cluster<MeanModel>> smallClusters = clusters.subList(clusterBoundary + 1, clusters.size());
  // Step 3: score every point against the large/small cluster partition.
  LOG.beginStep(stepprog, 3, "Computing Cluster-Based Local Outlier Factors (CBLOF).");
  WritableDoubleDataStore cblofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_DB);
  DoubleMinMax cblofMinMax = new DoubleMinMax();
  computeCBLOFs(relation, distance, cblofs, cblofMinMax, largeClusters, smallClusters);
  LOG.setCompleted(stepprog);
  DoubleRelation scoreResult = new MaterializedDoubleRelation("Cluster-Based Local Outlier Factor", "cblof-outlier", cblofs, ids);
  OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(cblofMinMax.getMin(), cblofMinMax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
  return new OutlierResult(scoreMeta, scoreResult);
}
Aggregations