use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class RepresentativeUncertainClustering method run.
/**
 * This run method will do the wrapping.
 *
 * It is called from {@link AbstractAlgorithm#run(Database)} and performs the
 * call to the particular algorithm's run method, as well as the storing and
 * comparison of the resulting Clusterings.
 *
 * @param database Database
 * @param relation Data relation of uncertain objects
 * @return Clustering result
 */
public Clustering<?> run(Database database, Relation<? extends UncertainObject> relation) {
  ResultHierarchy hierarchy = database.getHierarchy();
  ArrayList<Clustering<?>> clusterings = new ArrayList<>();
  final int dim = RelationUtil.dimensionality(relation);
  DBIDs ids = relation.getDBIDs();
  // To collect samples
  Result samples = new BasicResult("Samples", "samples");
  // Step 1: Cluster sampled possible worlds:
  Random rand = random.getSingleThreadedRandom();
  FiniteProgress sampleP = LOG.isVerbose() ? new FiniteProgress("Clustering samples", numsamples, LOG) : null;
  for (int i = 0; i < numsamples; i++) {
    WritableDataStore<DoubleVector> store = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_DB, DoubleVector.class);
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      store.put(iter, relation.get(iter).drawSample(rand));
    }
    clusterings.add(runClusteringAlgorithm(hierarchy, samples, ids, store, dim, "Sample " + i));
    LOG.incrementProcessed(sampleP);
  }
  LOG.ensureCompleted(sampleP);
  // Step 2: perform the meta clustering (on samples only).
  DBIDRange rids = DBIDFactory.FACTORY.generateStaticDBIDRange(clusterings.size());
  WritableDataStore<Clustering<?>> datastore = DataStoreUtil.makeStorage(rids, DataStoreFactory.HINT_DB, Clustering.class);
  {
    Iterator<Clustering<?>> it2 = clusterings.iterator();
    for (DBIDIter iter = rids.iter(); iter.valid(); iter.advance()) {
      datastore.put(iter, it2.next());
    }
  }
  assert (rids.size() == clusterings.size());
  // Build a relation, and a distance matrix.
  Relation<Clustering<?>> crel = new MaterializedRelation<Clustering<?>>(Clustering.TYPE, rids, "Clusterings", datastore);
  PrecomputedDistanceMatrix<Clustering<?>> mat = new PrecomputedDistanceMatrix<>(crel, rids, distance);
  mat.initialize();
  ProxyDatabase d = new ProxyDatabase(rids, crel);
  d.getHierarchy().add(crel, mat);
  Clustering<?> c = metaAlgorithm.run(d);
  // Detach from database
  d.getHierarchy().remove(d, c);
  // Evaluation
  Result reps = new BasicResult("Representants", "representative");
  hierarchy.add(relation, reps);
  DistanceQuery<Clustering<?>> dq = mat.getDistanceQuery(distance);
  List<? extends Cluster<?>> cl = c.getAllClusters();
  List<DoubleObjPair<Clustering<?>>> evaluated = new ArrayList<>(cl.size());
  for (Cluster<?> clus : cl) {
    double besttau = Double.POSITIVE_INFINITY;
    Clustering<?> bestc = null;
    for (DBIDIter it1 = clus.getIDs().iter(); it1.valid(); it1.advance()) {
      double tau = 0.;
      Clustering<?> curc = crel.get(it1);
      for (DBIDIter it2 = clus.getIDs().iter(); it2.valid(); it2.advance()) {
        if (DBIDUtil.equal(it1, it2)) {
          continue;
        }
        double di = dq.distance(curc, it2);
        tau = di > tau ? di : tau;
      }
      // Cluster member with the least maximum distance.
      if (tau < besttau) {
        besttau = tau;
        bestc = curc;
      }
    }
    if (bestc == null) {
      // E.g. degenerate empty clusters
      continue;
    }
    // Global tau:
    double gtau = 0.;
    for (DBIDIter it2 = crel.iterDBIDs(); it2.valid(); it2.advance()) {
      double di = dq.distance(bestc, it2);
      gtau = di > gtau ? di : gtau;
    }
    final double cprob = computeConfidence(clus.size(), crel.size());
    // Build an evaluation result
    hierarchy.add(bestc, new RepresentativenessEvaluation(gtau, besttau, cprob));
    evaluated.add(new DoubleObjPair<Clustering<?>>(cprob, bestc));
  }
  // Sort evaluated results by confidence:
  Collections.sort(evaluated, Collections.reverseOrder());
  for (DoubleObjPair<Clustering<?>> pair : evaluated) {
    // Attach parent relation (= sample) to the representative samples.
    for (It<Relation<?>> it = hierarchy.iterParents(pair.second).filter(Relation.class); it.valid(); it.advance()) {
      hierarchy.add(reps, it.get());
    }
  }
  // Add the random samples below the representative results only:
  if (keep) {
    hierarchy.add(relation, samples);
  } else {
    hierarchy.removeSubtree(samples);
  }
  return c;
}
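
The representative selection in the nested it1/it2 loops above is a minimax medoid search: within each meta-cluster, it keeps the member clustering whose largest distance to the other members is smallest. A self-contained sketch of that selection step in plain Java (the dist argument stands in for the clustering distance; the class and method names are illustrative, not ELKI API):

import java.util.List;
import java.util.function.ToDoubleBiFunction;

public class MinimaxMedoid {
  /**
   * Pick the element whose maximum distance to the other members is least.
   *
   * @param members cluster members
   * @param dist symmetric distance function (stand-in for the clustering distance)
   * @return index of the minimax medoid, or -1 for an empty cluster
   */
  public static <T> int select(List<T> members, ToDoubleBiFunction<T, T> dist) {
    int best = -1;
    double besttau = Double.POSITIVE_INFINITY;
    for (int i = 0; i < members.size(); i++) {
      double tau = 0.;
      for (int j = 0; j < members.size(); j++) {
        if (i == j) {
          continue;
        }
        double d = dist.applyAsDouble(members.get(i), members.get(j));
        tau = d > tau ? d : tau; // worst case for candidate i
      }
      if (tau < besttau) { // keep the candidate with the least worst case
        besttau = tau;
        best = i;
      }
    }
    return best;
  }
}

This is the same O(n^2) pairwise scan the DBID loops perform on each cluster of the meta clustering.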
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class ComputeKNNOutlierScores method run.
@Override
public void run() {
  final Database database = inputstep.getDatabase();
  final Relation<O> relation = database.getRelation(distf.getInputTypeRestriction());
  // Ensure we don't go beyond the relation size:
  final int maxk = Math.min(this.maxk, relation.size() - 1);
  // Get a KNN query.
  final int lim = Math.min(maxk + 2, relation.size());
  KNNQuery<O> knnq = QueryUtil.getKNNQuery(relation, distf, lim);
  // Precompute kNN:
  if (!(knnq instanceof PreprocessorKNNQuery)) {
    MaterializeKNNPreprocessor<O> preproc = new MaterializeKNNPreprocessor<>(relation, distf, lim);
    preproc.initialize();
    relation.getHierarchy().add(relation, preproc);
  }
  // Test that we now get a proper index query
  knnq = QueryUtil.getKNNQuery(relation, distf, lim);
  if (!(knnq instanceof PreprocessorKNNQuery)) {
    throw new AbortException("Not using preprocessor knn query -- KNN queries using class: " + knnq.getClass());
  }
  // Warn for some known slow methods and large k:
  if (!isDisabled("LDOF") && maxk > 100) {
    LOG.verbose("Note: LDOF needs O(k^2) distance computations. Use -" + Parameterizer.DISABLE_ID.getName() + " LDOF to disable.");
  }
  if (!isDisabled("FastABOD") && maxk > 100) {
    LOG.warning("Note: FastABOD needs quadratic memory. Use -" + Parameterizer.DISABLE_ID.getName() + " FastABOD to disable.");
  }
  if (!isDisabled("DWOF") && maxk > 100) {
    LOG.warning("Note: DWOF needs O(k^2) distance computations. Use -" + Parameterizer.DISABLE_ID.getName() + " DWOF to disable.");
  }
  final DBIDs ids = relation.getDBIDs();
  try (PrintStream fout = new PrintStream(outfile)) {
    // Control: print the DBIDs in case we are seeing an odd iteration
    fout.append("# Data set size: " + relation.size()).append(" data type: " + relation.getDataTypeInformation()).append(FormatUtil.NEWLINE);
    // Label outlier result (reference)
    writeResult(fout, ids, bylabel.run(database), new IdentityScaling(), "bylabel");
    final int startk = (this.startk > 0) ? this.startk : this.stepk;
    final int startkmin2 = (startk >= 2) ? startk : (startk + stepk);
    final int startkmin3 = (startk >= 3) ? startk : (startkmin2 >= 3) ? startkmin2 : (startkmin2 + stepk);
    // Output function:
    BiConsumer<String, OutlierResult> out = (kstr, result) -> writeResult(fout, ids, result, scaling, kstr);
    // KNN
    runForEachK("KNN", startk, stepk, maxk, //
        k -> new KNNOutlier<O>(distf, k).run(database, relation), out);
    // KNN Weight
    runForEachK("KNNW", startk, stepk, maxk, //
        k -> new KNNWeightOutlier<O>(distf, k).run(database, relation), out);
    // Run LOF
    runForEachK("LOF", startk, stepk, maxk, //
        k -> new LOF<O>(k, distf).run(database, relation), out);
    // Run Simplified-LOF
    runForEachK("SimplifiedLOF", startk, stepk, maxk, //
        k -> new SimplifiedLOF<O>(k, distf).run(database, relation), out);
    // LoOP
    runForEachK("LoOP", startk, stepk, maxk, //
        k -> new LoOP<O>(k, k, distf, distf, 1.0).run(database, relation), out);
    // LDOF
    runForEachK("LDOF", startkmin2, stepk, maxk, //
        k -> new LDOF<O>(distf, k).run(database, relation), out);
    // Run ODIN
    runForEachK("ODIN", startk, stepk, maxk, //
        k -> new ODIN<O>(distf, k).run(database, relation), out);
    // Run FastABOD
    runForEachK("FastABOD", startkmin3, stepk, maxk, //
        k -> new FastABOD<O>(new PolynomialKernelFunction(2), k).run(database, relation), out);
    // Run KDEOS with intrinsic dimensionality 2.
    runForEachK("KDEOS", startkmin2, stepk, maxk, //
        k -> new KDEOS<O>(distf, k, k, GaussianKernelDensityFunction.KERNEL, //
            0., 0.5 * GaussianKernelDensityFunction.KERNEL.canonicalBandwidth(), //
            2).run(database, relation), out);
    // Run LDF
    runForEachK("LDF", startk, stepk, maxk, //
        k -> new LDF<O>(k, distf, GaussianKernelDensityFunction.KERNEL, 1., .1).run(database, relation), out);
    // Run INFLO
    runForEachK("INFLO", startk, stepk, maxk, //
        k -> new INFLO<O>(distf, 1.0, k).run(database, relation), out);
    // Run COF
    runForEachK("COF", startk, stepk, maxk, //
        k -> new COF<O>(k, distf).run(database, relation), out);
    // Run simple intrinsic dimensionality
    runForEachK("Intrinsic", startkmin2, stepk, maxk, //
        k -> new IntrinsicDimensionalityOutlier<O>(distf, k, AggregatedHillEstimator.STATIC).run(database, relation), out);
    // Run IDOS
    runForEachK("IDOS", startkmin2, stepk, maxk, //
        k -> new IDOS<O>(distf, AggregatedHillEstimator.STATIC, k, k).run(database, relation), out);
    // Run simple kernel-density LOF variant
    runForEachK("KDLOF", startkmin2, stepk, maxk, //
        k -> new SimpleKernelDensityLOF<O>(k, distf, GaussianKernelDensityFunction.KERNEL).run(database, relation), out);
    // Run DWOF (need pairwise distances, too)
    runForEachK("DWOF", startkmin2, stepk, maxk, //
        k -> new DWOF<O>(distf, k, 1.1).run(database, relation), out);
    // Run LIC
    runForEachK("LIC", startk, stepk, maxk, //
        k -> new LocalIsolationCoefficient<O>(distf, k).run(database, relation), out);
    // Run VOV (requires a vector field).
    if (TypeUtil.DOUBLE_VECTOR_FIELD.isAssignableFromType(relation.getDataTypeInformation())) {
      @SuppressWarnings("unchecked")
      final DistanceFunction<? super DoubleVector> df = (DistanceFunction<? super DoubleVector>) distf;
      @SuppressWarnings("unchecked")
      final Relation<DoubleVector> rel = (Relation<DoubleVector>) (Relation<?>) relation;
      runForEachK("VOV", startk, stepk, maxk, //
          k -> new VarianceOfVolume<DoubleVector>(k, df).run(database, rel), out);
    }
    // Run KNN DD
    runForEachK("KNNDD", startk, stepk, maxk, //
        k -> new KNNDD<O>(distf, k).run(database, relation), out);
    // Run KNN SOS
    runForEachK("KNNSOS", startk, stepk, maxk, //
        k -> new KNNSOS<O>(distf, k).run(relation), out);
    // Run ISOS
    runForEachK("ISOS", startkmin2, stepk, maxk, //
        k -> new ISOS<O>(distf, k, AggregatedHillEstimator.STATIC).run(relation), out);
  } catch (FileNotFoundException e) {
    throw new AbortException("Cannot create output file.", e);
  }
}
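
Each of the calls above goes through the runForEachK helper, which sweeps k over an arithmetic progression and forwards every result to the output consumer. A plausible sketch of such a helper, assuming it also honors the -disable flag via the class's isDisabled method (illustrative, not the verbatim ELKI code):

import java.util.function.BiConsumer;
import java.util.function.IntFunction;

// Sketch: run one algorithm for every k in [startk, maxk] with step stepk.
private void runForEachK(String prefix, int startk, int stepk, int maxk, //
    IntFunction<OutlierResult> runner, BiConsumer<String, OutlierResult> out) {
  if (isDisabled(prefix)) { // assumption: skip methods disabled on the command line
    return;
  }
  for (int k = startk; k <= maxk; k += stepk) {
    out.accept(prefix + "-" + k, runner.apply(k)); // e.g. label "LOF-10"
  }
}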
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class GreedyEnsembleExperiment method applyPrescaling.
/**
 * Prescale each vector (except when in {@code skip}) with the given scaling
 * function.
 *
 * @param scaling Scaling function
 * @param relation Relation to read
 * @param skip DBIDs to pass unmodified
 * @return New relation
 */
public static Relation<NumberVector> applyPrescaling(ScalingFunction scaling, Relation<NumberVector> relation, DBIDs skip) {
  if (scaling == null) {
    return relation;
  }
  NumberVector.Factory<NumberVector> factory = RelationUtil.getNumberVectorFactory(relation);
  DBIDs ids = relation.getDBIDs();
  WritableDataStore<NumberVector> contents = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT, NumberVector.class);
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    NumberVector v = relation.get(iter);
    double[] raw = v.toArray();
    if (!skip.contains(iter)) {
      applyScaling(raw, scaling);
    }
    contents.put(iter, factory.newNumberVector(raw, ArrayLikeUtil.DOUBLEARRAYADAPTER));
  }
  return new MaterializedRelation<>(relation.getDataTypeInformation(), ids, "rescaled", contents);
}
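
The applyScaling helper invoked above is not part of this excerpt. A minimal sketch of what it presumably does — apply the scaling function to each vector component in place — might look as follows (the NaN guard is an assumption, not confirmed by the source):

// Sketch of the applyScaling helper: scale each component in place.
private static void applyScaling(double[] raw, ScalingFunction scaling) {
  for (int i = 0; i < raw.length; i++) {
    double v = scaling.getScaled(raw[i]);
    // Assumption: replace NaN produced by the scaling function with 0.
    raw[i] = (v == v) ? v : 0.;
  }
}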
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class EvaluateRetrievalPerformance method run.
/**
 * Run the algorithm
 *
 * @param database Database to run on (for kNN queries)
 * @param relation Relation for distance computations
 * @param lrelation Relation for class label comparison
 * @return Retrieval performance result (MAP, ROC AUC, and kNN classification performance)
 */
public RetrievalPerformanceResult run(Database database, Relation<O> relation, Relation<?> lrelation) {
  final DistanceQuery<O> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
  final DBIDs ids = DBIDUtil.randomSample(relation.getDBIDs(), sampling, random);
  // For storing the positive neighbors.
  ModifiableDBIDs posn = DBIDUtil.newHashSet();
  // Distance storage.
  ModifiableDoubleDBIDList nlist = DBIDUtil.newDistanceDBIDList(relation.size());
  // For counting labels seen in kNN
  Object2IntOpenHashMap<Object> counters = new Object2IntOpenHashMap<>();
  // Statistics tracking
  double map = 0., mroc = 0.;
  double[] knnperf = new double[maxk];
  int samples = 0;
  FiniteProgress objloop = LOG.isVerbose() ? new FiniteProgress("Processing query objects", ids.size(), LOG) : null;
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    Object label = lrelation.get(iter);
    findMatches(posn, lrelation, label);
    if (posn.size() > 0) {
      computeDistances(nlist, iter, distQuery, relation);
      if (nlist.size() != relation.size() - (includeSelf ? 0 : 1)) {
        LOG.warning("Neighbor list does not have the desired size: " + nlist.size());
      }
      map += AveragePrecisionEvaluation.STATIC.evaluate(posn, nlist);
      mroc += ROCEvaluation.STATIC.evaluate(posn, nlist);
      KNNEvaluator.STATIC.evaluateKNN(knnperf, nlist, lrelation, counters, label);
      samples += 1;
    }
    LOG.incrementProcessed(objloop);
  }
  LOG.ensureCompleted(objloop);
  if (samples < 1) {
    throw new AbortException("No object matched - are labels parsed correctly?");
  }
  if (!(map >= 0) || !(mroc >= 0)) {
    throw new AbortException("NaN in MAP/ROC.");
  }
  map /= samples;
  mroc /= samples;
  LOG.statistics(new DoubleStatistic(PREFIX + ".map", map));
  LOG.statistics(new DoubleStatistic(PREFIX + ".rocauc", mroc));
  LOG.statistics(new DoubleStatistic(PREFIX + ".samples", samples));
  for (int k = 0; k < maxk; k++) {
    knnperf[k] = knnperf[k] / samples;
    LOG.statistics(new DoubleStatistic(PREFIX + ".knn-" + (k + 1), knnperf[k]));
  }
  return new RetrievalPerformanceResult(samples, map, mroc, knnperf);
}
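
AveragePrecisionEvaluation.STATIC.evaluate scores how early the positive set appears in the distance-ranked neighbor list. As a standalone illustration (plain Java, not the ELKI class itself), average precision over a boolean ranking reduces to the mean of precision@i taken at every rank i where a positive occurs:

// Standalone sketch of average precision over a ranked list.
public static double averagePrecision(boolean[] rankedIsPositive) {
  int positives = 0;
  double sum = 0.;
  for (int i = 0; i < rankedIsPositive.length; i++) {
    if (rankedIsPositive[i]) {
      positives++;
      sum += positives / (double) (i + 1); // precision at this rank
    }
  }
  return positives > 0 ? sum / positives : 0.;
}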
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class TextWriter method writeClusterResult.
private void writeClusterResult(Database db, StreamFactory streamOpener, Clustering<Model> clustering, Cluster<Model> clus, List<Relation<?>> ra, NamingScheme naming) throws FileNotFoundException, IOException {
  String filename = null;
  if (naming != null) {
    filename = filenameFromLabel(naming.getNameFor(clus));
  } else {
    filename = "cluster";
  }
  PrintStream outStream = streamOpener.openStream(getFilename(clus, filename));
  TextWriterStream out = new TextWriterStream(outStream, writers, fallback);
  // Write cluster information; guard against a null naming scheme
  // (consistent with the filename fallback above).
  out.commentPrintLn("Cluster: " + (naming != null ? naming.getNameFor(clus) : filename));
  clus.writeToText(out, null);
  if (naming != null && clustering.getClusterHierarchy().numParents(clus) > 0) {
    StringBuilder buf = new StringBuilder();
    buf.append("Parents:");
    for (It<Cluster<Model>> iter = clustering.getClusterHierarchy().iterParents(clus); iter.valid(); iter.advance()) {
      buf.append(' ').append(naming.getNameFor(iter.get()));
    }
    out.commentPrintLn(buf.toString());
  }
  if (naming != null && clustering.getClusterHierarchy().numChildren(clus) > 0) {
    StringBuilder buf = new StringBuilder();
    buf.append("Children:");
    for (It<Cluster<Model>> iter = clustering.getClusterHierarchy().iterChildren(clus); iter.valid(); iter.advance()) {
      buf.append(' ').append(naming.getNameFor(iter.get()));
    }
    out.commentPrintLn(buf.toString());
  }
  out.flush();
  // Print the object IDs.
  DBIDs ids = clus.getIDs();
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    printObject(out, db, iter, ra);
  }
  out.flush();
  streamOpener.closeStream(outStream);
}
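
The filenameFromLabel helper is not included in this excerpt; a hypothetical sketch that maps an arbitrary cluster label to a safe file name could be:

// Hypothetical sketch of filenameFromLabel: replace characters that are
// unsafe in file names with underscores and lower-case the result.
private String filenameFromLabel(String label) {
  return label.toLowerCase().replaceAll("[^a-z0-9_.\\[\\]-]", "_");
}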