Use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
The class CASH, method buildDerivatorDB:
/**
* Builds a database for the derivator consisting of the ids in the specified
* interval.
*
* @param relation the database storing the parameterization functions
* @param interval the interval to build the database from
* @return a database for the derivator consisting of the ids in the specified
* interval
*/
private Database buildDerivatorDB(Relation<ParameterizationFunction> relation, CASHInterval interval) {
DBIDs ids = interval.getIDs();
ProxyDatabase proxy = new ProxyDatabase(ids);
int dim = dimensionality(relation);
SimpleTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dim);
WritableDataStore<DoubleVector> prep = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT, DoubleVector.class);
// Project
for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
prep.put(iter, DoubleVector.wrap(relation.get(iter).getColumnVector()));
}
if (LOG.isDebugging()) {
LOG.debugFine("db fuer derivator : " + ids.size());
}
MaterializedRelation<DoubleVector> prel = new MaterializedRelation<>(type, ids, null, prep);
proxy.addRelation(prel);
return proxy;
}
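The method above shows the core DBIDs idiom that recurs in every snippet on this page: take a DBIDs set, allocate a data store over exactly those ids, and fill it in a single pass with a DBIDIter. A minimal self-contained sketch of that idiom, with a placeholder scoring rule that is not part of the CASH code:
import de.lmu.ifi.dbs.elki.data.NumberVector;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreFactory;
import de.lmu.ifi.dbs.elki.database.datastore.DataStoreUtil;
import de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;

public class DBIDsIdiom {
  // Allocate a double store over the given ids and fill it in one pass.
  static WritableDoubleDataStore scoreAll(DBIDs ids, Relation<? extends NumberVector> relation) {
    WritableDoubleDataStore store = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT);
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      // The iterator doubles as a DBIDRef, so no id object is materialized per element.
      store.putDouble(iter, relation.get(iter).doubleValue(0)); // placeholder "score"
    }
    return store;
  }
}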
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
The class SimpleCOP, method run:
public OutlierResult run(Database database, Relation<V> data) throws IllegalStateException {
KNNQuery<V> knnQuery = QueryUtil.getKNNQuery(data, getDistanceFunction(), k + 1);
DBIDs ids = data.getDBIDs();
WritableDoubleDataStore cop_score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
WritableDataStore<double[]> cop_err_v = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, double[].class);
WritableDataStore<double[][]> cop_datav = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, double[][].class);
WritableIntegerDataStore cop_dim = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, -1);
WritableDataStore<CorrelationAnalysisSolution<?>> cop_sol = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, CorrelationAnalysisSolution.class);
{
// compute the neighbors of each db object and derive its correlation outlier probability
FiniteProgress progressLocalPCA = LOG.isVerbose() ? new FiniteProgress("Correlation Outlier Probabilities", data.size(), LOG) : null;
double sqrt2 = MathUtil.SQRT2;
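// COP probability below: for a normal deviate with the model's stddev,
// erf(d / (stddev * sqrt(2))) is the probability mass within distance d of
// the correlation hyperplane, so scores near 1 mark strong outliers.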
for (DBIDIter id = data.iterDBIDs(); id.valid(); id.advance()) {
KNNList neighbors = knnQuery.getKNNForDBID(id, k + 1);
ModifiableDBIDs nids = DBIDUtil.newArray(neighbors);
nids.remove(id);
// TODO: do we want to use the query point as centroid?
CorrelationAnalysisSolution<V> depsol = dependencyDerivator.generateModel(data, nids);
double stddev = depsol.getStandardDeviation();
double distance = depsol.distance(data.get(id));
double prob = NormalDistribution.erf(distance / (stddev * sqrt2));
cop_score.putDouble(id, prob);
cop_err_v.put(id, times(depsol.errorVector(data.get(id)), -1));
double[][] datav = depsol.dataProjections(data.get(id));
cop_datav.put(id, datav);
cop_dim.putInt(id, depsol.getCorrelationDimensionality());
cop_sol.put(id, depsol);
LOG.incrementProcessed(progressLocalPCA);
}
LOG.ensureCompleted(progressLocalPCA);
}
// combine results.
DoubleRelation scoreResult = new MaterializedDoubleRelation("Original Correlation Outlier Probabilities", "origcop-outlier", cop_score, ids);
OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore();
OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
// extra results
result.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP.COP_DIM, TypeUtil.INTEGER, cop_dim, ids));
result.addChildResult(new MaterializedRelation<>("Error vectors", COP.COP_ERRORVEC, TypeUtil.DOUBLE_ARRAY, cop_err_v, ids));
result.addChildResult(new MaterializedRelation<>("Data vectors", "cop-datavec", TypeUtil.MATRIX, cop_datav, ids));
result.addChildResult(new MaterializedRelation<>("Correlation analysis", "cop-sol", new SimpleTypeInformation<CorrelationAnalysisSolution<?>>(CorrelationAnalysisSolution.class), cop_sol, ids));
return result;
}
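A hedged usage sketch (the helper name and threshold are illustrative, not part of SimpleCOP): the probabilities written above can be read back through the result's score relation.
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.DoubleRelation;
import de.lmu.ifi.dbs.elki.result.outlier.OutlierResult;

public class CopResultConsumer {
  // Hypothetical consumer: collect all ids whose COP probability exceeds a threshold.
  static ModifiableDBIDs thresholdOutliers(OutlierResult result, double threshold) {
    DoubleRelation scores = result.getScores();
    ModifiableDBIDs outliers = DBIDUtil.newArray();
    for (DBIDIter it = scores.iterDBIDs(); it.valid(); it.advance()) {
      if (scores.doubleValue(it) >= threshold) { // probabilistic score in [0, 1]
        outliers.add(it);
      }
    }
    return outliers;
  }
}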
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
The class GreedyEnsembleExperiment, method run:
@Override
public void run() {
// Note: the database contains the *result vectors*, not the original data.
final Database database = inputstep.getDatabase();
Relation<NumberVector> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database);
final DBID firstid = DBIDUtil.deref(labels.iterDBIDs());
final String firstlabel = labels.get(firstid);
if (!firstlabel.matches("bylabel")) {
throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
}
relation = applyPrescaling(prescaling, relation, firstid);
final int numcand = relation.size() - 1;
// Dimensionality and reference vector
final int dim = RelationUtil.dimensionality(relation);
final NumberVector refvec = relation.get(firstid);
// Build the positive index set for ROC AUC.
VectorNonZero positive = new VectorNonZero(refvec);
final int desired_outliers = (int) (rate * dim);
int union_outliers = 0;
final int[] outliers_seen = new int[dim];
// Merge the top-k for each ensemble member, until we have enough
// candidates.
{
int k = 0;
ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
if (minvote >= numcand) {
minvote = Math.max(1, numcand - 1);
}
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
// Skip "by label", obviously
if (DBIDUtil.equal(firstid, iditer)) {
continue;
}
iters.add(new DecreasingVectorIter(relation.get(iditer)));
}
loop: while (union_outliers < desired_outliers) {
for (DecreasingVectorIter iter : iters) {
if (!iter.valid()) {
LOG.warning("Union_outliers=" + union_outliers + " < desired_outliers=" + desired_outliers + " minvote=" + minvote);
break loop;
}
int cur = iter.dim();
outliers_seen[cur] += 1;
if (outliers_seen[cur] == minvote) {
union_outliers += 1;
}
iter.advance();
}
k++;
}
LOG.verbose("Merged top " + k + " outliers to: " + union_outliers + " outliers (desired: at least " + desired_outliers + ")");
}
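// At this point, outliers_seen[i] counts how many ensemble members ranked
// object (vector dimension) i among their top k, and union_outliers counts
// the objects that have collected at least minvote such votes.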
// Build the final weight vector.
final double[] estimated_weights = new double[dim];
final double[] estimated_truth = new double[dim];
updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
DoubleVector estimated_truth_vec = DoubleVector.wrap(estimated_truth);
PrimitiveDistanceFunction<NumberVector> wdist = getDistanceFunction(estimated_weights);
PrimitiveDistanceFunction<NumberVector> tdist = wdist;
// Build the naive ensemble:
final double[] naiveensemble = new double[dim];
{
double[] buf = new double[numcand];
for (int d = 0; d < dim; d++) {
int i = 0;
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
if (DBIDUtil.equal(firstid, iditer)) {
continue;
}
final NumberVector vec = relation.get(iditer);
buf[i] = vec.doubleValue(d);
i++;
}
naiveensemble[d] = voting.combine(buf, i);
if (Double.isNaN(naiveensemble[d])) {
LOG.warning("NaN after combining: " + FormatUtil.format(buf) + " i=" + i + " " + voting.toString());
}
}
}
DoubleVector naivevec = DoubleVector.wrap(naiveensemble);
// Compute single AUC scores and estimations.
// Remember the method most similar to the estimation
double bestauc = 0.0;
String bestaucstr = "";
double bestcost = Double.POSITIVE_INFINITY;
String bestcoststr = "";
DBID bestid = null;
double bestest = Double.POSITIVE_INFINITY;
{
final double[] greedyensemble = new double[dim];
// Compute individual scores
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
if (DBIDUtil.equal(firstid, iditer)) {
continue;
}
final NumberVector vec = relation.get(iditer);
singleEnsemble(greedyensemble, vec);
double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(DoubleVector.wrap(greedyensemble)));
double estimated = wdist.distance(DoubleVector.wrap(greedyensemble), estimated_truth_vec);
double cost = tdist.distance(DoubleVector.wrap(greedyensemble), refvec);
LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer));
if (auc > bestauc) {
bestauc = auc;
bestaucstr = labels.get(iditer);
}
if (cost < bestcost) {
bestcost = cost;
bestcoststr = labels.get(iditer);
}
if (estimated < bestest || bestid == null) {
bestest = estimated;
bestid = DBIDUtil.deref(iditer);
}
}
}
// Initialize ensemble with "best" method
if (prescaling != null) {
LOG.verbose("Input prescaling: " + prescaling);
}
LOG.verbose("Distance function: " + wdist);
LOG.verbose("Ensemble voting: " + voting);
if (scaling != null) {
LOG.verbose("Ensemble rescaling: " + scaling);
}
LOG.verbose("Initial estimation of outliers: " + union_outliers);
LOG.verbose("Initializing ensemble with: " + labels.get(bestid));
ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs());
ModifiableDBIDs dropped = DBIDUtil.newHashSet(relation.size());
dropped.add(firstid);
enscands.remove(bestid);
enscands.remove(firstid);
final double[] greedyensemble = new double[dim];
singleEnsemble(greedyensemble, relation.get(bestid));
// Greedily grow the ensemble
final double[] testensemble = new double[dim];
while (enscands.size() > 0) {
NumberVector greedyvec = DoubleVector.wrap(greedyensemble);
final double oldd = wdist.distance(estimated_truth_vec, greedyvec);
final int heapsize = enscands.size();
ModifiableDoubleDBIDList heap = DBIDUtil.newDistanceDBIDList(heapsize);
double[] tmp = new double[dim];
for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) {
final NumberVector vec = relation.get(iter);
singleEnsemble(tmp, vec);
// Use the candidate's single-member ensemble (tmp); wrapping greedyensemble here would always yield distance 0.
double diversity = wdist.distance(DoubleVector.wrap(tmp), greedyvec);
heap.add(diversity, iter);
}
heap.sort();
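// heap is sorted by ascending diversity, so seeking to the last entry below
// always tries the most diverse remaining candidate first; each candidate is
// removed from the heap whether it is accepted into the ensemble or dropped.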
for (DoubleDBIDListMIter it = heap.iter(); heap.size() > 0; it.remove()) {
// Last
it.seek(heap.size() - 1);
enscands.remove(it);
final NumberVector vec = relation.get(it);
// Build combined ensemble.
{
double[] buf = new double[ensemble.size() + 1];
for (int i = 0; i < dim; i++) {
int j = 0;
for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
buf[j] = relation.get(iter).doubleValue(i);
j++;
}
buf[j] = vec.doubleValue(i);
testensemble[i] = voting.combine(buf, j + 1);
}
}
applyScaling(testensemble, scaling);
NumberVector testvec = DoubleVector.wrap(testensemble);
double newd = wdist.distance(estimated_truth_vec, testvec);
if (newd < oldd) {
System.arraycopy(testensemble, 0, greedyensemble, 0, dim);
ensemble.add(it);
// Recompute heap
break;
} else {
dropped.add(it);
// logger.verbose("Discarding: " + labels.get(bestadd));
if (refine_truth) {
// Update target vectors and weights
ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
// Skip "by label", obviously
if (DBIDUtil.equal(firstid, iditer) || dropped.contains(iditer)) {
continue;
}
iters.add(new DecreasingVectorIter(relation.get(iditer)));
}
if (minvote >= iters.size()) {
minvote = iters.size() - 1;
}
union_outliers = 0;
Arrays.fill(outliers_seen, 0);
while (union_outliers < desired_outliers) {
for (DecreasingVectorIter iter : iters) {
if (!iter.valid()) {
break;
}
int cur = iter.dim();
outliers_seen[cur] += 1;
if (outliers_seen[cur] == minvote) {
union_outliers += 1;
}
iter.advance();
}
}
LOG.warning("New num outliers: " + union_outliers);
updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
estimated_truth_vec = DoubleVector.wrap(estimated_truth);
}
}
}
}
// Build the improved ensemble:
StringBuilder greedylbl = new StringBuilder();
{
for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
if (greedylbl.length() > 0) {
greedylbl.append(' ');
}
greedylbl.append(labels.get(iter));
}
}
DoubleVector greedyvec = DoubleVector.wrap(greedyensemble);
if (refine_truth) {
LOG.verbose("Estimated outliers remaining: " + union_outliers);
}
LOG.verbose("Greedy ensemble (" + ensemble.size() + "): " + greedylbl.toString());
LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
LOG.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")");
// Evaluate the naive ensemble and the "shrunk" ensemble
double naiveauc, naivecost;
{
naiveauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(naivevec));
naivecost = tdist.distance(naivevec, refvec);
LOG.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost);
LOG.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
}
double greedyauc, greedycost;
{
greedyauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(greedyvec));
greedycost = tdist.distance(greedyvec, refvec);
LOG.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost);
LOG.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
}
{
MeanVariance meanauc = new MeanVariance();
MeanVariance meancost = new MeanVariance();
HashSetModifiableDBIDs candidates = DBIDUtil.newHashSet(relation.getDBIDs());
candidates.remove(firstid);
for (int i = 0; i < 1000; i++) {
// Build the improved ensemble:
final double[] randomensemble = new double[dim];
{
DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long) i);
double[] buf = new double[random.size()];
for (int d = 0; d < dim; d++) {
int j = 0;
for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) {
assert (!DBIDUtil.equal(firstid, iter));
final NumberVector vec = relation.get(iter);
buf[j] = vec.doubleValue(d);
j++;
}
randomensemble[d] = voting.combine(buf, j);
}
}
applyScaling(randomensemble, scaling);
NumberVector randomvec = DoubleVector.wrap(randomensemble);
double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(randomvec));
meanauc.put(auc);
double cost = tdist.distance(randomvec, refvec);
meancost.put(cost);
}
LOG.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
LOG.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meanauc.getSampleStddev()));
LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
LOG.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
}
}
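The gain(...) helper logged throughout is not part of this snippet. A sketch consistent with its use here, where 1.0 is the optimum for AUC and 0.0 the optimum for cost; treat the exact formula as an assumption rather than the verbatim ELKI implementation:
// Relative gain of `score` over `ref`, measured against the optimal value:
// 0 when score equals ref, 1 when score is optimal, negative when worse.
static double gain(double score, double ref, double optimal) {
  return 1 - ((optimal - score) / (optimal - ref));
}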
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
The class OPTICSOF, method run:
/**
* Perform OPTICS-based outlier detection.
*
* @param database Database
* @param relation Relation
* @return Outlier detection result
*/
public OutlierResult run(Database database, Relation<O> relation) {
DistanceQuery<O> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
KNNQuery<O> knnQuery = database.getKNNQuery(distQuery, minpts);
RangeQuery<O> rangeQuery = database.getRangeQuery(distQuery);
DBIDs ids = relation.getDBIDs();
// FIXME: implicit preprocessor.
WritableDataStore<KNNList> nMinPts = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, KNNList.class);
WritableDoubleDataStore coreDistance = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
WritableIntegerDataStore minPtsNeighborhoodSize = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, -1);
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
KNNList minptsNeighbours = knnQuery.getKNNForDBID(iditer, minpts);
double d = minptsNeighbours.getKNNDistance();
nMinPts.put(iditer, minptsNeighbours);
coreDistance.putDouble(iditer, d);
minPtsNeighborhoodSize.put(iditer, rangeQuery.getRangeForDBID(iditer, d).size());
}
// Pass 2
WritableDataStore<List<Double>> reachDistance = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, List.class);
WritableDoubleDataStore lrds = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP);
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
List<Double> core = new ArrayList<>();
double lrd = 0;
// TODO: optimize for double distances
for (DoubleDBIDListIter neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
double coreDist = coreDistance.doubleValue(neighbor);
double dist = distQuery.distance(iditer, neighbor);
double rd = MathUtil.max(coreDist, dist);
lrd = rd + lrd;
core.add(rd);
}
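// Local reachability density: the size of the core-distance neighborhood
// divided by the summed reachability distances of the minpts nearest
// neighbors (the same construction as in LOF).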
lrd = minPtsNeighborhoodSize.intValue(iditer) / lrd;
reachDistance.put(iditer, core);
lrds.putDouble(iditer, lrd);
}
// Pass 3
DoubleMinMax ofminmax = new DoubleMinMax();
WritableDoubleDataStore ofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_STATIC);
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
double of = 0;
for (DBIDIter neighbor = nMinPts.get(iditer).iter(); neighbor.valid(); neighbor.advance()) {
double lrd = lrds.doubleValue(iditer);
double lrdN = lrds.doubleValue(neighbor);
of = of + lrdN / lrd;
}
of = of / minPtsNeighborhoodSize.intValue(iditer);
ofs.putDouble(iditer, of);
// update minimum and maximum
ofminmax.put(of);
}
// Build result representation.
DoubleRelation scoreResult = new MaterializedDoubleRelation("OPTICS Outlier Scores", "optics-outlier", ofs, relation.getDBIDs());
OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(ofminmax.getMin(), ofminmax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
return new OutlierResult(scoreMeta, scoreResult);
}
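For reference, the per-object arithmetic of pass 3 in isolation — a minimal sketch over plain arrays, independent of the ELKI data stores used above:
// OF(p) = average over p's minpts-neighbors of lrd(neighbor) / lrd(p);
// roughly 1 for objects inside a cluster, well above 1 for outliers.
static double outlierFactor(double lrd, double[] neighborLrds) {
  double of = 0;
  for (double lrdN : neighborLrds) {
    of += lrdN / lrd; // ratio > 1 when the neighbor sits in a denser region
  }
  return of / neighborLrds.length;
}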
Use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
The class CBLOF, method run:
/**
* Runs the CBLOF algorithm on the given database.
*
* @param database Database to query
* @param relation Data to process
* @return CBLOF outlier result
*/
public OutlierResult run(Database database, Relation<O> relation) {
StepProgress stepprog = LOG.isVerbose() ? new StepProgress("CBLOF", 3) : null;
DBIDs ids = relation.getDBIDs();
LOG.beginStep(stepprog, 1, "Computing clustering.");
Clustering<MeanModel> clustering = clusteringAlgorithm.run(database);
LOG.beginStep(stepprog, 2, "Computing boundary between large and small clusters.");
List<? extends Cluster<MeanModel>> clusters = clustering.getAllClusters();
Collections.sort(clusters, new Comparator<Cluster<MeanModel>>() {
@Override
public int compare(Cluster<MeanModel> o1, Cluster<MeanModel> o2) {
// Sort in descending order by size
return Integer.compare(o2.size(), o1.size());
}
});
int clusterBoundary = getClusterBoundary(relation, clusters);
List<? extends Cluster<MeanModel>> largeClusters = clusters.subList(0, clusterBoundary + 1);
List<? extends Cluster<MeanModel>> smallClusters = clusters.subList(clusterBoundary + 1, clusters.size());
LOG.beginStep(stepprog, 3, "Computing Cluster-Based Local Outlier Factors (CBLOF).");
WritableDoubleDataStore cblofs = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_DB);
DoubleMinMax cblofMinMax = new DoubleMinMax();
computeCBLOFs(relation, distance, cblofs, cblofMinMax, largeClusters, smallClusters);
LOG.setCompleted(stepprog);
DoubleRelation scoreResult = new MaterializedDoubleRelation("Cluster-Based Local Outlier Factor", "cblof-outlier", cblofs, ids);
OutlierScoreMeta scoreMeta = new QuotientOutlierScoreMeta(cblofMinMax.getMin(), cblofMinMax.getMax(), 0.0, Double.POSITIVE_INFINITY, 1.0);
return new OutlierResult(scoreMeta, scoreResult);
}
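getClusterBoundary is not shown here. In the CBLOF paper (He, Xu, Deng 2003), the split between large and small clusters is controlled by two parameters, commonly called alpha and beta; the sketch below assumes this implementation follows the paper, and the parameter names are illustrative:
// Index of the last "large" cluster in sizes[], which must be sorted in
// descending order. alpha: fraction of the data the large clusters must
// cover (e.g. 0.9); beta: minimum size ratio to the next smaller cluster.
static int clusterBoundary(int[] sizes, int total, double alpha, double beta) {
  int covered = 0;
  for (int i = 0; i < sizes.length - 1; i++) {
    covered += sizes[i];
    if (covered >= alpha * total || sizes[i] >= beta * sizes[i + 1]) {
      return i;
    }
  }
  return sizes.length - 1;
}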