use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class COP method run.
/**
 * Process a single relation.
 *
 * For every object, the k + 1 nearest neighbors are queried and the object
 * itself is removed from that set. The centroid and a PCA of the remaining
 * neighbors are computed, and the object's deviation from the centroid is
 * projected onto the eigenvectors. Depending on {@code dist}, the accumulated
 * eigenvalue-scaled squared deviation is evaluated against a chi-squared
 * distribution or against a gamma distribution estimated from the neighbors,
 * and the smallest tail probability over all prefix dimensionalities is
 * normalized with {@code expect} into the final score.
 *
 * If {@code models} is set, an error vector and a dimensionality value
 * ({@code dim + 1 - vdim}) are stored per object and attached to the result.
 *
 * @param relation Relation to process
 * @return Outlier detection result
 */
public OutlierResult run(Relation<V> relation) {
  final DBIDs ids = relation.getDBIDs();
  // Request k + 1 neighbors, because the query object is removed again below.
  KNNQuery<V> knnQuery = QueryUtil.getKNNQuery(relation, getDistanceFunction(), k + 1);
  final int dim = RelationUtil.dimensionality(relation);
  if (k <= dim + 1) {
    LOG.warning("PCA is underspecified with a too low k! k should be much larger than " + dim);
  }
  WritableDoubleDataStore cop_score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC);
  // The model stores are only allocated (and only written) when models are requested.
  WritableDataStore<double[]> cop_err_v = null;
  WritableIntegerDataStore cop_dim = null;
  if (models) {
    cop_err_v = DataStoreUtil.makeStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, double[].class);
    cop_dim = DataStoreUtil.makeIntegerStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_STATIC, -1);
  }
  // compute neighbors of each db object
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Correlation Outlier Probabilities", relation.size(), LOG) : null;
  for (DBIDIter id = ids.iter(); id.valid(); id.advance()) {
    KNNList neighbors = knnQuery.getKNNForDBID(id, k + 1);
    ModifiableDBIDs nids = DBIDUtil.newHashSet(neighbors);
    // Do not use query object
    nids.remove(id);
    double[] centroid = Centroid.make(relation, nids).getArrayRef();
    // Deviation of the query object from the neighborhood centroid.
    double[] relative = minusEquals(relation.get(id).toArray(), centroid);
    PCAResult pcares = pca.processIds(nids, relation);
    double[][] evecs = pcares.getEigenvectors();
    // Deviation expressed in the eigenvector basis.
    double[] projected = transposeTimes(evecs, relative);
    double[] evs = pcares.getEigenvalues();
    // Smallest tail probability seen so far, and the number of leading
    // components (1-based) at which it was observed.
    double min = Double.POSITIVE_INFINITY;
    int vdim = dim;
    switch(dist) {
    case CHISQUARED:
      {
        double sqdevs = 0;
        for (int d = 0; d < dim; d++) {
          // Scale with Stddev
          double dev = projected[d];
          // Accumulate
          sqdevs += dev * dev / evs[d];
          // Evaluate against a chi-squared distribution with d + 1 degrees of freedom
          double score = 1 - ChiSquaredDistribution.cdf(sqdevs, d + 1);
          if (score < min) {
            min = score;
            vdim = d + 1;
          }
        }
        break;
      }
    case GAMMA:
      {
        // dists[d][j]: accumulated scaled squared deviation of neighbor j
        // over the components 0..d.
        double[][] dists = new double[dim][nids.size()];
        int j = 0;
        double[] srel = new double[dim];
        for (DBIDIter s = nids.iter(); s.valid() && j < nids.size(); s.advance()) {
          V vec = relation.get(s);
          for (int d = 0; d < dim; d++) {
            srel[d] = vec.doubleValue(d) - centroid[d];
          }
          double[] serr = transposeTimes(evecs, srel);
          double sqdist = 0.0;
          for (int d = 0; d < dim; d++) {
            double serrd = serr[d];
            sqdist += serrd * serrd / evs[d];
            dists[d][j] = sqdist;
          }
          j++;
        }
        double sqdevs = 0;
        for (int d = 0; d < dim; d++) {
          // Scale with Stddev
          final double dev = projected[d];
          // Accumulate
          sqdevs += dev * dev / evs[d];
          // Sort, so we can trim the top 15% below.
          Arrays.sort(dists[d]);
          // Evaluate against the gamma distribution estimated from the neighbors
          double score = 1 - GammaChoiWetteEstimator.STATIC.estimate(dists[d], SHORTENED_ARRAY).cdf(sqdevs);
          if (score < min) {
            min = score;
            vdim = d + 1;
          }
        }
        break;
      }
    }
    // Normalize the value
    final double prob = expect * (1 - min) / (expect + min);
    // Construct the error vector: drop the components from index vdim on,
    // map back with the eigenvectors, and scale by -prob.
    for (int d = vdim; d < dim; d++) {
      projected[d] = 0.;
    }
    double[] ev = timesEquals(times(evecs, projected), -1 * prob);
    cop_score.putDouble(id, prob);
    if (models) {
      cop_err_v.put(id, ev);
      cop_dim.putInt(id, dim + 1 - vdim);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  // combine results.
  DoubleRelation scoreResult = new MaterializedDoubleRelation("Correlation Outlier Probabilities", COP_SCORES, cop_score, ids);
  OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore();
  OutlierResult result = new OutlierResult(scoreMeta, scoreResult);
  if (models) {
    result.addChildResult(new MaterializedRelation<>("Local Dimensionality", COP_DIM, TypeUtil.INTEGER, cop_dim, ids));
    result.addChildResult(new MaterializedRelation<>("Error vectors", COP_ERRORVEC, TypeUtil.DOUBLE_ARRAY, cop_err_v, ids));
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class SharedNearestNeighborPreprocessor method initialize.
/**
 * Fills {@code storage} with one neighbor array per object of the relation.
 *
 * For each object, the {@code numberOfNeighbors} nearest neighbors are
 * queried, copied into an array that is cut off once it holds
 * {@code numberOfNeighbors} ids, sorted, and stored. The query object is not
 * filtered out of its own neighbor list.
 */
@Override
public void initialize() {
  final boolean verbose = getLogger().isVerbose();
  if (verbose) {
    getLogger().verbose("Assigning nearest neighbor lists to database objects");
  }
  storage = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, ArrayDBIDs.class);
  KNNQuery<O> query = QueryUtil.getKNNQuery(relation, distanceFunction, numberOfNeighbors);

  // Progress is only tracked when verbose logging is on.
  FiniteProgress tracker = null;
  if (verbose) {
    tracker = new FiniteProgress("assigning nearest neighbor lists", relation.size(), getLogger());
  }

  DBIDIter object = relation.iterDBIDs();
  while (object.valid()) {
    DBIDs found = query.getKNNForDBID(object, numberOfNeighbors);
    ArrayModifiableDBIDs sorted = DBIDUtil.newArray(numberOfNeighbors);
    DBIDIter nn = found.iter();
    while (nn.valid()) {
      sorted.add(nn);
      // Size limitation to exactly numberOfNeighbors
      if (sorted.size() >= numberOfNeighbors) {
        break;
      }
      nn.advance();
    }
    sorted.sort();
    storage.put(object, sorted);
    getLogger().incrementProcessed(tracker);
    object.advance();
  }
  getLogger().ensureCompleted(tracker);
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class MaterializedKNNAndRKNNPreprocessorTest method testPreprocessor.
/**
 * Compares the kNN and RkNN queries of a materializing preprocessor against
 * linear-scan reference queries: on the initial data, after inserting
 * {@code updatesize} random vectors, and after deleting those vectors again.
 */
@Test
public void testPreprocessor() {
  UpdatableDatabase db;
  // get database
  try (InputStream is = AbstractSimpleAlgorithmTest.open(dataset)) {
    ListParameterization params = new ListParameterization();
    // Setup parser and data loading
    NumberVectorLabelParser<DoubleVector> parser = new NumberVectorLabelParser<>(DoubleVector.FACTORY);
    InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, new ArrayList<>(), parser);
    // We want to allow the use of indexes via "params"
    params.addParameter(AbstractDatabase.Parameterizer.DATABASE_CONNECTION_ID, dbc);
    db = ClassGenericsUtil.parameterizeOrAbort(HashmapDatabase.class, params);
    db.initialize();
  } catch (IOException e) {
    fail("Test data " + dataset + " not found.");
    // Needed for definite assignment of db; fail() is not known to the compiler to throw.
    return;
  }
  Relation<DoubleVector> rep = db.getRelation(TypeUtil.DOUBLE_VECTOR_FIELD);
  DistanceQuery<DoubleVector> distanceQuery = db.getDistanceQuery(rep, EuclideanDistanceFunction.STATIC);
  // verify data set size.
  assertEquals("Data set size doesn't match parameters.", shoulds, rep.size());
  // get linear queries
  LinearScanDistanceKNNQuery<DoubleVector> lin_knn_query = new LinearScanDistanceKNNQuery<>(distanceQuery);
  LinearScanRKNNQuery<DoubleVector> lin_rknn_query = new LinearScanRKNNQuery<>(distanceQuery, lin_knn_query, k);
  // get preprocessed queries
  MaterializeKNNAndRKNNPreprocessor<DoubleVector> preproc = new MaterializeKNNAndRKNNPreprocessor<>(rep, distanceQuery.getDistanceFunction(), k);
  KNNQuery<DoubleVector> preproc_knn_query = preproc.getKNNQuery(distanceQuery, k);
  RKNNQuery<DoubleVector> preproc_rknn_query = preproc.getRKNNQuery(distanceQuery);
  // add as index
  db.getHierarchy().add(rep, preproc);
  // The preprocessor must not silently hand back the linear-scan
  // implementations; each query is checked against its own linear-scan class.
  assertFalse("Preprocessor knn query class incorrect.", preproc_knn_query instanceof LinearScanDistanceKNNQuery);
  assertFalse("Preprocessor rknn query class incorrect.", preproc_rknn_query instanceof LinearScanRKNNQuery);
  // test queries
  testKNNQueries(rep, lin_knn_query, preproc_knn_query, k);
  testRKNNQueries(rep, lin_rknn_query, preproc_rknn_query, k);
  // also test partial queries, forward only
  testKNNQueries(rep, lin_knn_query, preproc_knn_query, k / 2);
  // insert new objects
  List<DoubleVector> insertions = new ArrayList<>();
  NumberVector.Factory<DoubleVector> o = RelationUtil.getNumberVectorFactory(rep);
  int dim = RelationUtil.dimensionality(rep);
  Random random = new Random(seed);
  for (int i = 0; i < updatesize; i++) {
    DoubleVector obj = VectorUtil.randomVector(o, dim, random);
    insertions.add(obj);
  }
  // Keep the ids returned by the insertion, so exactly these objects can be deleted again.
  DBIDs insertedIds = db.insert(MultipleObjectsBundle.makeSimple(rep.getDataTypeInformation(), insertions));
  // test queries
  testKNNQueries(rep, lin_knn_query, preproc_knn_query, k);
  testRKNNQueries(rep, lin_rknn_query, preproc_rknn_query, k);
  // delete objects
  db.delete(insertedIds);
  // test queries
  testKNNQueries(rep, lin_knn_query, preproc_knn_query, k);
  testRKNNQueries(rep, lin_rknn_query, preproc_rknn_query, k);
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class SilhouetteOutlierDetection method run.
/**
 * Runs the configured clustering algorithm and uses the silhouette
 * coefficient of each object as its outlier score.
 *
 * For each object, {@code a} is the mean distance to the other members of
 * its own cluster and {@code min} is the smallest mean distance to any other
 * cluster; the score is {@code (min - a) / max(min, a)}. Noise clusters are
 * handled according to {@code noiseOption}. Clusters with a single member
 * always receive the score 0, because their in-cluster mean distance is
 * undefined (0 / 0).
 *
 * @param database Database to process
 * @return Outlier result holding the silhouette scores
 */
@Override
public OutlierResult run(Database database) {
  Relation<O> relation = database.getRelation(getDistanceFunction().getInputTypeRestriction());
  DistanceQuery<O> dq = database.getDistanceQuery(relation, getDistanceFunction());
  // TODO: improve ELKI api to ensure we're using the same DBIDs!
  Clustering<?> c = clusterer.run(database);
  WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB);
  DoubleMinMax mm = new DoubleMinMax();
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  for (Cluster<?> cluster : clusters) {
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
      case TREAT_NOISE_AS_SINGLETONS:
        // As suggested in Rousseeuw, we use 0 for singletons.
        for (DBIDIter iter = cluster.getIDs().iter(); iter.valid(); iter.advance()) {
          scores.put(iter, 0.);
        }
        mm.put(0.);
        continue;
      case MERGE_NOISE:
        // Treat as cluster below
        break;
      }
    }
    ArrayDBIDs ids = DBIDUtil.ensureArray(cluster.getIDs());
    if (ids.size() == 1) {
      // A singleton reaching this point (MERGE_NOISE) would yield
      // a = 0 / 0 = NaN below; use the same singleton convention as above.
      scores.put(ids.iter(), 0.);
      mm.put(0.);
      continue;
    }
    // temporary storage: as[j] collects the distances from already visited
    // members to member j, so each pair distance is computed only once.
    double[] as = new double[ids.size()];
    DBIDArrayIter it1 = ids.iter(), it2 = ids.iter();
    for (it1.seek(0); it1.valid(); it1.advance()) {
      // a: In-cluster distances
      // Already computed distances
      double a = as[it1.getOffset()];
      for (it2.seek(it1.getOffset() + 1); it2.valid(); it2.advance()) {
        final double dist = dq.distance(it1, it2);
        a += dist;
        as[it2.getOffset()] += dist;
      }
      a /= (ids.size() - 1);
      // b: other clusters:
      double min = Double.POSITIVE_INFINITY;
      for (Cluster<?> ocluster : clusters) {
        if (ocluster == /* yes, reference identity */
        cluster) {
          continue;
        }
        if (ocluster.isNoise()) {
          switch(noiseOption) {
          case IGNORE_NOISE:
            continue;
          case MERGE_NOISE:
            // No special treatment
            break;
          case TREAT_NOISE_AS_SINGLETONS:
            // Treat noise cluster as singletons: every noise object is its
            // own "cluster", so its plain distance competes for the minimum.
            for (DBIDIter it3 = ocluster.getIDs().iter(); it3.valid(); it3.advance()) {
              double dist = dq.distance(it1, it3);
              if (dist < min) {
                min = dist;
              }
            }
            continue;
          }
        }
        // Mean distance to all members of the other cluster.
        final DBIDs oids = ocluster.getIDs();
        double b = 0.;
        for (DBIDIter it3 = oids.iter(); it3.valid(); it3.advance()) {
          b += dq.distance(it1, it3);
        }
        b /= oids.size();
        if (b < min) {
          min = b;
        }
      }
      final double score = (min - a) / Math.max(min, a);
      scores.put(it1, score);
      mm.put(score);
    }
  }
  // Build result representation.
  DoubleRelation scoreResult = new MaterializedDoubleRelation("Silhouette Coefficients", "silhouette-outlier", scores, relation.getDBIDs());
  OutlierScoreMeta scoreMeta = new InvertedOutlierScoreMeta(mm.getMin(), mm.getMax(), -1., 1., .5);
  return new OutlierResult(scoreMeta, scoreResult);
}
use of de.lmu.ifi.dbs.elki.database.ids.DBIDs in project elki by elki-project.
the class OutRankS1 method run.
/**
 * Runs the configured subspace clustering algorithm and scores each object
 * by the clusters it belongs to.
 *
 * Every object starts at score 0. For each cluster containing the object,
 * {@code alpha * relsize + (1 - alpha) * reldim} is added, where
 * {@code relsize} is the cluster size relative to the largest cluster and
 * {@code reldim} is the cluster's subspace dimensionality relative to the
 * largest one. The minimum and maximum reported in the score meta are taken
 * over the final scores of all objects, including objects that belong to no
 * cluster.
 *
 * @param database Database to process
 * @return Outlier result, with the clustering attached as child result
 */
@Override
public OutlierResult run(Database database) {
  DBIDs ids = database.getRelation(TypeUtil.ANY).getDBIDs();
  // Run the primary algorithm
  Clustering<? extends SubspaceModel> clustering = clusteralg.run(database);
  WritableDoubleDataStore score = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT);
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    score.putDouble(iter, 0);
  }
  int maxdim = 0, maxsize = 0;
  // Find maximum dimensionality and cluster size
  for (Cluster<? extends SubspaceModel> cluster : clustering.getAllClusters()) {
    maxsize = Math.max(maxsize, cluster.size());
    maxdim = Math.max(maxdim, BitsUtil.cardinality(cluster.getModel().getDimensions()));
  }
  // Iterate over all clusters, accumulating the per-cluster contributions:
  for (Cluster<? extends SubspaceModel> cluster : clustering.getAllClusters()) {
    double relsize = cluster.size() / (double) maxsize;
    double reldim = BitsUtil.cardinality(cluster.getModel().getDimensions()) / (double) maxdim;
    // Process objects in the cluster
    for (DBIDIter iter = cluster.getIDs().iter(); iter.valid(); iter.advance()) {
      double newscore = score.doubleValue(iter) + alpha * relsize + (1 - alpha) * reldim;
      score.putDouble(iter, newscore);
    }
  }
  // Determine the score range only after accumulation has finished, so that
  // partial sums of objects in several clusters are not recorded and
  // objects outside every cluster (score 0) are included.
  DoubleMinMax minmax = new DoubleMinMax();
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    minmax.put(score.doubleValue(iter));
  }
  DoubleRelation scoreResult = new MaterializedDoubleRelation("OutRank-S1", "OUTRANK_S1", score, ids);
  OutlierScoreMeta meta = new InvertedOutlierScoreMeta(minmax.getMin(), minmax.getMax(), 0, Double.POSITIVE_INFINITY);
  OutlierResult res = new OutlierResult(meta, scoreResult);
  res.addChildResult(clustering);
  return res;
}
Aggregations