use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.
the class MaterializeKNNAndRKNNPreprocessor method objectsRemoved.
@Override
protected void objectsRemoved(DBIDs ids) {
StepProgress stepprog = getLogger().isVerbose() ? new StepProgress(3) : null;
// For debugging: valid DBIDs still in the database.
final DBIDs valid = DBIDUtil.ensureSet(distanceQuery.getRelation().getDBIDs());
ArrayDBIDs aids = DBIDUtil.ensureArray(ids);
// delete the materialized (old) kNNs and RkNNs
getLogger().beginStep(stepprog, 1, "New deletions occurred, remove their materialized kNNs and RkNNs.");
// Temporary storage of removed lists
List<KNNList> kNNs = new ArrayList<>(ids.size());
List<TreeSet<DoubleDBIDPair>> rkNNs = new ArrayList<>(ids.size());
for (DBIDIter iter = aids.iter(); iter.valid(); iter.advance()) {
kNNs.add(storage.get(iter));
for (DBIDIter it = storage.get(iter).iter(); it.valid(); it.advance()) {
if (!valid.contains(it) && !ids.contains(it)) {
LOG.warning("False kNN: " + it);
}
}
storage.delete(iter);
rkNNs.add(materialized_RkNN.get(iter));
for (DoubleDBIDPair it : materialized_RkNN.get(iter)) {
if (!valid.contains(it) && !ids.contains(it)) {
LOG.warning("False RkNN: " + it);
}
}
materialized_RkNN.delete(iter);
}
// Keep only those IDs not also removed
ArrayDBIDs kNN_ids = affectedkNN(kNNs, aids);
ArrayDBIDs rkNN_ids = affectedRkNN(rkNNs, aids);
// update the affected kNNs and RkNNs
getLogger().beginStep(stepprog, 2, "New deletions occurred, update the affected kNNs and RkNNs.");
// Recompute the kNN for affected objects (in rkNN lists)
{
List<? extends KNNList> kNNList = knnQuery.getKNNForBulkDBIDs(rkNN_ids, k);
int i = 0;
for (DBIDIter reknn = rkNN_ids.iter(); reknn.valid(); reknn.advance(), i++) {
if (kNNList.get(i) == null && !valid.contains(reknn)) {
LOG.warning("BUG in online kNN/RkNN maintenance: " + DBIDUtil.toString(reknn) + " no longer in database.");
continue;
}
assert (kNNList.get(i) != null);
storage.put(reknn, kNNList.get(i));
for (DoubleDBIDListIter it = kNNList.get(i).iter(); it.valid(); it.advance()) {
materialized_RkNN.get(it).add(makePair(it, reknn));
}
}
}
// remove objects from RkNNs of objects (in kNN lists)
{
SetDBIDs idsSet = DBIDUtil.ensureSet(ids);
for (DBIDIter nn = kNN_ids.iter(); nn.valid(); nn.advance()) {
TreeSet<DoubleDBIDPair> rkNN = materialized_RkNN.get(nn);
for (Iterator<DoubleDBIDPair> it = rkNN.iterator(); it.hasNext(); ) {
if (idsSet.contains(it.next())) {
it.remove();
}
}
}
}
// inform listener
getLogger().beginStep(stepprog, 3, "New deletions occurred, inform listeners.");
fireKNNsRemoved(ids, rkNN_ids);
getLogger().ensureCompleted(stepprog);
}
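The reason SetDBIDs appears here is the membership tests: valid.contains(it) and ids.contains(it) run once per kNN and RkNN entry, so the DBIDs are first converted with DBIDUtil.ensureSet. The stand-alone sketch below isolates that pattern; the class ValidityCheckSketch, its method name, and its parameters are illustrative inventions, while the DBIDUtil and SetDBIDs calls follow the ELKI API used above.
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDUtil;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;
import de.lmu.ifi.dbs.elki.database.relation.Relation;

public final class ValidityCheckSketch {
  /** Count kNN entries that reference neither a live object nor a just-deleted one. */
  public static int countStaleReferences(Relation<?> relation, DBIDs knnMembers, DBIDs deleted) {
    // ensureSet returns a SetDBIDs (converting only if necessary) with fast contains().
    SetDBIDs valid = DBIDUtil.ensureSet(relation.getDBIDs());
    SetDBIDs removed = DBIDUtil.ensureSet(deleted);
    int stale = 0;
    for (DBIDIter it = knnMembers.iter(); it.valid(); it.advance()) {
      if (!valid.contains(it) && !removed.contains(it)) {
        stale++; // Stale reference: this is what triggers the "False kNN" warning above.
      }
    }
    return stale;
  }
}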
use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.
the class MaterializeKNNPreprocessor method updateKNNsAfterDeletion.
/**
* Updates the kNNs of the RkNNs of the specified ids.
*
* @param ids the ids of deleted objects causing a change of materialized kNNs
* @return the RkNNs of the specified ids, i.e. the kNNs which have been
* updated
*/
private ArrayDBIDs updateKNNsAfterDeletion(DBIDs ids) {
SetDBIDs idsSet = DBIDUtil.ensureSet(ids);
ArrayModifiableDBIDs rkNN_ids = DBIDUtil.newArray();
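// Scan all materialized kNN lists: any object whose list references a deleted id is a reverse kNN of the deletion and must have its kNNs recomputed.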
for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
KNNList kNNs = storage.get(iditer);
for (DBIDIter it = kNNs.iter(); it.valid(); it.advance()) {
if (idsSet.contains(it)) {
rkNN_ids.add(iditer);
break;
}
}
}
// update the kNNs of the RkNNs
List<? extends KNNList> kNNList = knnQuery.getKNNForBulkDBIDs(rkNN_ids, k);
DBIDIter iter = rkNN_ids.iter();
for (int i = 0; i < rkNN_ids.size(); i++, iter.advance()) {
storage.put(iter, kNNList.get(i));
}
return rkNN_ids;
}
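The second half of the method recomputes all affected lists with a single bulk kNN query and writes them back into the store. Below is a minimal, self-contained sketch of just that step; the class name BulkKNNRefresh and the helper signature are illustrative, while KNNQuery.getKNNForBulkDBIDs and WritableDataStore.put are the same ELKI calls used in the method above.
import java.util.List;

import de.lmu.ifi.dbs.elki.database.datastore.WritableDataStore;
import de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs;
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.KNNList;
import de.lmu.ifi.dbs.elki.database.query.knn.KNNQuery;

final class BulkKNNRefresh {
  /** Re-materialize the kNN lists of the given ids with a single bulk query. */
  static <O> void refresh(KNNQuery<O> knnQuery, WritableDataStore<KNNList> storage, ArrayDBIDs ids, int k) {
    List<? extends KNNList> lists = knnQuery.getKNNForBulkDBIDs(ids, k);
    DBIDIter iter = ids.iter();
    for (int i = 0; i < ids.size(); i++, iter.advance()) {
      storage.put(iter, lists.get(i)); // Overwrite the stale kNN list.
    }
  }
}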
use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.
the class P3C method run.
/**
* Performs the P3C algorithm on the given Database.
*/
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
final int dim = RelationUtil.dimensionality(relation);
// Overall progress.
StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
if (stepProgress != null) {
stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
}
// Desired number of bins, as per Sturges' rule (see the worked example after this method):
final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
// Perform 1-dimensional projections, and split into bins.
SetDBIDs[][] partitions = partitionData(relation, binCount);
if (stepProgress != null) {
stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
}
// Set markers for each attribute until they're all deemed uniform.
final long[][] markers = new long[dim][];
for (int d = 0; d < dim; d++) {
final SetDBIDs[] parts = partitions[d];
if (parts == null) {
// Never mark any on constant dimensions.
continue;
}
final long[] marked = markers[d] = BitsUtil.zero(binCount);
int card = 0;
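// Greedily mark the bin with the largest support until the remaining, unmarked bins look uniform.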
while (card < dim - 1) {
// Find the bin with the largest support, testing only bins that were not
// previously marked.
int bestBin = chiSquaredUniformTest(parts, marked, card);
if (bestBin < 0) {
// Uniform
break;
}
BitsUtil.setI(marked, bestBin);
card++;
}
if (LOG.isDebugging()) {
LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
}
}
if (stepProgress != null) {
stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
}
ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
if (stepProgress != null) {
stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
}
ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
if (stepProgress != null) {
stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
}
clusterCores = pruneRedundantClusterCores(clusterCores);
if (LOG.isVerbose()) {
LOG.verbose("Number of cluster cores found: " + clusterCores.size());
}
if (clusterCores.isEmpty()) {
LOG.setCompleted(stepProgress);
Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
return c;
}
if (stepProgress != null) {
stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
}
// Track objects not assigned to any cluster:
ModifiableDBIDs noise = DBIDUtil.newHashSet();
WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
int k = clusterCores.size();
List<MultivariateGaussianModel> models = new ArrayList<>(k);
computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
// Initial estimate of covariances, to assign noise objects
EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
assignUnassigned(relation, probClusterIGivenX, models, noise);
double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
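// EM refinement; a negative maxEmIterations means no upper bound on the number of iterations.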
for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
final double emOld = emNew;
EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
// reassign probabilities
emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
if (LOG.isVerbose()) {
LOG.verbose("iteration " + it + " - expectation value: " + emNew);
}
if ((emNew - emOld) <= emDelta) {
break;
}
}
if (stepProgress != null) {
stepProgress.beginStep(7, "Generating hard clustering.", LOG);
}
// Create a hard clustering, making sure each data point only is part of one
// cluster, based on the best match from the membership matrix.
ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
if (stepProgress != null) {
stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
}
// Outlier detection. Remove points from clusters that have a Mahalanobis
// distance larger than the critical value of the ChiSquare distribution.
findOutliers(relation, models, clusterCandidates, noise);
if (stepProgress != null) {
stepProgress.beginStep(9, "Removing empty clusters.", LOG);
}
// Remove near-empty clusters.
for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
ClusterCandidate cand = it.next();
final int size = cand.ids.size();
if (size < minClusterSize) {
if (size > 0) {
noise.addDBIDs(cand.ids);
}
it.remove();
}
}
if (LOG.isVerbose()) {
LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
}
if (stepProgress != null) {
stepProgress.beginStep(10, "Generating final result.", LOG);
}
// Generate final output.
Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
ClusterCandidate candidate = clusterCandidates.get(cluster);
CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
}
LOG.verbose("Noise size: " + noise.size());
if (noise.size() > 0) {
result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
}
LOG.ensureCompleted(stepProgress);
return result;
}
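Step 1 above sizes the grid with Sturges' rule, binCount = ceil(1 + log2(n)). As a quick worked instance: for n = 1000 objects this gives ceil(1 + 9.97) = 11 bins. The tiny snippet below (a hypothetical stand-alone helper; MathUtil.log2 is the same ELKI utility used above) reproduces that computation.
import de.lmu.ifi.dbs.elki.math.MathUtil;

final class SturgesBinCount {
  /** Sturges' rule: ceil(1 + log2(n)) bins for n objects. */
  static int binCount(int n) {
    return (int) Math.ceil(1 + MathUtil.log2(n));
  }

  public static void main(String[] args) {
    System.out.println(binCount(1000)); // prints 11
  }
}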
use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.
the class P3C method partitionData.
/**
* Partition the data set into {@code bins} bins in each dimension
* <i>independently</i>.
*
* This can be used to construct a grid approximation of the data using O(d n)
* memory.
*
* When a dimension is found to be constant, it will not be partitioned, but
* instead the corresponding array will be set to {@code null}.
*
* @param relation Data relation to partition
* @param bins Number of bins
* @return Partitions of each dimension.
*/
private SetDBIDs[][] partitionData(final Relation<V> relation, final int bins) {
final int dim = RelationUtil.dimensionality(relation);
SetDBIDs[][] partitions = new SetDBIDs[dim][bins];
ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
// will be reused.
DBIDArrayIter iter = ids.iter();
SortDBIDsBySingleDimension sorter = new VectorUtil.SortDBIDsBySingleDimension(relation, 0);
for (int d = 0; d < dim; d++) {
sorter.setDimension(d);
ids.sort(sorter);
// Minimum:
iter.seek(0);
double min = relation.get(iter).doubleValue(d);
// Extend:
iter.seek(ids.size() - 1);
double delta = (relation.get(iter).doubleValue(d) - min) / bins;
if (delta > 0.) {
SetDBIDs[] dimparts = partitions[d];
double split = min + delta;
HashSetModifiableDBIDs pids = DBIDUtil.newHashSet();
dimparts[0] = pids;
int i = 0;
for (iter.seek(0); iter.valid();) {
final double v = relation.get(iter).doubleValue(d);
if (v <= split || i == dimparts.length - 1) {
pids.add(iter);
iter.advance();
} else {
// Do not advance here: re-test the current object against the next bin.
i++;
split += delta;
pids = DBIDUtil.newHashSet();
dimparts[i] = pids;
}
}
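// Point any bins that were never reached at the last set, so that no slot stays null for a non-constant dimension.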
for (++i; i < dimparts.length; ++i) {
dimparts[i] = pids;
}
} else {
// Flag whole dimension as bad
partitions[d] = null;
}
}
return partitions;
}
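A hedged sketch of how the returned SetDBIDs[][] grid can be consumed, for example to read a bin's support or to test whether an object falls into a bin. The helper names are illustrative; the null check mirrors the constant-dimension convention documented above.
import de.lmu.ifi.dbs.elki.database.ids.DBIDRef;
import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;

final class GridAccessSketch {
  /** Support (object count) of bin b in dimension d, or 0 for constant dimensions. */
  static int support(SetDBIDs[][] partitions, int d, int b) {
    return partitions[d] == null ? 0 : partitions[d][b].size();
  }

  /** Does the object fall into bin b of dimension d? */
  static boolean inBin(SetDBIDs[][] partitions, int d, int b, DBIDRef obj) {
    return partitions[d] != null && partitions[d][b].contains(obj);
  }
}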
use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.
the class OutlierROCCurve method processNewResult.
@Override
public void processNewResult(ResultHierarchy hier, Result result) {
Database db = ResultUtil.findDatabase(hier);
// Prepare
SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName));
if (positiveids.size() == 0) {
LOG.warning("Computing a ROC curve failed - no objects matched.");
return;
}
boolean nonefound = true;
List<OutlierResult> oresults = OutlierResult.getOutlierResults(result);
List<OrderingResult> orderings = ResultUtil.getOrderingResults(result);
// Outlier results are the main use case.
for (OutlierResult o : oresults) {
ROCResult rocres = computeROCResult(o.getScores().size(), positiveids, o);
db.getHierarchy().add(o, rocres);
EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), o, "Evaluation of ranking", "ranking-evaluation");
MeasurementGroup g = ev.findOrCreateGroup("Evaluation measures");
if (!g.hasMeasure(ROCAUC_LABEL)) {
g.addMeasure(ROCAUC_LABEL, rocres.auc, 0., 1., false);
}
// Process each ordering only once.
orderings.remove(o.getOrdering());
nonefound = false;
}
// Evaluate any remaining orderings of the database IDs.
for (OrderingResult or : orderings) {
DBIDs sorted = or.order(or.getDBIDs());
ROCResult rocres = computeROCResult(or.getDBIDs().size(), positiveids, sorted);
db.getHierarchy().add(or, rocres);
EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), or, "Evaluation of ranking", "ranking-evaluation");
MeasurementGroup g = ev.findOrCreateGroup("Evaluation measures");
if (!g.hasMeasure(ROCAUC_LABEL)) {
g.addMeasure(ROCAUC_LABEL, rocres.auc, 0., 1., false);
}
nonefound = false;
}
if (nonefound) {
return;
// logger.warning("No results found to process with ROC curve analyzer.
// Got "+iterables.size()+" iterables, "+orderings.size()+" orderings.");
}
}
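computeROCResult itself is not shown here, so the following is only an assumption-labelled sketch of why the ground-truth positives are kept as a SetDBIDs: each object of a ranking can be checked with a fast contains() while sweeping the ranking. All class, method, and parameter names are illustrative.
import de.lmu.ifi.dbs.elki.database.ids.DBIDIter;
import de.lmu.ifi.dbs.elki.database.ids.DBIDs;
import de.lmu.ifi.dbs.elki.database.ids.SetDBIDs;

final class RankingSweepSketch {
  /** Count how many of the first {@code prefix} ranked objects are labeled positive. */
  static int truePositivesInPrefix(DBIDs ranking, SetDBIDs positiveids, int prefix) {
    int tp = 0, seen = 0;
    for (DBIDIter it = ranking.iter(); it.valid() && seen < prefix; it.advance(), seen++) {
      if (positiveids.contains(it)) {
        tp++;
      }
    }
    return tp;
  }
}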
Aggregations