Search in sources :

Example 1 with SetDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.

In the class MaterializeKNNAndRKNNPreprocessor, the method objectsRemoved:

/**
 * Handle the removal of objects: drop their materialized kNN and RkNN lists,
 * recompute the kNN of all affected objects (those that had a removed object
 * in their kNN list), and purge removed ids from the remaining RkNN sets.
 *
 * Fires a KNNs-removed event for listeners at the end.
 *
 * @param ids the DBIDs of the removed objects
 */
@Override
protected void objectsRemoved(DBIDs ids) {
    StepProgress stepprog = getLogger().isVerbose() ? new StepProgress(3) : null;
    // For debugging: valid DBIDs still in the database.
    final DBIDs valid = DBIDUtil.ensureSet(distanceQuery.getRelation().getDBIDs());
    ArrayDBIDs aids = DBIDUtil.ensureArray(ids);
    // delete the materialized (old) kNNs and RkNNs
    // FIX: log messages previously misspelled "occurred" as "ocurred".
    getLogger().beginStep(stepprog, 1, "New deletions occurred, remove their materialized kNNs and RkNNs.");
    // Temporary storage of removed lists
    List<KNNList> kNNs = new ArrayList<>(ids.size());
    List<TreeSet<DoubleDBIDPair>> rkNNs = new ArrayList<>(ids.size());
    for (DBIDIter iter = aids.iter(); iter.valid(); iter.advance()) {
        kNNs.add(storage.get(iter));
        // Sanity check: every kNN entry must be either still valid or part of this removal.
        for (DBIDIter it = storage.get(iter).iter(); it.valid(); it.advance()) {
            if (!valid.contains(it) && !ids.contains(it)) {
                LOG.warning("False kNN: " + it);
            }
        }
        storage.delete(iter);
        rkNNs.add(materialized_RkNN.get(iter));
        // Same sanity check for the RkNN side.
        for (DoubleDBIDPair it : materialized_RkNN.get(iter)) {
            if (!valid.contains(it) && !ids.contains(it)) {
                LOG.warning("False RkNN: " + it);
            }
        }
        materialized_RkNN.delete(iter);
    }
    // Keep only those IDs not also removed
    ArrayDBIDs kNN_ids = affectedkNN(kNNs, aids);
    ArrayDBIDs rkNN_ids = affectedRkNN(rkNNs, aids);
    // update the affected kNNs and RkNNs
    getLogger().beginStep(stepprog, 2, "New deletions occurred, update the affected kNNs and RkNNs.");
    // Recompute the kNN for affected objects (in rkNN lists)
    {
        List<? extends KNNList> kNNList = knnQuery.getKNNForBulkDBIDs(rkNN_ids, k);
        int i = 0;
        for (DBIDIter reknn = rkNN_ids.iter(); reknn.valid(); reknn.advance(), i++) {
            if (kNNList.get(i) == null && !valid.contains(reknn)) {
                // FIX: previously misspelled "maintenance" as "maintainance".
                LOG.warning("BUG in online kNN/RkNN maintenance: " + DBIDUtil.toString(reknn) + " no longer in database.");
                continue;
            }
            assert (kNNList.get(i) != null);
            storage.put(reknn, kNNList.get(i));
            // Register the recomputed neighbors in the reverse (RkNN) index.
            for (DoubleDBIDListIter it = kNNList.get(i).iter(); it.valid(); it.advance()) {
                materialized_RkNN.get(it).add(makePair(it, reknn));
            }
        }
    }
    // remove objects from RkNNs of objects (in kNN lists)
    {
        SetDBIDs idsSet = DBIDUtil.ensureSet(ids);
        for (DBIDIter nn = kNN_ids.iter(); nn.valid(); nn.advance()) {
            TreeSet<DoubleDBIDPair> rkNN = materialized_RkNN.get(nn);
            // Iterator.remove is required: we delete while iterating the TreeSet.
            for (Iterator<DoubleDBIDPair> it = rkNN.iterator(); it.hasNext(); ) {
                if (idsSet.contains(it.next())) {
                    it.remove();
                }
            }
        }
    }
    // inform listener
    getLogger().beginStep(stepprog, 3, "New deletions occurred, inform listeners.");
    fireKNNsRemoved(ids, rkNN_ids);
    getLogger().ensureCompleted(stepprog);
}
Also used : DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) ArrayList(java.util.ArrayList) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) TreeSet(java.util.TreeSet) DoubleDBIDPair(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDPair) ArrayDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) List(java.util.List) DoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs)

Example 2 with SetDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.

In the class MaterializeKNNPreprocessor, the method updateKNNsAfterDeletion:

/**
 * Updates the kNNs of the RkNNs of the specified ids.
 *
 * @param ids the ids of deleted objects causing a change of materialized kNNs
 * @return the RkNNs of the specified ids, i.e. the kNNs which have been
 *         updated
 */
private ArrayDBIDs updateKNNsAfterDeletion(DBIDs ids) {
    SetDBIDs deleted = DBIDUtil.ensureSet(ids);
    ArrayModifiableDBIDs affected = DBIDUtil.newArray();
    // Scan every materialized kNN list; an object is affected as soon as
    // any of its neighbors was among the deleted ids.
    for (DBIDIter candidate = relation.iterDBIDs(); candidate.valid(); candidate.advance()) {
        KNNList knn = storage.get(candidate);
        boolean hit = false;
        for (DBIDIter neighbor = knn.iter(); !hit && neighbor.valid(); neighbor.advance()) {
            hit = deleted.contains(neighbor);
        }
        if (hit) {
            affected.add(candidate);
        }
    }
    // Recompute the kNN of all affected objects in one bulk query, then
    // write the fresh lists back into the store.
    List<? extends KNNList> fresh = knnQuery.getKNNForBulkDBIDs(affected, k);
    DBIDIter writeback = affected.iter();
    for (int pos = 0; pos < affected.size(); pos++, writeback.advance()) {
        storage.put(writeback, fresh.get(pos));
    }
    return affected;
}
Also used : ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 3 with SetDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.

In the class P3C, the method run:

/**
 * Performs the P3C algorithm on the given Database.
 *
 * @param database Database the relation belongs to
 * @param relation Relation of vectors to cluster
 * @return subspace clustering, with a noise cluster if any objects remain
 *         unassigned
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    // Overall progress.
    // FIX: there are 10 reported steps below, but the progress was created
    // with 8 total steps and step number 5 was reported twice.
    StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;
    if (stepProgress != null) {
        stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
    }
    // Desired number of bins, as per Sturge:
    final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
    // Perform 1-dimensional projections, and split into bins.
    SetDBIDs[][] partitions = partitionData(relation, binCount);
    if (stepProgress != null) {
        stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
    }
    // Set markers for each attribute until they're all deemed uniform.
    final long[][] markers = new long[dim][];
    for (int d = 0; d < dim; d++) {
        final SetDBIDs[] parts = partitions[d];
        if (parts == null) {
            // Never mark any on constant dimensions.
            continue;
        }
        final long[] marked = markers[d] = BitsUtil.zero(binCount);
        int card = 0;
        while (card < dim - 1) {
            // Find bin with largest support, test only the dimensions that were not
            // previously marked.
            int bestBin = chiSquaredUniformTest(parts, marked, card);
            if (bestBin < 0) {
                // Uniform
                break;
            }
            BitsUtil.setI(marked, bestBin);
            card++;
        }
        if (LOG.isDebugging()) {
            LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
    }
    ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);
    if (stepProgress != null) {
        stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
    }
    ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);
    if (stepProgress != null) {
        stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
    }
    clusterCores = pruneRedundantClusterCores(clusterCores);
    if (LOG.isVerbose()) {
        LOG.verbose("Number of cluster cores found: " + clusterCores.size());
    }
    if (clusterCores.isEmpty()) {
        // No cores at all: return everything as a single noise cluster.
        LOG.setCompleted(stepProgress);
        Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
        c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
        return c;
    }
    if (stepProgress != null) {
        // FIX: was reported as step 5 a second time.
        stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
    }
    // Track objects not assigned to any cluster:
    ModifiableDBIDs noise = DBIDUtil.newHashSet();
    WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
    int k = clusterCores.size();
    List<MultivariateGaussianModel> models = new ArrayList<>(k);
    computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);
    // Initial estimate of covariances, to assign noise objects
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    assignUnassigned(relation, probClusterIGivenX, models, noise);
    double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    // EM iterations; maxEmIterations < 0 means "iterate until convergence".
    for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
        final double emOld = emNew;
        EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
        // reassign probabilities
        emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
        if (LOG.isVerbose()) {
            LOG.verbose("iteration " + it + " - expectation value: " + emNew);
        }
        if ((emNew - emOld) <= emDelta) {
            break;
        }
    }
    if (stepProgress != null) {
        stepProgress.beginStep(7, "Generating hard clustering.", LOG);
    }
    // Create a hard clustering, making sure each data point only is part of one
    // cluster, based on the best match from the membership matrix.
    ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());
    if (stepProgress != null) {
        stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
    }
    // Outlier detection. Remove points from clusters that have a Mahalanobis
    // distance larger than the critical value of the ChiSquare distribution.
    findOutliers(relation, models, clusterCandidates, noise);
    if (stepProgress != null) {
        stepProgress.beginStep(9, "Removing empty clusters.", LOG);
    }
    // Remove near-empty clusters.
    for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
        ClusterCandidate cand = it.next();
        final int size = cand.ids.size();
        if (size < minClusterSize) {
            if (size > 0) {
                noise.addDBIDs(cand.ids);
            }
            it.remove();
        }
    }
    if (LOG.isVerbose()) {
        LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
    }
    if (stepProgress != null) {
        stepProgress.beginStep(10, "Generating final result.", LOG);
    }
    // Generate final output.
    Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
    for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
        ClusterCandidate candidate = clusterCandidates.get(cluster);
        CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
        result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
    }
    // Consistency: guard verbose logging like everywhere else in this method.
    if (LOG.isVerbose()) {
        LOG.verbose("Noise size: " + noise.size());
    }
    if (noise.size() > 0) {
        result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
    }
    LOG.ensureCompleted(stepProgress);
    return result;
}
Also used : ArrayList(java.util.ArrayList) MultivariateGaussianModel(de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) StepProgress(de.lmu.ifi.dbs.elki.logging.progress.StepProgress) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)

Example 4 with SetDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.

In the class P3C, the method partitionData:

/**
 * Partition the data set into {@code bins} bins in each dimension
 * <i>independently</i>.
 *
 * This can be used to construct a grid approximation of the data using O(d n)
 * memory.
 *
 * When a dimension is found to be constant, it will not be partitioned, but
 * instead the corresponding array will be set to {@code null}.
 *
 * @param relation Data relation to partition
 * @param bins Number of bins
 * @return Partitions of each dimension.
 */
private SetDBIDs[][] partitionData(final Relation<V> relation, final int bins) {
    final int dim = RelationUtil.dimensionality(relation);
    SetDBIDs[][] partitions = new SetDBIDs[dim][bins];
    ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
    // will be reused.
    DBIDArrayIter iter = ids.iter();
    SortDBIDsBySingleDimension sorter = new VectorUtil.SortDBIDsBySingleDimension(relation, 0);
    for (int d = 0; d < dim; d++) {
        sorter.setDimension(d);
        ids.sort(sorter);
        // Minimum:
        iter.seek(0);
        double min = relation.get(iter).doubleValue(d);
        // Extend:
        iter.seek(ids.size() - 1);
        double delta = (relation.get(iter).doubleValue(d) - min) / bins;
        if (delta > 0.) {
            SetDBIDs[] dimparts = partitions[d];
            double split = min + delta;
            HashSetModifiableDBIDs pids = DBIDUtil.newHashSet();
            dimparts[0] = pids;
            int i = 0;
            for (iter.seek(0); iter.valid(); iter.advance()) {
                final double v = relation.get(iter).doubleValue(d);
                // Advance to the bin containing v; may skip several empty bins
                // when the data has gaps. The last bin absorbs everything left.
                while (v > split && i < dimparts.length - 1) {
                    i++;
                    split += delta;
                    pids = DBIDUtil.newHashSet();
                    dimparts[i] = pids;
                }
                // BUGFIX: the element that crossed a bin boundary was previously
                // dropped (the else-branch opened a new bin without adding it).
                pids.add(iter);
            }
            // Fill any remaining (empty) trailing bins with the last set.
            for (++i; i < dimparts.length; ++i) {
                dimparts[i] = pids;
            }
        } else {
            // Flag whole dimension as bad
            partitions[d] = null;
        }
    }
    return partitions;
}
Also used : HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) SortDBIDsBySingleDimension(de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs)

Example 5 with SetDBIDs

use of de.lmu.ifi.dbs.elki.database.ids.SetDBIDs in project elki by elki-project.

In the class OutlierROCCurve, the method processNewResult:

/**
 * Compute ROC curves for all outlier results (and any remaining orderings)
 * found below the given result, and attach them to the result hierarchy.
 */
@Override
public void processNewResult(ResultHierarchy hier, Result result) {
    Database db = ResultUtil.findDatabase(hier);
    // Prepare: resolve the positive class once, up front.
    SetDBIDs positiveids = DBIDUtil.ensureSet(DatabaseUtil.getObjectsByLabelMatch(db, positiveClassName));
    if (positiveids.size() == 0) {
        LOG.warning("Computing a ROC curve failed - no objects matched.");
        return;
    }
    List<OutlierResult> oresults = OutlierResult.getOutlierResults(result);
    List<OrderingResult> orderings = ResultUtil.getOrderingResults(result);
    boolean nonefound = true;
    // Outlier results are the main use case.
    for (OutlierResult o : oresults) {
        attachROCResult(db, o, computeROCResult(o.getScores().size(), positiveids, o));
        // Process each ordering only once.
        orderings.remove(o.getOrdering());
        nonefound = false;
    }
    // otherwise apply an ordering to the database IDs.
    for (OrderingResult or : orderings) {
        DBIDs sorted = or.order(or.getDBIDs());
        attachROCResult(db, or, computeROCResult(or.getDBIDs().size(), positiveids, sorted));
        nonefound = false;
    }
    if (nonefound) {
        return;
    // logger.warning("No results found to process with ROC curve analyzer.
    // Got "+iterables.size()+" iterables, "+orderings.size()+" orderings.");
    }
}

/**
 * Attach a computed ROC result to the hierarchy and record its AUC in the
 * owner's evaluation measurement group (only once per owner).
 *
 * @param db Database whose hierarchy is updated
 * @param owner Result the ROC curve was computed for
 * @param rocres Computed ROC curve result
 */
private void attachROCResult(Database db, Result owner, ROCResult rocres) {
    db.getHierarchy().add(owner, rocres);
    EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), owner, "Evaluation of ranking", "ranking-evaluation");
    MeasurementGroup g = ev.findOrCreateGroup("Evaluation measures");
    if (!g.hasMeasure(ROCAUC_LABEL)) {
        g.addMeasure(ROCAUC_LABEL, rocres.auc, 0., 1., false);
    }
}
Also used : DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs) Database(de.lmu.ifi.dbs.elki.database.Database) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) SetDBIDs(de.lmu.ifi.dbs.elki.database.ids.SetDBIDs)

Aggregations

SetDBIDs (de.lmu.ifi.dbs.elki.database.ids.SetDBIDs)11 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)6 Database (de.lmu.ifi.dbs.elki.database.Database)5 ArrayModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs)5 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)5 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)4 ArrayList (java.util.ArrayList)3 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)2 KNNList (de.lmu.ifi.dbs.elki.database.ids.KNNList)2 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)2 StepProgress (de.lmu.ifi.dbs.elki.logging.progress.StepProgress)2 OrderingResult (de.lmu.ifi.dbs.elki.result.OrderingResult)2 MultivariateGaussianModel (de.lmu.ifi.dbs.elki.algorithm.clustering.em.MultivariateGaussianModel)1 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)1 Subspace (de.lmu.ifi.dbs.elki.data.Subspace)1 SortDBIDsBySingleDimension (de.lmu.ifi.dbs.elki.data.VectorUtil.SortDBIDsBySingleDimension)1 SubspaceModel (de.lmu.ifi.dbs.elki.data.model.SubspaceModel)1 ArrayDBIDs (de.lmu.ifi.dbs.elki.database.ids.ArrayDBIDs)1 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)1 DoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList)1