Search in sources :

Example 1 with DoubleDBIDListMIter

use of de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter in project elki by elki-project.

the class GreedyEnsembleExperiment method run.

/**
 * Runs the greedy ensemble experiment on the relation of result vectors.
 *
 * <p>The steps are: validate that the first object is the "bylabel" reference
 * vector, estimate a target vector by merging the top-ranked dimensions of all
 * members, score every single member, build a naive ensemble of all members,
 * greedily grow an ensemble starting from the member closest to the
 * estimation, and finally compare the greedy ensemble against the naive
 * ensemble and against 1000 random ensembles of the same size. All results are
 * reported via the logger.
 *
 * @throws AbortException if the first object is not labeled "bylabel"
 */
@Override
public void run() {
    // Note: the database contains the *result vectors*, not the original data.
    final Database database = inputstep.getDatabase();
    Relation<NumberVector> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
    final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database);
    final DBID firstid = DBIDUtil.deref(labels.iterDBIDs());
    final String firstlabel = labels.get(firstid);
    // Literal comparison (no regex needed); also rejects a missing label
    // with the intended error instead of a NullPointerException.
    if (!"bylabel".equals(firstlabel)) {
        throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
    }
    relation = applyPrescaling(prescaling, relation, firstid);
    // All vectors except the reference vector are ensemble candidates.
    final int numcand = relation.size() - 1;
    // Dimensionality and reference vector
    final int dim = RelationUtil.dimensionality(relation);
    final NumberVector refvec = relation.get(firstid);
    // Build the positive index set for ROC AUC.
    VectorNonZero positive = new VectorNonZero(refvec);
    final int desired_outliers = (int) (rate * dim);
    int union_outliers = 0;
    final int[] outliers_seen = new int[dim];
    // Merge the top-k for each ensemble member, until we have enough
    // candidates.
    {
        int k = 0;
        ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
        if (minvote >= numcand) {
            minvote = Math.max(1, numcand - 1);
        }
        for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            // Skip "by label", obviously
            if (DBIDUtil.equal(firstid, iditer)) {
                continue;
            }
            iters.add(new DecreasingVectorIter(relation.get(iditer)));
        }
        loop: while (union_outliers < desired_outliers) {
            for (DecreasingVectorIter iter : iters) {
                if (!iter.valid()) {
                    LOG.warning("Union_outliers=" + union_outliers + " < desired_outliers=" + desired_outliers + " minvote=" + minvote);
                    break loop;
                }
                int cur = iter.dim();
                outliers_seen[cur] += 1;
                // Count a dimension exactly once: when it reaches the vote threshold.
                if (outliers_seen[cur] == minvote) {
                    union_outliers += 1;
                }
                iter.advance();
            }
            k++;
        }
        LOG.verbose("Merged top " + k + " outliers to: " + union_outliers + " outliers (desired: at least " + desired_outliers + ")");
    }
    // Build the final weight vector.
    final double[] estimated_weights = new double[dim];
    final double[] estimated_truth = new double[dim];
    updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
    DoubleVector estimated_truth_vec = DoubleVector.wrap(estimated_truth);
    PrimitiveDistanceFunction<NumberVector> wdist = getDistanceFunction(estimated_weights);
    // The same distance function is used for the cost against the reference vector.
    PrimitiveDistanceFunction<NumberVector> tdist = wdist;
    // Build the naive ensemble:
    final double[] naiveensemble = new double[dim];
    {
        double[] buf = new double[numcand];
        for (int d = 0; d < dim; d++) {
            int i = 0;
            for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
                if (DBIDUtil.equal(firstid, iditer)) {
                    continue;
                }
                final NumberVector vec = relation.get(iditer);
                buf[i] = vec.doubleValue(d);
                i++;
            }
            naiveensemble[d] = voting.combine(buf, i);
            if (Double.isNaN(naiveensemble[d])) {
                LOG.warning("NaN after combining: " + FormatUtil.format(buf) + " i=" + i + " " + voting.toString());
            }
        }
    }
    DoubleVector naivevec = DoubleVector.wrap(naiveensemble);
    // Compute single AUC scores and estimations.
    // Remember the method most similar to the estimation
    double bestauc = 0.0;
    String bestaucstr = "";
    double bestcost = Double.POSITIVE_INFINITY;
    String bestcoststr = "";
    DBID bestid = null;
    double bestest = Double.POSITIVE_INFINITY;
    {
        final double[] greedyensemble = new double[dim];
        // Compute individual scores
        for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            if (DBIDUtil.equal(firstid, iditer)) {
                continue;
            }
            final NumberVector vec = relation.get(iditer);
            singleEnsemble(greedyensemble, vec);
            double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(DoubleVector.wrap(greedyensemble)));
            double estimated = wdist.distance(DoubleVector.wrap(greedyensemble), estimated_truth_vec);
            double cost = tdist.distance(DoubleVector.wrap(greedyensemble), refvec);
            LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer));
            if (auc > bestauc) {
                bestauc = auc;
                bestaucstr = labels.get(iditer);
            }
            if (cost < bestcost) {
                bestcost = cost;
                bestcoststr = labels.get(iditer);
            }
            // The "bestid == null" clause guarantees a starting member even if
            // no estimate compares below the initial infinity (e.g. NaN).
            if (estimated < bestest || bestid == null) {
                bestest = estimated;
                bestid = DBIDUtil.deref(iditer);
            }
        }
    }
    // Initialize ensemble with "best" method
    if (prescaling != null) {
        LOG.verbose("Input prescaling: " + prescaling);
    }
    LOG.verbose("Distance function: " + wdist);
    LOG.verbose("Ensemble voting: " + voting);
    if (scaling != null) {
        LOG.verbose("Ensemble rescaling: " + scaling);
    }
    LOG.verbose("Initial estimation of outliers: " + union_outliers);
    LOG.verbose("Initializing ensemble with: " + labels.get(bestid));
    ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
    ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs());
    ModifiableDBIDs dropped = DBIDUtil.newHashSet(relation.size());
    dropped.add(firstid);
    enscands.remove(bestid);
    enscands.remove(firstid);
    final double[] greedyensemble = new double[dim];
    singleEnsemble(greedyensemble, relation.get(bestid));
    // Greedily grow the ensemble
    final double[] testensemble = new double[dim];
    while (enscands.size() > 0) {
        NumberVector greedyvec = DoubleVector.wrap(greedyensemble);
        final double oldd = wdist.distance(estimated_truth_vec, greedyvec);
        final int heapsize = enscands.size();
        ModifiableDoubleDBIDList heap = DBIDUtil.newDistanceDBIDList(heapsize);
        double[] tmp = new double[dim];
        for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) {
            final NumberVector vec = relation.get(iter);
            singleEnsemble(tmp, vec);
            // Diversity is the distance of the candidate (in tmp) to the current
            // ensemble. The previous code compared the ensemble to itself, which
            // is always 0 and made the candidate ordering meaningless.
            double diversity = wdist.distance(DoubleVector.wrap(tmp), greedyvec);
            heap.add(diversity, iter);
        }
        heap.sort();
        // Process candidates from the end of the sorted list; the loop update
        // removes the candidate that was just examined.
        for (DoubleDBIDListMIter it = heap.iter(); heap.size() > 0; it.remove()) {
            // Last
            it.seek(heap.size() - 1);
            enscands.remove(it);
            final NumberVector vec = relation.get(it);
            // Build combined ensemble.
            {
                double[] buf = new double[ensemble.size() + 1];
                for (int i = 0; i < dim; i++) {
                    int j = 0;
                    for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
                        buf[j] = relation.get(iter).doubleValue(i);
                        j++;
                    }
                    buf[j] = vec.doubleValue(i);
                    testensemble[i] = voting.combine(buf, j + 1);
                }
            }
            applyScaling(testensemble, scaling);
            NumberVector testvec = DoubleVector.wrap(testensemble);
            double newd = wdist.distance(estimated_truth_vec, testvec);
            if (newd < oldd) {
                // Candidate improves the ensemble: accept it.
                System.arraycopy(testensemble, 0, greedyensemble, 0, dim);
                ensemble.add(it);
                // Recompute heap
                break;
            } else {
                dropped.add(it);
                if (refine_truth) {
                    // Update target vectors and weights
                    ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
                    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
                        // Skip "by label", obviously
                        if (DBIDUtil.equal(firstid, iditer) || dropped.contains(iditer)) {
                            continue;
                        }
                        iters.add(new DecreasingVectorIter(relation.get(iditer)));
                    }
                    // Keep the vote threshold at least 1, as in the initial merge;
                    // a threshold of 0 can never be reached after incrementing.
                    if (minvote >= iters.size()) {
                        minvote = Math.max(1, iters.size() - 1);
                    }
                    union_outliers = 0;
                    Arrays.fill(outliers_seen, 0);
                    refine: while (union_outliers < desired_outliers) {
                        for (DecreasingVectorIter iter : iters) {
                            if (!iter.valid()) {
                                // Leave the outer loop too; a plain break would
                                // spin forever once the iterators are exhausted.
                                break refine;
                            }
                            int cur = iter.dim();
                            outliers_seen[cur] += 1;
                            if (outliers_seen[cur] == minvote) {
                                union_outliers += 1;
                            }
                            iter.advance();
                        }
                    }
                    LOG.warning("New num outliers: " + union_outliers);
                    updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
                    estimated_truth_vec = DoubleVector.wrap(estimated_truth);
                }
            }
        }
    }
    // Build the improved ensemble:
    StringBuilder greedylbl = new StringBuilder();
    {
        for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
            if (greedylbl.length() > 0) {
                greedylbl.append(' ');
            }
            greedylbl.append(labels.get(iter));
        }
    }
    DoubleVector greedyvec = DoubleVector.wrap(greedyensemble);
    if (refine_truth) {
        LOG.verbose("Estimated outliers remaining: " + union_outliers);
    }
    LOG.verbose("Greedy ensemble (" + ensemble.size() + "): " + greedylbl.toString());
    LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
    LOG.verbose("Best single cost:    " + bestcost + " (" + bestcoststr + ")");
    // Evaluate the naive ensemble and the "shrunk" ensemble
    double naiveauc, naivecost;
    {
        naiveauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(naivevec));
        naivecost = tdist.distance(naivevec, refvec);
        LOG.verbose("Naive ensemble AUC:   " + naiveauc + " cost: " + naivecost);
        LOG.verbose("Naive ensemble Gain:  " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
    }
    double greedyauc, greedycost;
    {
        greedyauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(greedyvec));
        greedycost = tdist.distance(greedyvec, refvec);
        LOG.verbose("Greedy ensemble AUC:  " + greedyauc + " cost: " + greedycost);
        LOG.verbose("Greedy ensemble Gain to best:  " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
        LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
    }
    // Baseline: random ensembles of the same size as the greedy ensemble.
    {
        MeanVariance meanauc = new MeanVariance();
        MeanVariance meancost = new MeanVariance();
        HashSetModifiableDBIDs candidates = DBIDUtil.newHashSet(relation.getDBIDs());
        candidates.remove(firstid);
        for (int i = 0; i < 1000; i++) {
            // Build the random ensemble; the loop index serves as sampling seed.
            final double[] randomensemble = new double[dim];
            {
                DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long) i);
                double[] buf = new double[random.size()];
                for (int d = 0; d < dim; d++) {
                    int j = 0;
                    for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) {
                        assert (!DBIDUtil.equal(firstid, iter));
                        final NumberVector vec = relation.get(iter);
                        buf[j] = vec.doubleValue(d);
                        j++;
                    }
                    randomensemble[d] = voting.combine(buf, j);
                }
            }
            applyScaling(randomensemble, scaling);
            NumberVector randomvec = DoubleVector.wrap(randomensemble);
            double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(randomvec));
            meanauc.put(auc);
            double cost = tdist.distance(randomvec, refvec);
            meancost.put(cost);
        }
        LOG.verbose("Random ensemble AUC:  " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
        LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
        LOG.verbose("Greedy improvement:   " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
        // The sum uses the cost standard deviation (it previously added the AUC one).
        LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meancost.getSampleStddev()));
        LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
        LOG.verbose("Greedy improvement:   " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
        LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
        LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
        LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
    }
}
Also used : DecreasingVectorIter(de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DecreasingVectorIter) DBID(de.lmu.ifi.dbs.elki.database.ids.DBID) ArrayList(java.util.ArrayList) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) Database(de.lmu.ifi.dbs.elki.database.Database) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DoubleDBIDListMIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException) VectorNonZero(de.lmu.ifi.dbs.elki.evaluation.scores.adapter.VectorNonZero)

Example 2 with DoubleDBIDListMIter

use of de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter in project elki by elki-project.

the class KMeansMinusMinus method run.

/**
 * Runs the clustering on the given relation.
 *
 * <p>Objects are iteratively assigned to their nearest mean while a min-heap
 * of size {@code heapsize} is maintained by the assignment step; its top value
 * is passed as threshold when recomputing the means. If the noise flag is set
 * and the heap size is positive, all objects whose stored distance is at least
 * the threshold are moved to an additional noise cluster afterwards.
 *
 * @param database Database, passed to the initialization of the means
 * @param relation Relation to cluster
 * @return Clustering result; contains no clusters if the relation is empty
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
    if (relation.size() <= 0) {
        return new Clustering<>("k-Means Clustering", "kmeans-clustering");
    }
    // Choose initial means
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
    }
    // Initialize the means
    double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
    // Initialize the heap: a rate below 1 is taken relative to the relation
    // size, otherwise it is used as an absolute count.
    final int heapsize = (int) (rate < 1. ? Math.ceil(relation.size() * rate) : rate);
    DoubleMinHeap minHeap = new DoubleMinHeap(heapsize);
    // Setup cluster assignment store
    List<ModifiableDoubleDBIDList> clusters = new ArrayList<>();
    for (int i = 0; i < k; i++) {
        clusters.add(DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
    }
    WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
    double[] varsum = new double[k];
    IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
    DoubleStatistic varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
    // Otherwise, the vartotal break below will fail!
    assert (varstat != null);
    int iteration = 0;
    double prevvartotal = Double.POSITIVE_INFINITY;
    // A non-positive maxiter means: no iteration limit.
    for (; maxiter <= 0 || iteration < maxiter; iteration++) {
        minHeap.clear();
        for (int i = 0; i < k; i++) {
            clusters.get(i).clear();
        }
        LOG.incrementProcessed(prog);
        boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, minHeap, heapsize);
        double vartotal = logVarstat(varstat, varsum);
        // Stop when no assignment changed, or when the variance sum is larger
        // than the previous value.
        if (!changed || vartotal > prevvartotal) {
            break;
        }
        prevvartotal = vartotal;
        // Recompute means.
        means = meansWithTreshhold(clusters, means, relation, heapsize > 0 ? minHeap.peek() : Double.POSITIVE_INFINITY);
    }
    // Create the noise cluster, if requested
    ModifiableDoubleDBIDList noiseids = null;
    if (noiseFlag && heapsize > 0) {
        clusters.add(noiseids = DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
        double tresh = minHeap.peek();
        for (int i = 0; i < k; i++) {
            // NOTE(review): remove() followed by advance() relies on the
            // iterator not skipping the element after the removed one —
            // confirm against the DoubleDBIDListMIter contract.
            for (DoubleDBIDListMIter it = clusters.get(i).iter(); it.valid(); it.advance()) {
                final double dist = it.doubleValue();
                // Add to the noise cluster:
                if (dist >= tresh) {
                    noiseids.add(dist, it);
                    assignment.putInt(it, k);
                    it.remove();
                }
            }
        }
    }
    LOG.setCompleted(prog);
    if (LOG.isStatistics()) {
        LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
    }
    // Wrap result
    Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
    for (int i = 0; i < k; i++) {
        DBIDs ids = clusters.get(i);
        if (ids.size() == 0) {
            continue;
        }
        KMeansModel model = new KMeansModel(means[i], varsum[i]);
        result.addToplevelCluster(new Cluster<>(ids, model));
    }
    // Noise Cluster: noiseids is only allocated when the noise flag is set
    // AND the heap size is positive, so test the list itself rather than the
    // flag to avoid a NullPointerException when the heap size is 0.
    if (noiseids != null && noiseids.size() > 0) {
        KMeansModel model = new KMeansModel(null, 0);
        DBIDs ids = noiseids;
        result.addToplevelCluster(new Cluster<>(ids, true, model));
    }
    return result;
}
Also used : WritableIntegerDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore) KMeansModel(de.lmu.ifi.dbs.elki.data.model.KMeansModel) DoubleMinHeap(de.lmu.ifi.dbs.elki.utilities.datastructures.heap.DoubleMinHeap) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) ArrayList(java.util.ArrayList) Clustering(de.lmu.ifi.dbs.elki.data.Clustering) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) DoubleDBIDListMIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter) IndefiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Aggregations

DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)2 DoubleDBIDListMIter (de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter)2 ModifiableDoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList)2 ArrayList (java.util.ArrayList)2 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)1 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)1 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)1 KMeansModel (de.lmu.ifi.dbs.elki.data.model.KMeansModel)1 Database (de.lmu.ifi.dbs.elki.database.Database)1 WritableIntegerDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableIntegerDataStore)1 DBID (de.lmu.ifi.dbs.elki.database.ids.DBID)1 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)1 HashSetModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs)1 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)1 DecreasingVectorIter (de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DecreasingVectorIter)1 VectorNonZero (de.lmu.ifi.dbs.elki.evaluation.scores.adapter.VectorNonZero)1 IndefiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.IndefiniteProgress)1 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)1 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)1 StringStatistic (de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic)1