Examples with NumberVector - de.lmu.ifi.dbs.elki.data.NumberVector

Example 41 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class DiSH method findParent.

/**
 * Looks up the parent cluster of the given child cluster.
 * <p>
 * A preference vector from the map is considered only if it has strictly
 * fewer set bits than the child's vector, strictly more set bits than the
 * best match found so far, and is left unchanged by
 * {@code BitsUtil.andCMin} with the child's vector. Among the clusters stored
 * for such a vector, the first one whose projected centroid lies within a
 * weighted distance of {@code 2 * epsilon} of the child's centroid becomes the
 * new best match.
 *
 * @param relation the relation storing the objects
 * @param child the child to search the parent for
 * @param clustersMap the map containing the clusters
 * @return the parent of the specified cluster, or {@code null} if no cluster
 *         qualifies
 */
private Pair<long[], ArrayModifiableDBIDs> findParent(Relation<V> relation, Pair<long[], ArrayModifiableDBIDs> child, Object2ObjectMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
    final long[] childPV = child.first;
    final Centroid child_centroid = ProjectedCentroid.make(childPV, relation, child.second);
    final int childCardinality = BitsUtil.cardinality(childPV);

    Pair<long[], ArrayModifiableDBIDs> best = null;
    // -1 marks "no parent found yet".
    int bestCardinality = -1;

    for (long[] candidatePV : clustersMap.keySet()) {
        final int candidateCardinality = BitsUtil.cardinality(candidatePV);
        // A parent must have fewer preferred dimensions than the child ...
        final boolean notSmallerThanChild = candidateCardinality >= childCardinality;
        // ... and only candidates with more set bits than the current best are of interest.
        final boolean notBetterThanBest = bestCardinality != -1 && candidateCardinality <= bestCardinality;
        if (notSmallerThanChild || notBetterThanBest) {
            continue;
        }
        // The candidate vector must survive the combination with the child's vector unchanged.
        if (!BitsUtil.equal(BitsUtil.andCMin(childPV, candidatePV), candidatePV)) {
            continue;
        }
        for (ArrayModifiableDBIDs members : clustersMap.get(candidatePV)) {
            final NumberVector parent_centroid = ProjectedCentroid.make(candidatePV, relation, members);
            final double dist = weightedDistance(child_centroid, parent_centroid, candidatePV);
            if (dist <= 2 * epsilon) {
                best = new Pair<>(candidatePV, members);
                bestCardinality = candidateCardinality;
                // First matching cluster for this preference vector wins.
                break;
            }
        }
    }
    return best;
}

Also used : Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) ProjectedCentroid(de.lmu.ifi.dbs.elki.math.linearalgebra.ProjectedCentroid) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector)

Example 42 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class DiSH method extractClusters.

/**
 * Extracts the clusters from the cluster order.
 * <p>
 * In a first pass every object of the cluster order is placed into a cluster
 * stored under its common preference vector: an existing cluster is reused
 * when the subspace dimensionality between object and cluster centroid equals
 * the object's correlation value and their weighted distance is at most
 * {@code 2 * epsilon}; otherwise a new cluster is opened. In a second pass,
 * the predecessor of each cluster's first object may be moved into that
 * cluster.
 *
 * @param relation the database storing the objects
 * @param clusterOrder the cluster order to extract the clusters from
 * @return the extracted clusters, keyed by preference vector
 */
private Object2ObjectOpenCustomHashMap<long[], List<ArrayModifiableDBIDs>> extractClusters(Relation<V> relation, DiSHClusterOrder clusterOrder) {
    FiniteProgress progress = null;
    if (LOG.isVerbose()) {
        progress = new FiniteProgress("Extract Clusters", relation.size(), LOG);
    }
    Object2ObjectOpenCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = new Object2ObjectOpenCustomHashMap<>(BitsUtil.FASTUTIL_HASH_STRATEGY);
    // Note clusterOrder currently contains DBID objects anyway.
    WritableDataStore<Pair<long[], ArrayModifiableDBIDs>> entryToClusterMap = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Pair.class);

    for (DBIDIter it = clusterOrder.iter(); it.valid(); it.advance()) {
        final V object = relation.get(it);
        final long[] pv = clusterOrder.getCommonPreferenceVector(it);

        // Fetch (or create) the list of parallel clusters sharing this preference vector.
        List<ArrayModifiableDBIDs> siblings = clustersMap.get(pv);
        if (siblings == null) {
            siblings = new ArrayList<>();
            clustersMap.put(pv, siblings);
        }

        // Search the parallel clusters for one that accepts the object.
        ArrayModifiableDBIDs target = null;
        for (ArrayModifiableDBIDs candidate : siblings) {
            final NumberVector centroid = ProjectedCentroid.make(pv, relation, candidate);
            final long[] commonPV = BitsUtil.andCMin(pv, pv);
            final int subspaceDim = subspaceDimensionality(object, centroid, pv, pv, commonPV);
            // The distance is only evaluated when the dimensionality matches.
            if (subspaceDim == clusterOrder.getCorrelationValue(it) && weightedDistance(object, centroid, commonPV) <= 2 * epsilon) {
                target = candidate;
                break;
            }
        }
        // No suitable cluster: open a new one.
        if (target == null) {
            target = DBIDUtil.newArray();
            siblings.add(target);
        }
        target.add(it);
        entryToClusterMap.put(it, new Pair<>(pv, target));
        LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);

    if (LOG.isDebuggingFiner()) {
        final int dim = RelationUtil.dimensionality(relation);
        final StringBuilder buf = new StringBuilder("Step 0");
        for (Map.Entry<long[], List<ArrayModifiableDBIDs>> entry : clustersMap.entrySet()) {
            final long[] key = entry.getKey();
            for (ArrayModifiableDBIDs ids : entry.getValue()) {
                buf.append('\n');
                buf.append(BitsUtil.toStringLow(key, dim));
                buf.append(" ids ");
                buf.append(ids.size());
            }
        }
        LOG.debugFiner(buf.toString());
    }

    // add the predecessor to the cluster
    DBIDVar cur = DBIDUtil.newVar(), pre = DBIDUtil.newVar();
    for (long[] pv : clustersMap.keySet()) {
        for (ArrayModifiableDBIDs cluster : clustersMap.get(pv)) {
            if (cluster.isEmpty()) {
                continue;
            }
            // Predecessor of the cluster's first member.
            cluster.assignVar(0, cur);
            clusterOrder.getPredecessor(cur, pre);
            // Skip when there is no (distinct) predecessor, or it is a parallel
            // cluster, i.e. shares the same preference vector.
            if (!pre.isSet() || DBIDUtil.equal(pre, cur) || BitsUtil.equal(clusterOrder.getCommonPreferenceVector(pre), clusterOrder.getCommonPreferenceVector(cur))) {
                continue;
            }
            // Skip when the predecessor has a smaller correlation value or a smaller reachability.
            final boolean smallerCorrelation = clusterOrder.getCorrelationValue(pre) < clusterOrder.getCorrelationValue(cur);
            if (smallerCorrelation || clusterOrder.getReachability(pre) < clusterOrder.getReachability(cur)) {
                continue;
            }
            // Move the predecessor from its previous cluster into this one.
            final Pair<long[], ArrayModifiableDBIDs> previous = entryToClusterMap.get(pre);
            previous.second.remove(pre);
            cluster.add(pre);
            entryToClusterMap.put(pre, new Pair<>(pv, cluster));
        }
    }
    return clustersMap;
}

Also used : Object2ObjectOpenCustomHashMap(it.unimi.dsi.fastutil.objects.Object2ObjectOpenCustomHashMap) DBIDVar(de.lmu.ifi.dbs.elki.database.ids.DBIDVar) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) ArrayModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ArrayModifiableDBIDs) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) List(java.util.List) ArrayList(java.util.ArrayList) Object2ObjectMap(it.unimi.dsi.fastutil.objects.Object2ObjectMap) Map(java.util.Map) Object2ObjectOpenCustomHashMap(it.unimi.dsi.fastutil.objects.Object2ObjectOpenCustomHashMap) Pair(de.lmu.ifi.dbs.elki.utilities.pairs.Pair)

Example 43 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class DiSH method buildHierarchy.

/**
 * Builds the cluster hierarchy.
 * <p>
 * Every pair of clusters (i, j) with i before j in the list is inspected. A
 * cluster j becomes a parent of cluster i only if i has the smaller subspace
 * dimension value ({@code dimensionality} minus the model's subspace
 * dimensionality). A cluster with an empty subspace (noise level) is attached
 * as parent only when i has no parent yet; otherwise the centroids must agree
 * in subspace dimensionality and lie within {@code 2 * epsilon} of each other.
 *
 * @param database the database containing the data objects
 * @param clustering Clustering we process
 * @param clusters the sorted list of clusters
 * @param dimensionality the dimensionality of the data
 * @throws RuntimeException if two centroids match in subspace dimensionality
 *         but their weighted distance is not within {@code 2 * epsilon}
 */
private void buildHierarchy(Relation<V> database, Clustering<SubspaceModel> clustering, List<Cluster<SubspaceModel>> clusters, int dimensionality) {
    // Debug output is only assembled when debugging is enabled.
    final StringBuilder msg = LOG.isDebugging() ? new StringBuilder() : null;
    final int db_dim = RelationUtil.dimensionality(database);
    final Hierarchy<Cluster<SubspaceModel>> hier = clustering.getClusterHierarchy();

    for (int i = 0; i < clusters.size() - 1; i++) {
        final Cluster<SubspaceModel> child = clusters.get(i);
        final Subspace childSubspace = child.getModel().getSubspace();
        final int childDim = dimensionality - childSubspace.dimensionality();
        final NumberVector childCentroid = ProjectedCentroid.make(childSubspace.getDimensions(), database, child.getIDs());
        final long[] childPV = childSubspace.getDimensions();

        for (int j = i + 1; j < clusters.size(); j++) {
            final Cluster<SubspaceModel> candidate = clusters.get(j);
            final Subspace candSubspace = candidate.getModel().getSubspace();
            final int candDim = dimensionality - candSubspace.dimensionality();

            // Only candidates with a strictly larger value can be parents.
            if (childDim >= candDim) {
                continue;
            }
            if (msg != null) {
                msg.append("\n l_i=").append(childDim).append(" pv_i=[").append(BitsUtil.toStringLow(childSubspace.getDimensions(), db_dim)).append(']');
                msg.append("\n l_j=").append(candDim).append(" pv_j=[").append(BitsUtil.toStringLow(candSubspace.getDimensions(), db_dim)).append(']');
            }

            // noise level reached
            if (candSubspace.dimensionality() == 0) {
                // no parents exists -> parent is noise
                if (hier.numParents(child) == 0) {
                    clustering.addChildCluster(candidate, child);
                    if (msg != null) {
                        msg.append("\n [").append(BitsUtil.toStringLow(candSubspace.getDimensions(), db_dim));
                        msg.append("] is parent of [").append(BitsUtil.toStringLow(childSubspace.getDimensions(), db_dim));
                        msg.append(']');
                    }
                }
                continue;
            }

            final NumberVector candCentroid = ProjectedCentroid.make(candidate.getModel().getDimensions(), database, candidate.getIDs());
            final long[] candPV = candSubspace.getDimensions();
            final long[] commonPreferenceVector = BitsUtil.andCMin(childPV, candPV);
            final int subspaceDim = subspaceDimensionality(childCentroid, candCentroid, childPV, candPV, commonPreferenceVector);
            final double d = weightedDistance(childCentroid, candCentroid, commonPreferenceVector);
            if (msg != null) {
                msg.append("\n dist = ").append(subspaceDim);
            }
            if (subspaceDim != candDim) {
                continue;
            }
            if (msg != null) {
                msg.append("\n d = ").append(d);
            }
            // Negated "<=" on purpose: a NaN distance must also end up here.
            if (!(d <= 2 * epsilon)) {
                throw new RuntimeException("Should never happen: d = " + d);
            }
            // Attach unless isParent(...) reports true for the parents the child already has.
            if (hier.numParents(child) == 0 || !isParent(database, candidate, hier.iterParents(child), db_dim)) {
                clustering.addChildCluster(candidate, child);
                if (msg != null) {
                    msg.append("\n [").append(BitsUtil.toStringLow(candSubspace.getDimensions(), db_dim));
                    msg.append("] is parent of [");
                    msg.append(BitsUtil.toStringLow(childSubspace.getDimensions(), db_dim));
                    msg.append(']');
                }
            }
        }
    }
    if (msg != null) {
        LOG.debug(msg.toString());
    }
}

Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) SubspaceModel(de.lmu.ifi.dbs.elki.data.model.SubspaceModel) Subspace(de.lmu.ifi.dbs.elki.data.Subspace) Cluster(de.lmu.ifi.dbs.elki.data.Cluster)

Example 44 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class KMLOutputHandler method writeClusteringResult.

/**
 * Writes a clustering as a KML document.
 * <p>
 * The document consists of a name and description, one {@code Style} element
 * per cluster (ids {@code s0}, {@code s1}, ... in the order of
 * {@code getAllClusters()}), and one {@code Placemark} with the hull polygon
 * per cluster. If there is exactly one top-level cluster, no placemark is
 * written for it.
 *
 * @param xmlw the XML writer to emit to
 * @param clustering the clustering to export
 * @param database the database providing the 2D coordinate relation
 * @throws XMLStreamException if the underlying writer fails
 */
private void writeClusteringResult(XMLStreamWriter xmlw, Clustering<Model> clustering, Database database) throws XMLStreamException {
    xmlw.writeStartDocument();
    xmlw.writeCharacters("\n");
    xmlw.writeStartElement("kml");
    xmlw.writeDefaultNamespace("http://earth.google.com/kml/2.2");
    xmlw.writeStartElement("Document");

    // Document header.
    // TODO: can we automatically generate more helpful data here?
    xmlw.writeStartElement("name");
    xmlw.writeCharacters("ELKI KML output for " + clustering.getLongName());
    xmlw.writeEndElement(); // name
    writeNewlineOnDebug(xmlw);
    // TODO: e.g. list the settings in the description?
    xmlw.writeStartElement("description");
    xmlw.writeCharacters("ELKI KML output for " + clustering.getLongName());
    xmlw.writeEndElement(); // description
    writeNewlineOnDebug(xmlw);

    // Compute the hulls of all clusters, starting from the top-level ones.
    final List<Cluster<Model>> allClusters = clustering.getAllClusters();
    final Relation<NumberVector> coords = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD_2D);
    final List<Cluster<Model>> toplevel = clustering.getToplevelClusters();
    final Hierarchy<Cluster<Model>> hier = clustering.getClusterHierarchy();
    final Map<Object, DoubleObjPair<Polygon>> hullmap = new HashMap<>();
    for (Cluster<Model> top : toplevel) {
        buildHullsRecursively(top, hier, hullmap, coords);
    }

    // One style per cluster.
    // TODO: generate styles from color scheme
    final double projarea = 360. * 180. * .01;
    int styleNum = 0;
    for (Cluster<Model> clus : allClusters) {
        // This is a prime based magic number, to produce a colorful output
        final Color col = Color.getHSBColor(styleNum / 4.294967291f, 1.f, .5f);
        final DoubleObjPair<Polygon> hull = hullmap.get(clus);
        // Approximate area (using bounding box)
        final double hullarea = SpatialUtil.volume(hull.second);
        final double relativeArea = Math.max(1. - (hullarea / projarea), 0.);
        // Larger hulls get a more transparent fill.
        final double opacity = .65 * FastMath.sqrt(relativeArea) + .1;
        final int alpha = (int) (255 * Math.min(.75, opacity));

        xmlw.writeStartElement("Style");
        xmlw.writeAttribute("id", "s" + styleNum);
        writeNewlineOnDebug(xmlw);

        xmlw.writeStartElement("LineStyle");
        xmlw.writeStartElement("width");
        xmlw.writeCharacters("0");
        xmlw.writeEndElement(); // width
        xmlw.writeEndElement(); // LineStyle
        writeNewlineOnDebug(xmlw);

        xmlw.writeStartElement("PolyStyle");
        xmlw.writeStartElement("color");
        // KML uses AABBGGRR format!
        xmlw.writeCharacters(String.format("%02x%02x%02x%02x", alpha, col.getBlue(), col.getGreen(), col.getRed()));
        xmlw.writeEndElement(); // color
        // "fill" is left at its default of 1.
        xmlw.writeStartElement("outline");
        xmlw.writeCharacters("0");
        xmlw.writeEndElement(); // outline
        xmlw.writeEndElement(); // PolyStyle
        writeNewlineOnDebug(xmlw);

        xmlw.writeEndElement(); // Style
        writeNewlineOnDebug(xmlw);
        styleNum++;
    }

    // One placemark per cluster.
    final Cluster<?> ignore = toplevel.size() == 1 ? toplevel.get(0) : null;
    int cnum = -1;
    for (Cluster<?> c : allClusters) {
        // Advance first, so that a skipped cluster still consumes its style number.
        cnum++;
        // Ignore sole toplevel cluster (usually: noise)
        if (c == ignore) {
            continue;
        }
        final Polygon p = hullmap.get(c).second;
        xmlw.writeStartElement("Placemark");

        xmlw.writeStartElement("name");
        xmlw.writeCharacters(c.getNameAutomatic());
        xmlw.writeEndElement(); // name
        xmlw.writeStartElement("description");
        xmlw.writeCData(makeDescription(c).toString());
        xmlw.writeEndElement(); // description
        xmlw.writeStartElement("styleUrl");
        xmlw.writeCharacters("#s" + cnum);
        xmlw.writeEndElement(); // styleUrl

        xmlw.writeStartElement("Polygon");
        writeNewlineOnDebug(xmlw);
        if (compat) {
            xmlw.writeStartElement("altitudeMode");
            xmlw.writeCharacters("relativeToGround");
            xmlw.writeEndElement(); // altitudeMode
            writeNewlineOnDebug(xmlw);
        }

        xmlw.writeStartElement("outerBoundaryIs");
        xmlw.writeStartElement("LinearRing");
        xmlw.writeStartElement("coordinates");
        // Reverse anti-clockwise polygons.
        final boolean reverse = (p.testClockwise() >= 0);
        final ArrayListIter<double[]> itp = p.iter();
        if (reverse) {
            // Start at the last vertex and walk backwards.
            itp.seek(p.size() - 1);
        }
        while (itp.valid()) {
            final double[] v = itp.get();
            xmlw.writeCharacters(FormatUtil.format(v, ","));
            // In compatibility mode, 2D points get a third coordinate appended.
            if (compat && (v.length == 2)) {
                xmlw.writeCharacters(",100");
            }
            xmlw.writeCharacters(" ");
            if (reverse) {
                itp.retract();
            } else {
                itp.advance();
            }
        }
        xmlw.writeEndElement(); // coordinates
        xmlw.writeEndElement(); // LinearRing
        xmlw.writeEndElement(); // outerBoundaryIs
        writeNewlineOnDebug(xmlw);
        xmlw.writeEndElement(); // Polygon

        xmlw.writeEndElement(); // Placemark
        writeNewlineOnDebug(xmlw);
    }

    xmlw.writeEndElement(); // Document
    xmlw.writeEndElement(); // kml
    xmlw.writeEndDocument();
}

Also used : HashMap(java.util.HashMap) Color(java.awt.Color) Cluster(de.lmu.ifi.dbs.elki.data.Cluster) DoubleObjPair(de.lmu.ifi.dbs.elki.utilities.pairs.DoubleObjPair) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) Model(de.lmu.ifi.dbs.elki.data.model.Model) Iterator(java.util.Iterator) PolygonsObject(de.lmu.ifi.dbs.elki.data.spatial.PolygonsObject) Polygon(de.lmu.ifi.dbs.elki.data.spatial.Polygon)

Example 45 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class GreedyEnsembleExperiment method run.

/**
 * Runs the greedy ensemble experiment and reports the results via
 * {@code LOG.verbose}.
 * <p>
 * The input relation holds one score vector per method; the first object must
 * carry the label {@code bylabel} and serves as reference vector. The
 * experiment then
 * <ol>
 * <li>merges the top-ranked entries of all candidate vectors until enough
 * positions have received {@code minvote} votes, and derives estimated weights
 * and an estimated truth vector from these counts,</li>
 * <li>builds the naive ensemble over all candidates,</li>
 * <li>evaluates every single candidate (ROC AUC, estimated distance, cost),</li>
 * <li>greedily grows an ensemble starting from the candidate closest to the
 * estimated truth, always trying the most diverse remaining candidate first,
 * and</li>
 * <li>compares the greedy ensemble to the naive ensemble and to 1000 random
 * ensembles of the same size.</li>
 * </ol>
 *
 * @throws AbortException if the first object is not labeled {@code bylabel},
 *         or if there is no candidate vector besides the reference
 */
@Override
public void run() {
    // Note: the database contains the *result vectors*, not the original data.
    final Database database = inputstep.getDatabase();
    Relation<NumberVector> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
    final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database);
    final DBID firstid = DBIDUtil.deref(labels.iterDBIDs());
    final String firstlabel = labels.get(firstid);
    // Plain equality: the constant contains no regex metacharacters, and this
    // form also rejects a missing (null) label instead of failing with an NPE.
    if (!"bylabel".equals(firstlabel)) {
        throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
    }
    relation = applyPrescaling(prescaling, relation, firstid);
    final int numcand = relation.size() - 1;
    // Without candidates the merge loop below could never terminate and no
    // initial ensemble member could be chosen.
    if (numcand < 1) {
        throw new AbortException("No ensemble candidates found besides the 'by label' reference vector.");
    }
    // Dimensionality and reference vector
    final int dim = RelationUtil.dimensionality(relation);
    final NumberVector refvec = relation.get(firstid);
    // Build the positive index set for ROC AUC.
    VectorNonZero positive = new VectorNonZero(refvec);
    final int desired_outliers = (int) (rate * dim);
    int union_outliers = 0;
    final int[] outliers_seen = new int[dim];
    // Merge the top-k for each ensemble member, until we have enough
    // candidates.
    {
        int k = 0;
        ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
        if (minvote >= numcand) {
            minvote = Math.max(1, numcand - 1);
        }
        for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            // Skip "by label", obviously
            if (DBIDUtil.equal(firstid, iditer)) {
                continue;
            }
            iters.add(new DecreasingVectorIter(relation.get(iditer)));
        }
        loop: while (union_outliers < desired_outliers) {
            for (DecreasingVectorIter iter : iters) {
                if (!iter.valid()) {
                    LOG.warning("Union_outliers=" + union_outliers + " < desired_outliers=" + desired_outliers + " minvote=" + minvote);
                    break loop;
                }
                int cur = iter.dim();
                outliers_seen[cur] += 1;
                if (outliers_seen[cur] == minvote) {
                    union_outliers += 1;
                }
                iter.advance();
            }
            k++;
        }
        LOG.verbose("Merged top " + k + " outliers to: " + union_outliers + " outliers (desired: at least " + desired_outliers + ")");
    }
    // Build the final weight vector.
    final double[] estimated_weights = new double[dim];
    final double[] estimated_truth = new double[dim];
    updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
    DoubleVector estimated_truth_vec = DoubleVector.wrap(estimated_truth);
    PrimitiveDistanceFunction<NumberVector> wdist = getDistanceFunction(estimated_weights);
    PrimitiveDistanceFunction<NumberVector> tdist = wdist;
    // Build the naive ensemble:
    final double[] naiveensemble = new double[dim];
    {
        double[] buf = new double[numcand];
        for (int d = 0; d < dim; d++) {
            int i = 0;
            for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
                if (DBIDUtil.equal(firstid, iditer)) {
                    continue;
                }
                final NumberVector vec = relation.get(iditer);
                buf[i] = vec.doubleValue(d);
                i++;
            }
            naiveensemble[d] = voting.combine(buf, i);
            if (Double.isNaN(naiveensemble[d])) {
                LOG.warning("NaN after combining: " + FormatUtil.format(buf) + " i=" + i + " " + voting.toString());
            }
        }
    }
    DoubleVector naivevec = DoubleVector.wrap(naiveensemble);
    // Compute single AUC scores and estimations.
    // Remember the method most similar to the estimation
    double bestauc = 0.0;
    String bestaucstr = "";
    double bestcost = Double.POSITIVE_INFINITY;
    String bestcoststr = "";
    DBID bestid = null;
    double bestest = Double.POSITIVE_INFINITY;
    {
        final double[] greedyensemble = new double[dim];
        // Compute individual scores
        for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            if (DBIDUtil.equal(firstid, iditer)) {
                continue;
            }
            final NumberVector vec = relation.get(iditer);
            singleEnsemble(greedyensemble, vec);
            double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(DoubleVector.wrap(greedyensemble)));
            double estimated = wdist.distance(DoubleVector.wrap(greedyensemble), estimated_truth_vec);
            double cost = tdist.distance(DoubleVector.wrap(greedyensemble), refvec);
            LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer));
            if (auc > bestauc) {
                bestauc = auc;
                bestaucstr = labels.get(iditer);
            }
            if (cost < bestcost) {
                bestcost = cost;
                bestcoststr = labels.get(iditer);
            }
            if (estimated < bestest || bestid == null) {
                bestest = estimated;
                bestid = DBIDUtil.deref(iditer);
            }
        }
    }
    // Initialize ensemble with "best" method
    if (prescaling != null) {
        LOG.verbose("Input prescaling: " + prescaling);
    }
    LOG.verbose("Distance function: " + wdist);
    LOG.verbose("Ensemble voting: " + voting);
    if (scaling != null) {
        LOG.verbose("Ensemble rescaling: " + scaling);
    }
    LOG.verbose("Initial estimation of outliers: " + union_outliers);
    LOG.verbose("Initializing ensemble with: " + labels.get(bestid));
    ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
    ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs());
    ModifiableDBIDs dropped = DBIDUtil.newHashSet(relation.size());
    dropped.add(firstid);
    enscands.remove(bestid);
    enscands.remove(firstid);
    final double[] greedyensemble = new double[dim];
    singleEnsemble(greedyensemble, relation.get(bestid));
    // Greedily grow the ensemble
    final double[] testensemble = new double[dim];
    while (enscands.size() > 0) {
        NumberVector greedyvec = DoubleVector.wrap(greedyensemble);
        final double oldd = wdist.distance(estimated_truth_vec, greedyvec);
        final int heapsize = enscands.size();
        ModifiableDoubleDBIDList heap = DBIDUtil.newDistanceDBIDList(heapsize);
        double[] tmp = new double[dim];
        for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) {
            final NumberVector vec = relation.get(iter);
            singleEnsemble(tmp, vec);
            // Diversity = distance of the candidate (as a single-member ensemble)
            // to the current greedy ensemble. Comparing the greedy ensemble with
            // itself would always yield the same value for every candidate.
            double diversity = wdist.distance(DoubleVector.wrap(tmp), greedyvec);
            heap.add(diversity, iter);
        }
        heap.sort();
        for (DoubleDBIDListMIter it = heap.iter(); heap.size() > 0; it.remove()) {
            // Last, i.e. the entry at the end of the sorted list.
            it.seek(heap.size() - 1);
            enscands.remove(it);
            final NumberVector vec = relation.get(it);
            // Build combined ensemble.
            {
                double[] buf = new double[ensemble.size() + 1];
                for (int i = 0; i < dim; i++) {
                    int j = 0;
                    for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
                        buf[j] = relation.get(iter).doubleValue(i);
                        j++;
                    }
                    buf[j] = vec.doubleValue(i);
                    testensemble[i] = voting.combine(buf, j + 1);
                }
            }
            applyScaling(testensemble, scaling);
            NumberVector testvec = DoubleVector.wrap(testensemble);
            double newd = wdist.distance(estimated_truth_vec, testvec);
            if (newd < oldd) {
                // Improvement: accept the candidate.
                System.arraycopy(testensemble, 0, greedyensemble, 0, dim);
                ensemble.add(it);
                // Recompute heap
                break;
            } else {
                dropped.add(it);
                if (refine_truth) {
                    // Update target vectors and weights
                    ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
                    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
                        // Skip "by label", obviously
                        if (DBIDUtil.equal(firstid, iditer) || dropped.contains(iditer)) {
                            continue;
                        }
                        iters.add(new DecreasingVectorIter(relation.get(iditer)));
                    }
                    if (minvote >= iters.size()) {
                        // Keep at least one vote, as in the initial merge above:
                        // a threshold of 0 can never be reached by a counter
                        // that is incremented before it is compared.
                        minvote = Math.max(1, iters.size() - 1);
                    }
                    union_outliers = 0;
                    Arrays.fill(outliers_seen, 0);
                    refine: while (union_outliers < desired_outliers) {
                        for (DecreasingVectorIter iter : iters) {
                            if (!iter.valid()) {
                                // Leave the outer loop as well; an exhausted
                                // iterator would otherwise make it spin forever.
                                break refine;
                            }
                            int cur = iter.dim();
                            outliers_seen[cur] += 1;
                            if (outliers_seen[cur] == minvote) {
                                union_outliers += 1;
                            }
                            iter.advance();
                        }
                    }
                    LOG.warning("New num outliers: " + union_outliers);
                    updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
                    estimated_truth_vec = DoubleVector.wrap(estimated_truth);
                }
            }
        }
    }
    // Build the improved ensemble:
    StringBuilder greedylbl = new StringBuilder();
    {
        for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
            if (greedylbl.length() > 0) {
                greedylbl.append(' ');
            }
            greedylbl.append(labels.get(iter));
        }
    }
    DoubleVector greedyvec = DoubleVector.wrap(greedyensemble);
    if (refine_truth) {
        LOG.verbose("Estimated outliers remaining: " + union_outliers);
    }
    LOG.verbose("Greedy ensemble (" + ensemble.size() + "): " + greedylbl.toString());
    LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
    LOG.verbose("Best single cost:    " + bestcost + " (" + bestcoststr + ")");
    // Evaluate the naive ensemble and the "shrunk" ensemble
    double naiveauc, naivecost;
    {
        naiveauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(naivevec));
        naivecost = tdist.distance(naivevec, refvec);
        LOG.verbose("Naive ensemble AUC:   " + naiveauc + " cost: " + naivecost);
        LOG.verbose("Naive ensemble Gain:  " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
    }
    double greedyauc, greedycost;
    {
        greedyauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(greedyvec));
        greedycost = tdist.distance(greedyvec, refvec);
        LOG.verbose("Greedy ensemble AUC:  " + greedyauc + " cost: " + greedycost);
        LOG.verbose("Greedy ensemble Gain to best:  " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
        LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
    }
    // Baseline: random ensembles of the same size as the greedy ensemble.
    {
        MeanVariance meanauc = new MeanVariance();
        MeanVariance meancost = new MeanVariance();
        HashSetModifiableDBIDs candidates = DBIDUtil.newHashSet(relation.getDBIDs());
        candidates.remove(firstid);
        for (int i = 0; i < 1000; i++) {
            // Build the random ensemble; the loop index doubles as the sampling seed.
            final double[] randomensemble = new double[dim];
            {
                DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long) i);
                double[] buf = new double[random.size()];
                for (int d = 0; d < dim; d++) {
                    int j = 0;
                    for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) {
                        assert (!DBIDUtil.equal(firstid, iter));
                        final NumberVector vec = relation.get(iter);
                        buf[j] = vec.doubleValue(d);
                        j++;
                    }
                    randomensemble[d] = voting.combine(buf, j);
                }
            }
            applyScaling(randomensemble, scaling);
            NumberVector randomvec = DoubleVector.wrap(randomensemble);
            double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(randomvec));
            meanauc.put(auc);
            double cost = tdist.distance(randomvec, refvec);
            meancost.put(cost);
        }
        LOG.verbose("Random ensemble AUC:  " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
        LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
        LOG.verbose("Greedy improvement:   " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
        // The sum uses the cost statistics on both sides (not the AUC stddev).
        LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meancost.getSampleStddev()));
        LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
        LOG.verbose("Greedy improvement:   " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
        LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
        LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
        LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
    }
}

Also used : DecreasingVectorIter(de.lmu.ifi.dbs.elki.evaluation.scores.adapter.DecreasingVectorIter) DBID(de.lmu.ifi.dbs.elki.database.ids.DBID) ArrayList(java.util.ArrayList) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) Database(de.lmu.ifi.dbs.elki.database.Database) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DoubleDBIDListMIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListMIter) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException) VectorNonZero(de.lmu.ifi.dbs.elki.evaluation.scores.adapter.VectorNonZero)

Aggregations

NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)85 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)40 ArrayList (java.util.ArrayList)16 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)9 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)8 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)8 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)8 Database (de.lmu.ifi.dbs.elki.database.Database)7 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)7 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)7 Random (java.util.Random)7 Test (org.junit.Test)7 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)5 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)5 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)5 EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult)5 MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup)5 List (java.util.List)5 SparseNumberVector (de.lmu.ifi.dbs.elki.data.SparseNumberVector)4 RandomProjectionFamily (de.lmu.ifi.dbs.elki.data.projection.random.RandomProjectionFamily)4