use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class DiSH method findParent.
/**
 * Searches the cluster map for the parent of the given cluster.
 * <p>
 * A preference vector qualifies as parent candidate when it has fewer set
 * bits than the child's preference vector and
 * {@code BitsUtil.andCMin(child, candidate)} equals the candidate. For such a
 * candidate, the first of its clusters whose projected centroid lies within
 * a weighted distance of {@code 2 * epsilon} of the child's projected
 * centroid is accepted. A later candidate only replaces an accepted one when
 * its cardinality is strictly larger.
 *
 * @param relation the relation storing the objects
 * @param child the child to search the parent for
 * @param clustersMap the map containing the clusters
 * @return the parent of the specified cluster, or {@code null} if no
 *         candidate was accepted
 */
private Pair<long[], ArrayModifiableDBIDs> findParent(Relation<V> relation, Pair<long[], ArrayModifiableDBIDs> child, Object2ObjectMap<long[], List<ArrayModifiableDBIDs>> clustersMap) {
  final long[] childPV = child.first;
  Centroid childCentroid = ProjectedCentroid.make(childPV, relation, child.second);
  final int childCard = BitsUtil.cardinality(childPV);

  Pair<long[], ArrayModifiableDBIDs> best = null;
  int bestCard = -1;
  for (long[] candidatePV : clustersMap.keySet()) {
    final int candidateCard = BitsUtil.cardinality(candidatePV);
    // A parent must have strictly fewer preferred dimensions than the child.
    if (candidateCard >= childCard) {
      continue;
    }
    // Only candidates more specific than the current best are of interest.
    if (best != null && candidateCard <= bestCard) {
      continue;
    }
    // The candidate must be unchanged by the intersection with the child.
    if (!BitsUtil.equal(BitsUtil.andCMin(childPV, candidatePV), candidatePV)) {
      continue;
    }
    for (ArrayModifiableDBIDs members : clustersMap.get(candidatePV)) {
      NumberVector candidateCentroid = ProjectedCentroid.make(candidatePV, relation, members);
      if (weightedDistance(childCentroid, candidateCentroid, candidatePV) <= 2 * epsilon) {
        best = new Pair<>(candidatePV, members);
        bestCard = candidateCard;
        // First matching parallel cluster wins for this preference vector.
        break;
      }
    }
  }
  return best;
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class DiSH method extractClusters.
/**
 * Extracts the clusters from the cluster order.
 * <p>
 * In a first pass, every object of the cluster order is assigned to one of
 * the (parallel) clusters registered for its preference vector, or to a new
 * cluster if none of them fits. In a second pass, the predecessor of the
 * first object of each cluster is moved into that cluster, unless one of the
 * skip conditions below applies.
 *
 * @param relation the database storing the objects
 * @param clusterOrder the cluster order to extract the clusters from
 * @return the extracted clusters, grouped by preference vector
 */
private Object2ObjectOpenCustomHashMap<long[], List<ArrayModifiableDBIDs>> extractClusters(Relation<V> relation, DiSHClusterOrder clusterOrder) {
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Extract Clusters", relation.size(), LOG) : null;
  Object2ObjectOpenCustomHashMap<long[], List<ArrayModifiableDBIDs>> clustersMap = new Object2ObjectOpenCustomHashMap<>(BitsUtil.FASTUTIL_HASH_STRATEGY);
  // Note clusterOrder currently contains DBID objects anyway.
  WritableDataStore<Pair<long[], ArrayModifiableDBIDs>> entryToClusterMap = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, Pair.class);

  // Pass 1: assign each object to a cluster of its preference vector.
  for (DBIDIter it = clusterOrder.iter(); it.valid(); it.advance()) {
    V vec = relation.get(it);
    long[] pv = clusterOrder.getCommonPreferenceVector(it);

    // Fetch (or register) the list of parallel clusters for this vector.
    List<ArrayModifiableDBIDs> siblings = clustersMap.get(pv);
    if (siblings == null) {
      siblings = new ArrayList<>();
      clustersMap.put(pv, siblings);
    }

    // Find the first parallel cluster the object fits into.
    ArrayModifiableDBIDs target = null;
    for (ArrayModifiableDBIDs candidate : siblings) {
      NumberVector centroid = ProjectedCentroid.make(pv, relation, candidate);
      long[] commonPV = BitsUtil.andCMin(pv, pv);
      int subspaceDim = subspaceDimensionality(vec, centroid, pv, pv, commonPV);
      if (subspaceDim != clusterOrder.getCorrelationValue(it)) {
        continue;
      }
      if (weightedDistance(vec, centroid, commonPV) <= 2 * epsilon) {
        target = candidate;
        break;
      }
    }
    // Nothing fits: open a new parallel cluster.
    if (target == null) {
      target = DBIDUtil.newArray();
      siblings.add(target);
    }
    target.add(it);
    entryToClusterMap.put(it, new Pair<>(pv, target));
    LOG.incrementProcessed(progress);
  }
  LOG.ensureCompleted(progress);

  if (LOG.isDebuggingFiner()) {
    int dim = RelationUtil.dimensionality(relation);
    StringBuilder buf = new StringBuilder("Step 0");
    for (Map.Entry<long[], List<ArrayModifiableDBIDs>> entry : clustersMap.entrySet()) {
      for (ArrayModifiableDBIDs members : entry.getValue()) {
        buf.append('\n').append(BitsUtil.toStringLow(entry.getKey(), dim)).append(" ids ").append(members.size());
      }
    }
    LOG.debugFiner(buf.toString());
  }

  // Pass 2: add the predecessor to the cluster
  DBIDVar first = DBIDUtil.newVar(), pred = DBIDUtil.newVar();
  for (Map.Entry<long[], List<ArrayModifiableDBIDs>> entry : clustersMap.entrySet()) {
    final long[] pv = entry.getKey();
    for (ArrayModifiableDBIDs cluster : entry.getValue()) {
      if (cluster.isEmpty()) {
        continue;
      }
      cluster.assignVar(0, first);
      clusterOrder.getPredecessor(first, pred);
      final boolean hasOtherPredecessor = pred.isSet() && !DBIDUtil.equal(pred, first);
      if (!hasOtherPredecessor) {
        continue;
      }
      // parallel cluster
      final boolean samePreferenceVector = BitsUtil.equal(clusterOrder.getCommonPreferenceVector(pred), clusterOrder.getCommonPreferenceVector(first));
      if (samePreferenceVector) {
        continue;
      }
      // Skip predecessors with a smaller correlation value or reachability.
      if (clusterOrder.getCorrelationValue(pred) < clusterOrder.getCorrelationValue(first) //
          || clusterOrder.getReachability(pred) < clusterOrder.getReachability(first)) {
        continue;
      }
      // Move the predecessor from its previous cluster into this one.
      Pair<long[], ArrayModifiableDBIDs> previous = entryToClusterMap.get(pred);
      previous.second.remove(pred);
      cluster.add(pred);
      entryToClusterMap.put(pred, new Pair<>(pv, cluster));
    }
  }
  return clustersMap;
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class DiSH method buildHierarchy.
/**
 * Builds the cluster hierarchy.
 * <p>
 * Every pair {@code (c_i, c_j)} with {@code i < j} is inspected. Only pairs
 * where {@code c_i} has the smaller subspace dimension value
 * ({@code dimensionality - subspace dimensionality}) are considered; in that
 * case {@code c_j} may be registered as parent of {@code c_i}.
 *
 * @param database the database containing the data objects
 * @param clustering Clustering we process
 * @param clusters the sorted list of clusters
 * @param dimensionality the dimensionality of the data
 * @throws RuntimeException if two clusters have the expected subspace
 *         dimension, but their weighted centroid distance is not within
 *         {@code 2 * epsilon}
 */
private void buildHierarchy(Relation<V> database, Clustering<SubspaceModel> clustering, List<Cluster<SubspaceModel>> clusters, int dimensionality) {
  final StringBuilder log = LOG.isDebugging() ? new StringBuilder() : null;
  final int db_dim = RelationUtil.dimensionality(database);
  Hierarchy<Cluster<SubspaceModel>> hier = clustering.getClusterHierarchy();

  for (int i = 0; i < clusters.size() - 1; i++) {
    final Cluster<SubspaceModel> child = clusters.get(i);
    final Subspace childSpace = child.getModel().getSubspace();
    final int childLevel = dimensionality - childSpace.dimensionality();
    NumberVector childCentroid = ProjectedCentroid.make(childSpace.getDimensions(), database, child.getIDs());
    long[] childPV = childSpace.getDimensions();

    for (int j = i + 1; j < clusters.size(); j++) {
      final Cluster<SubspaceModel> cand = clusters.get(j);
      final Subspace candSpace = cand.getModel().getSubspace();
      final int candLevel = dimensionality - candSpace.dimensionality();
      if (childLevel >= candLevel) {
        continue;
      }
      if (log != null) {
        log.append("\n l_i=").append(childLevel).append(" pv_i=[").append(BitsUtil.toStringLow(childSpace.getDimensions(), db_dim)).append(']');
        log.append("\n l_j=").append(candLevel).append(" pv_j=[").append(BitsUtil.toStringLow(candSpace.getDimensions(), db_dim)).append(']');
      }

      // noise level reached
      if (candSpace.dimensionality() == 0) {
        // no parents exists -> parent is noise
        if (hier.numParents(child) == 0) {
          clustering.addChildCluster(cand, child);
          if (log != null) {
            log.append("\n [").append(BitsUtil.toStringLow(candSpace.getDimensions(), db_dim));
            log.append("] is parent of [").append(BitsUtil.toStringLow(childSpace.getDimensions(), db_dim));
            log.append(']');
          }
        }
        continue;
      }

      NumberVector candCentroid = ProjectedCentroid.make(cand.getModel().getDimensions(), database, cand.getIDs());
      long[] candPV = candSpace.getDimensions();
      long[] commonPV = BitsUtil.andCMin(childPV, candPV);
      int subspaceDim = subspaceDimensionality(childCentroid, candCentroid, childPV, candPV, commonPV);
      double d = weightedDistance(childCentroid, candCentroid, commonPV);
      if (log != null) {
        log.append("\n dist = ").append(subspaceDim);
      }
      if (subspaceDim != candLevel) {
        continue;
      }
      if (log != null) {
        log.append("\n d = ").append(d);
      }
      // Written as negation so that a NaN distance is reported as well.
      if (!(d <= 2 * epsilon)) {
        throw new RuntimeException("Should never happen: d = " + d);
      }
      // Link only if the child has no parents yet, or isParent(...) is false
      // for its existing parents.
      if (hier.numParents(child) == 0 || !isParent(database, cand, hier.iterParents(child), db_dim)) {
        clustering.addChildCluster(cand, child);
        if (log != null) {
          log.append("\n [").append(BitsUtil.toStringLow(candSpace.getDimensions(), db_dim));
          log.append("] is parent of [");
          log.append(BitsUtil.toStringLow(childSpace.getDimensions(), db_dim));
          log.append(']');
        }
      }
    }
  }
  if (log != null) {
    LOG.debug(log.toString());
  }
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class KMLOutputHandler method writeClusteringResult.
/**
 * Writes the given clustering as a KML document.
 * <p>
 * The document consists of a name and description, one {@code Style} element
 * per cluster (id {@code s<index>}), and one {@code Placemark} with a
 * polygon per cluster. If there is exactly one top level cluster, no
 * placemark is written for it.
 *
 * @param xmlw the XML stream writer to write to
 * @param clustering the clustering to export
 * @param database the database to obtain the 2D coordinate relation from
 * @throws XMLStreamException if the writer fails
 */
private void writeClusteringResult(XMLStreamWriter xmlw, Clustering<Model> clustering, Database database) throws XMLStreamException {
  xmlw.writeStartDocument();
  xmlw.writeCharacters("\n");
  xmlw.writeStartElement("kml");
  xmlw.writeDefaultNamespace("http://earth.google.com/kml/2.2");
  xmlw.writeStartElement("Document");

  // Document header.
  // TODO: can we automatically generate more helpful data here?
  xmlw.writeStartElement("name");
  xmlw.writeCharacters("ELKI KML output for " + clustering.getLongName());
  xmlw.writeEndElement(); // name
  writeNewlineOnDebug(xmlw);
  // TODO: e.g. list the settings in the description?
  xmlw.writeStartElement("description");
  xmlw.writeCharacters("ELKI KML output for " + clustering.getLongName());
  xmlw.writeEndElement(); // description
  writeNewlineOnDebug(xmlw);

  // Compute the hull polygons, starting at the top level clusters.
  List<Cluster<Model>> allClusters = clustering.getAllClusters();
  Relation<NumberVector> coords = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD_2D);
  List<Cluster<Model>> toplevel = clustering.getToplevelClusters();
  Hierarchy<Cluster<Model>> hier = clustering.getClusterHierarchy();
  Map<Object, DoubleObjPair<Polygon>> hullmap = new HashMap<>();
  for (Cluster<Model> top : toplevel) {
    buildHullsRecursively(top, hier, hullmap, coords);
  }

  // One style per cluster; the style index is the position in allClusters.
  // TODO: generate styles from color scheme
  final double projarea = 360. * 180. * .01;
  int styleIndex = 0;
  for (Cluster<Model> clus : allClusters) {
    // This is a prime based magic number, to produce a colorful output
    Color col = Color.getHSBColor(styleIndex / 4.294967291f, 1.f, .5f);
    DoubleObjPair<Polygon> hull = hullmap.get(clus);
    // Approximate area (using bounding box)
    double hullarea = SpatialUtil.volume(hull.second);
    final double relativeArea = Math.max(1. - (hullarea / projarea), 0.);
    final double opacity = .65 * FastMath.sqrt(relativeArea) + .1;

    xmlw.writeStartElement("Style");
    xmlw.writeAttribute("id", "s" + styleIndex);
    writeNewlineOnDebug(xmlw);

    xmlw.writeStartElement("LineStyle");
    xmlw.writeStartElement("width");
    xmlw.writeCharacters("0");
    xmlw.writeEndElement(); // width
    xmlw.writeEndElement(); // LineStyle
    writeNewlineOnDebug(xmlw);

    xmlw.writeStartElement("PolyStyle");
    xmlw.writeStartElement("color");
    // KML uses AABBGGRR format!
    final int alpha = (int) (255 * Math.min(.75, opacity));
    xmlw.writeCharacters(String.format("%02x%02x%02x%02x", alpha, col.getBlue(), col.getGreen(), col.getRed()));
    xmlw.writeEndElement(); // color
    xmlw.writeStartElement("outline");
    xmlw.writeCharacters("0");
    xmlw.writeEndElement(); // outline
    xmlw.writeEndElement(); // PolyStyle
    writeNewlineOnDebug(xmlw);

    xmlw.writeEndElement(); // Style
    writeNewlineOnDebug(xmlw);
    styleIndex++;
  }

  // One placemark per cluster, referencing the style with the same index.
  final Cluster<?> ignore = toplevel.size() == 1 ? toplevel.get(0) : null;
  int cnum = -1;
  for (Cluster<?> c : allClusters) {
    // Count skipped clusters too, to stay aligned with the style ids.
    cnum++;
    // Ignore sole toplevel cluster (usually: noise)
    if (c == ignore) {
      continue;
    }
    Polygon poly = hullmap.get(c).second;

    xmlw.writeStartElement("Placemark");
    xmlw.writeStartElement("name");
    xmlw.writeCharacters(c.getNameAutomatic());
    xmlw.writeEndElement(); // name
    xmlw.writeStartElement("description");
    xmlw.writeCData(makeDescription(c).toString());
    xmlw.writeEndElement(); // description
    xmlw.writeStartElement("styleUrl");
    xmlw.writeCharacters("#s" + cnum);
    xmlw.writeEndElement(); // styleUrl

    xmlw.writeStartElement("Polygon");
    writeNewlineOnDebug(xmlw);
    if (compat) {
      xmlw.writeStartElement("altitudeMode");
      xmlw.writeCharacters("relativeToGround");
      xmlw.writeEndElement(); // altitudeMode
      writeNewlineOnDebug(xmlw);
    }

    xmlw.writeStartElement("outerBoundaryIs");
    xmlw.writeStartElement("LinearRing");
    xmlw.writeStartElement("coordinates");
    // Walk the vertices backwards when testClockwise() is non-negative.
    final boolean backwards = (poly.testClockwise() >= 0);
    ArrayListIter<double[]> vertices = poly.iter();
    if (backwards) {
      vertices.seek(poly.size() - 1);
    }
    while (vertices.valid()) {
      double[] v = vertices.get();
      xmlw.writeCharacters(FormatUtil.format(v, ","));
      // In compat mode, 2D vertices get a constant third coordinate.
      if (compat && (v.length == 2)) {
        xmlw.writeCharacters(",100");
      }
      xmlw.writeCharacters(" ");
      if (backwards) {
        vertices.retract();
      } else {
        vertices.advance();
      }
    }
    xmlw.writeEndElement(); // coordinates
    xmlw.writeEndElement(); // LinearRing
    xmlw.writeEndElement(); // outerBoundaryIs
    writeNewlineOnDebug(xmlw);
    xmlw.writeEndElement(); // Polygon

    xmlw.writeEndElement(); // Placemark
    writeNewlineOnDebug(xmlw);
  }

  xmlw.writeEndElement(); // Document
  xmlw.writeEndElement(); // kml
  xmlw.writeEndDocument();
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class GreedyEnsembleExperiment method run.
/**
 * Runs the greedy ensemble experiment.
 * <p>
 * The first object of the label relation must be labeled "bylabel"; its
 * vector serves as reference for the ROC AUC and cost evaluation. All other
 * vectors are ensemble member candidates. The method
 * <ol>
 * <li>merges the top ranked dimensions of all members until enough
 * dimensions collected {@code minvote} votes, and derives the estimated
 * weights and target vector from these votes,</li>
 * <li>evaluates each single member and the naive ensemble of all
 * members,</li>
 * <li>greedily grows an ensemble, starting with the member closest to the
 * estimated target vector,</li>
 * <li>compares the result to 1000 random ensembles of the same size.</li>
 * </ol>
 * Results are reported via the logger only.
 *
 * @throws AbortException if the first label is not "bylabel", or if there
 *         are no ensemble members besides the reference vector
 */
@Override
public void run() {
  // Note: the database contains the *result vectors*, not the original data.
  final Database database = inputstep.getDatabase();
  Relation<NumberVector> relation = database.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
  final Relation<String> labels = DatabaseUtil.guessLabelRepresentation(database);
  final DBID firstid = DBIDUtil.deref(labels.iterDBIDs());
  final String firstlabel = labels.get(firstid);
  // Null-safe comparison: a missing label is reported like a wrong label.
  if (!"bylabel".equals(firstlabel)) {
    throw new AbortException("No 'by label' reference outlier found, which is needed for weighting!");
  }
  relation = applyPrescaling(prescaling, relation, firstid);
  final int numcand = relation.size() - 1;
  // Without candidates, the vote merging below could never make progress and
  // there would be no member to initialize the ensemble with.
  if (numcand < 1) {
    throw new AbortException("No ensemble members found besides the 'by label' reference.");
  }
  // Dimensionality and reference vector
  final int dim = RelationUtil.dimensionality(relation);
  final NumberVector refvec = relation.get(firstid);
  // Build the positive index set for ROC AUC.
  VectorNonZero positive = new VectorNonZero(refvec);
  final int desired_outliers = (int) (rate * dim);
  int union_outliers = 0;
  final int[] outliers_seen = new int[dim];
  // Merge the top-k for each ensemble member, until we have enough
  // candidates.
  {
    int k = 0;
    ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
    if (minvote >= numcand) {
      minvote = Math.max(1, numcand - 1);
    }
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      // Skip "by label", obviously
      if (DBIDUtil.equal(firstid, iditer)) {
        continue;
      }
      iters.add(new DecreasingVectorIter(relation.get(iditer)));
    }
    loop: while (union_outliers < desired_outliers) {
      for (DecreasingVectorIter iter : iters) {
        if (!iter.valid()) {
          LOG.warning("Union_outliers=" + union_outliers + " < desired_outliers=" + desired_outliers + " minvote=" + minvote);
          break loop;
        }
        int cur = iter.dim();
        outliers_seen[cur] += 1;
        if (outliers_seen[cur] == minvote) {
          union_outliers += 1;
        }
        iter.advance();
      }
      k++;
    }
    LOG.verbose("Merged top " + k + " outliers to: " + union_outliers + " outliers (desired: at least " + desired_outliers + ")");
  }
  // Build the final weight vector.
  final double[] estimated_weights = new double[dim];
  final double[] estimated_truth = new double[dim];
  updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
  DoubleVector estimated_truth_vec = DoubleVector.wrap(estimated_truth);
  PrimitiveDistanceFunction<NumberVector> wdist = getDistanceFunction(estimated_weights);
  PrimitiveDistanceFunction<NumberVector> tdist = wdist;
  // Build the naive ensemble:
  final double[] naiveensemble = new double[dim];
  {
    double[] buf = new double[numcand];
    for (int d = 0; d < dim; d++) {
      int i = 0;
      for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
        if (DBIDUtil.equal(firstid, iditer)) {
          continue;
        }
        final NumberVector vec = relation.get(iditer);
        buf[i] = vec.doubleValue(d);
        i++;
      }
      naiveensemble[d] = voting.combine(buf, i);
      if (Double.isNaN(naiveensemble[d])) {
        LOG.warning("NaN after combining: " + FormatUtil.format(buf) + " i=" + i + " " + voting.toString());
      }
    }
  }
  DoubleVector naivevec = DoubleVector.wrap(naiveensemble);
  // Compute single AUC scores and estimations.
  // Remember the method most similar to the estimation
  double bestauc = 0.0;
  String bestaucstr = "";
  double bestcost = Double.POSITIVE_INFINITY;
  String bestcoststr = "";
  DBID bestid = null;
  double bestest = Double.POSITIVE_INFINITY;
  {
    final double[] greedyensemble = new double[dim];
    // Compute individual scores
    for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
      if (DBIDUtil.equal(firstid, iditer)) {
        continue;
      }
      final NumberVector vec = relation.get(iditer);
      singleEnsemble(greedyensemble, vec);
      double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(DoubleVector.wrap(greedyensemble)));
      double estimated = wdist.distance(DoubleVector.wrap(greedyensemble), estimated_truth_vec);
      double cost = tdist.distance(DoubleVector.wrap(greedyensemble), refvec);
      LOG.verbose("ROC AUC: " + auc + " estimated " + estimated + " cost " + cost + " " + labels.get(iditer));
      if (auc > bestauc) {
        bestauc = auc;
        bestaucstr = labels.get(iditer);
      }
      if (cost < bestcost) {
        bestcost = cost;
        bestcoststr = labels.get(iditer);
      }
      if (estimated < bestest || bestid == null) {
        bestest = estimated;
        bestid = DBIDUtil.deref(iditer);
      }
    }
  }
  // Initialize ensemble with "best" method
  if (prescaling != null) {
    LOG.verbose("Input prescaling: " + prescaling);
  }
  LOG.verbose("Distance function: " + wdist);
  LOG.verbose("Ensemble voting: " + voting);
  if (scaling != null) {
    LOG.verbose("Ensemble rescaling: " + scaling);
  }
  LOG.verbose("Initial estimation of outliers: " + union_outliers);
  LOG.verbose("Initializing ensemble with: " + labels.get(bestid));
  ModifiableDBIDs ensemble = DBIDUtil.newArray(bestid);
  ModifiableDBIDs enscands = DBIDUtil.newHashSet(relation.getDBIDs());
  ModifiableDBIDs dropped = DBIDUtil.newHashSet(relation.size());
  dropped.add(firstid);
  enscands.remove(bestid);
  enscands.remove(firstid);
  final double[] greedyensemble = new double[dim];
  singleEnsemble(greedyensemble, relation.get(bestid));
  // Greedily grow the ensemble
  final double[] testensemble = new double[dim];
  while (enscands.size() > 0) {
    NumberVector greedyvec = DoubleVector.wrap(greedyensemble);
    final double oldd = wdist.distance(estimated_truth_vec, greedyvec);
    final int heapsize = enscands.size();
    ModifiableDoubleDBIDList heap = DBIDUtil.newDistanceDBIDList(heapsize);
    double[] tmp = new double[dim];
    for (DBIDIter iter = enscands.iter(); iter.valid(); iter.advance()) {
      final NumberVector vec = relation.get(iter);
      singleEnsemble(tmp, vec);
      // Diversity of the candidate (as single ensemble) to the current
      // ensemble. This must use tmp: comparing greedyensemble to greedyvec
      // would compare the current ensemble with itself.
      double diversity = wdist.distance(DoubleVector.wrap(tmp), greedyvec);
      heap.add(diversity, iter);
    }
    heap.sort();
    // Process candidates from the end of the sorted list; each tested
    // candidate is removed from the list by the loop update.
    for (DoubleDBIDListMIter it = heap.iter(); heap.size() > 0; it.remove()) {
      // Last
      it.seek(heap.size() - 1);
      enscands.remove(it);
      final NumberVector vec = relation.get(it);
      // Build combined ensemble.
      {
        double[] buf = new double[ensemble.size() + 1];
        for (int i = 0; i < dim; i++) {
          int j = 0;
          for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
            buf[j] = relation.get(iter).doubleValue(i);
            j++;
          }
          buf[j] = vec.doubleValue(i);
          testensemble[i] = voting.combine(buf, j + 1);
        }
      }
      applyScaling(testensemble, scaling);
      NumberVector testvec = DoubleVector.wrap(testensemble);
      double newd = wdist.distance(estimated_truth_vec, testvec);
      if (newd < oldd) {
        System.arraycopy(testensemble, 0, greedyensemble, 0, dim);
        ensemble.add(it);
        // Recompute heap
        break;
      } else {
        dropped.add(it);
        if (refine_truth) {
          // Update target vectors and weights
          ArrayList<DecreasingVectorIter> iters = new ArrayList<>(numcand);
          for (DBIDIter iditer = relation.iterDBIDs(); iditer.valid(); iditer.advance()) {
            // Skip "by label", obviously
            if (DBIDUtil.equal(firstid, iditer) || dropped.contains(iditer)) {
              continue;
            }
            iters.add(new DecreasingVectorIter(relation.get(iditer)));
          }
          // Keep at least one vote required, as in the initial merge;
          // a threshold of 0 could never be matched by a vote count.
          if (minvote >= iters.size()) {
            minvote = Math.max(1, iters.size() - 1);
          }
          union_outliers = 0;
          Arrays.fill(outliers_seen, 0);
          // The labeled break is required: once an iterator is exhausted,
          // no further votes can be collected, so the outer loop must end
          // too (a plain break would leave it spinning forever).
          refine: while (!iters.isEmpty() && union_outliers < desired_outliers) {
            for (DecreasingVectorIter iter : iters) {
              if (!iter.valid()) {
                break refine;
              }
              int cur = iter.dim();
              outliers_seen[cur] += 1;
              if (outliers_seen[cur] == minvote) {
                union_outliers += 1;
              }
              iter.advance();
            }
          }
          LOG.warning("New num outliers: " + union_outliers);
          updateEstimations(outliers_seen, union_outliers, estimated_weights, estimated_truth);
          estimated_truth_vec = DoubleVector.wrap(estimated_truth);
        }
      }
    }
  }
  // Build the improved ensemble:
  StringBuilder greedylbl = new StringBuilder();
  {
    for (DBIDIter iter = ensemble.iter(); iter.valid(); iter.advance()) {
      if (greedylbl.length() > 0) {
        greedylbl.append(' ');
      }
      greedylbl.append(labels.get(iter));
    }
  }
  DoubleVector greedyvec = DoubleVector.wrap(greedyensemble);
  if (refine_truth) {
    LOG.verbose("Estimated outliers remaining: " + union_outliers);
  }
  LOG.verbose("Greedy ensemble (" + ensemble.size() + "): " + greedylbl.toString());
  LOG.verbose("Best single ROC AUC: " + bestauc + " (" + bestaucstr + ")");
  LOG.verbose("Best single cost: " + bestcost + " (" + bestcoststr + ")");
  // Evaluate the naive ensemble and the "shrunk" ensemble
  double naiveauc, naivecost;
  {
    naiveauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(naivevec));
    naivecost = tdist.distance(naivevec, refvec);
    LOG.verbose("Naive ensemble AUC: " + naiveauc + " cost: " + naivecost);
    LOG.verbose("Naive ensemble Gain: " + gain(naiveauc, bestauc, 1) + " cost gain: " + gain(naivecost, bestcost, 0));
  }
  double greedyauc, greedycost;
  {
    greedyauc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(greedyvec));
    greedycost = tdist.distance(greedyvec, refvec);
    LOG.verbose("Greedy ensemble AUC: " + greedyauc + " cost: " + greedycost);
    LOG.verbose("Greedy ensemble Gain to best: " + gain(greedyauc, bestauc, 1) + " cost gain: " + gain(greedycost, bestcost, 0));
    LOG.verbose("Greedy ensemble Gain to naive: " + gain(greedyauc, naiveauc, 1) + " cost gain: " + gain(greedycost, naivecost, 0));
  }
  // Baseline: random ensembles of the same size as the greedy ensemble.
  {
    MeanVariance meanauc = new MeanVariance();
    MeanVariance meancost = new MeanVariance();
    HashSetModifiableDBIDs candidates = DBIDUtil.newHashSet(relation.getDBIDs());
    candidates.remove(firstid);
    for (int i = 0; i < 1000; i++) {
      // Build a random ensemble, using the iteration number as seed:
      final double[] randomensemble = new double[dim];
      {
        DBIDs random = DBIDUtil.randomSample(candidates, ensemble.size(), (long) i);
        double[] buf = new double[random.size()];
        for (int d = 0; d < dim; d++) {
          int j = 0;
          for (DBIDIter iter = random.iter(); iter.valid(); iter.advance()) {
            assert (!DBIDUtil.equal(firstid, iter));
            final NumberVector vec = relation.get(iter);
            buf[j] = vec.doubleValue(d);
            j++;
          }
          randomensemble[d] = voting.combine(buf, j);
        }
      }
      applyScaling(randomensemble, scaling);
      NumberVector randomvec = DoubleVector.wrap(randomensemble);
      double auc = ROCEvaluation.computeROCAUC(positive, new DecreasingVectorIter(randomvec));
      meanauc.put(auc);
      double cost = tdist.distance(randomvec, refvec);
      meancost.put(cost);
    }
    LOG.verbose("Random ensemble AUC: " + meanauc.getMean() + " + stddev: " + meanauc.getSampleStddev() + " = " + (meanauc.getMean() + meanauc.getSampleStddev()));
    LOG.verbose("Random ensemble Gain: " + gain(meanauc.getMean(), bestauc, 1));
    LOG.verbose("Greedy improvement: " + (greedyauc - meanauc.getMean()) / meanauc.getSampleStddev() + " standard deviations.");
    // The sum uses the cost statistics on both sides (not the AUC stddev).
    LOG.verbose("Random ensemble Cost: " + meancost.getMean() + " + stddev: " + meancost.getSampleStddev() + " = " + (meancost.getMean() + meancost.getSampleStddev()));
    LOG.verbose("Random ensemble Gain: " + gain(meancost.getMean(), bestcost, 0));
    LOG.verbose("Greedy improvement: " + (meancost.getMean() - greedycost) / meancost.getSampleStddev() + " standard deviations.");
    LOG.verbose("Naive ensemble Gain to random: " + gain(naiveauc, meanauc.getMean(), 1) + " cost gain: " + gain(naivecost, meancost.getMean(), 0));
    LOG.verbose("Random ensemble Gain to naive: " + gain(meanauc.getMean(), naiveauc, 1) + " cost gain: " + gain(meancost.getMean(), naivecost, 0));
    LOG.verbose("Greedy ensemble Gain to random: " + gain(greedyauc, meanauc.getMean(), 1) + " cost gain: " + gain(greedycost, meancost.getMean(), 0));
  }
}
Aggregations