Search in sources :

Example 31 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class FilterUtil method guessFactory.

/**
 * Attempts to find a vector factory matching the given type information.
 * <p>
 * If the type information is a {@code VectorTypeInformation}, its own factory
 * is used when it is non-null. Otherwise the restriction class is inspected
 * reflectively for a public field named {@code FACTORY}, which is read as a
 * static field. Any failure of that lookup is logged as a warning and
 * {@code null} is returned.
 *
 * @param in Input type
 * @param <V> Vector type
 * @return Factory, or {@code null} if none could be determined
 */
@SuppressWarnings("unchecked")
public static <V extends NumberVector> NumberVector.Factory<V> guessFactory(SimpleTypeInformation<V> in) {
    // Preferred source: the type information carries its own factory.
    if (in instanceof VectorTypeInformation) {
        final NumberVector.Factory<V> declared = (NumberVector.Factory<V>) ((VectorTypeInformation<V>) in).getFactory();
        if (declared != null) {
            return declared;
        }
    }
    // FIXME: hack. Add factories to simple type information, too?
    // Fallback: read a public "FACTORY" field from the restriction class.
    try {
        final Field constant = in.getRestrictionClass().getField("FACTORY");
        return (NumberVector.Factory<V>) constant.get(null);
    } catch (Exception e) {
        LoggingUtil.warning("Cannot determine factory for type " + in.getRestrictionClass(), e);
        return null;
    }
}
Also used : Field(java.lang.reflect.Field) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) VectorTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation)

Example 32 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class EvaluatePrecomputedOutlierScores method run.

/**
 * Reads the input file through the parser and hands every object to
 * {@code processRow}, appending the output to {@code outfile}.
 * <p>
 * The output channel is locked before anything is written; a header is
 * written only when the channel position is 0. Each parsed bundle must
 * contain exactly one vector column and one label column, otherwise an
 * {@code AbortException} is thrown. I/O failures are rethrown as
 * {@code AbortException}.
 */
@Override
public void run() {
    try (FileInputStream rawInput = new FileInputStream(infile);
        InputStream input = new BufferedInputStream(FileUtil.tryGzipInput(rawInput));
        FileOutputStream rawOutput = new FileOutputStream(outfile, true);
        PrintStream out = new PrintStream(rawOutput);
        FileChannel channel = rawOutput.getChannel()) {
        // Attach the parser to the input.
        parser.initStream(input);
        // Take the lock on the output file before inspecting its position.
        channel.lock();
        if (channel.position() != 0L) {
            LOG.info("Appending to existing output " + outfile);
        } else {
            writeHeader(out);
        }
        int labelColumn = -1;
        int vectorColumn = -1;
        boolean more = true;
        while (more) {
            BundleStreamSource.Event event = parser.nextEvent();
            switch(event) {
                case END_OF_STREAM:
                    more = false;
                    break;
                case META_CHANGED: {
                    // Column layout changed: rediscover the vector and label columns.
                    BundleMeta layout = parser.getMeta();
                    labelColumn = -1;
                    vectorColumn = -1;
                    for (int col = 0; col < layout.size(); col++) {
                        SimpleTypeInformation<?> type = layout.get(col);
                        if (TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(type)) {
                            if (vectorColumn >= 0) {
                                throw new AbortException("More than one vector column.");
                            }
                            vectorColumn = col;
                            continue;
                        }
                        if (TypeUtil.GUESSED_LABEL.isAssignableFromType(type)) {
                            if (labelColumn >= 0) {
                                throw new AbortException("More than one label column.");
                            }
                            labelColumn = col;
                            continue;
                        }
                        throw new AbortException("Unexpected data column type: " + type);
                    }
                    break;
                }
                case NEXT_OBJECT: {
                    if (labelColumn < 0) {
                        throw new AbortException("No label column available.");
                    }
                    if (vectorColumn < 0) {
                        throw new AbortException("No vector column available.");
                    }
                    // Fetch the vector first, then the label.
                    NumberVector scores = (NumberVector) parser.data(vectorColumn);
                    String label = parser.data(labelColumn).toString();
                    processRow(out, scores, label);
                    break;
                }
            }
        }
    } catch (IOException e) {
        throw new AbortException("IO error.", e);
    }
}
Also used : BundleMeta(de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta) FileChannel(java.nio.channels.FileChannel) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 33 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class GreedyEnsembleExperiment method applyPrescaling.

/**
 * Builds a new relation in which every vector has been passed through the
 * given scaling function, except for the objects listed in {@code skip},
 * which are copied without scaling.
 * <p>
 * When {@code scaling} is {@code null} the input relation itself is returned.
 *
 * @param scaling Scaling function, may be {@code null}
 * @param relation Relation to read
 * @param skip DBIDs to pass unmodified
 * @return New relation labelled "rescaled", or {@code relation} if no scaling
 *         function was given
 */
public static Relation<NumberVector> applyPrescaling(ScalingFunction scaling, Relation<NumberVector> relation, DBIDs skip) {
    if (scaling == null) {
        // Nothing to do.
        return relation;
    }
    final NumberVector.Factory<NumberVector> vectorFactory = RelationUtil.getNumberVectorFactory(relation);
    final DBIDs allIds = relation.getDBIDs();
    final WritableDataStore<NumberVector> rescaled = DataStoreUtil.makeStorage(allIds, DataStoreFactory.HINT_HOT, NumberVector.class);
    DBIDIter it = allIds.iter();
    while (it.valid()) {
        final double[] values = relation.get(it).toArray();
        final boolean keepAsIs = skip.contains(it);
        if (!keepAsIs) {
            applyScaling(values, scaling);
        }
        final NumberVector rebuilt = vectorFactory.newNumberVector(values, ArrayLikeUtil.DOUBLEARRAYADAPTER);
        rescaled.put(it, rebuilt);
        it.advance();
    }
    return new MaterializedRelation<>(relation.getDataTypeInformation(), allIds, "rescaled", rescaled);
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) DBIDs(de.lmu.ifi.dbs.elki.database.ids.DBIDs) HashSetModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.HashSetModifiableDBIDs) ModifiableDBIDs(de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) MaterializedRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation)

Example 34 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class HopkinsStatisticClusteringTendency method run.

/**
 * Computes the Hopkins statistic over {@code rep} repetitions and reports the
 * results through the statistics logger.
 * <p>
 * Each repetition yields a value {@code w} from the real data and a value
 * {@code u} from uniformly generated data, combined as {@code u / (u + w)}.
 * Means (and, when more than one repetition is run, sample standard
 * deviations) are logged, followed by a p-value derived from the regularized
 * incomplete beta function.
 *
 * @param database Database context
 * @param relation Relation to analyze
 * @return always {@code null}; results are only emitted as log statistics
 */
public Result run(Database database, Relation<NumberVector> relation) {
    final int dim = RelationUtil.dimensionality(relation);
    final DistanceQuery<NumberVector> distances = database.getDistanceQuery(relation, getDistanceFunction());
    final KNNQuery<NumberVector> neighbors = database.getKNNQuery(distances, k + 1);
    final double[] lower = new double[dim];
    final double[] range = new double[dim];
    initializeDataExtends(relation, dim, lower, range);
    if (!LOG.isStatistics()) {
        LOG.warning("This algorithm must be used with at least logging level " + Level.STATISTICS);
    }
    final MeanVariance hStats = new MeanVariance();
    final MeanVariance uStats = new MeanVariance();
    final MeanVariance wStats = new MeanVariance();
    // Repeat the experiment and aggregate the outcomes.
    int round = 0;
    while (round < this.rep) {
        // Nearest-neighbor distances for objects sampled from the database
        final double real = computeNNForRealData(neighbors, relation, dim);
        // Nearest-neighbor distances for newly generated uniform objects
        final double uniform = computeNNForUniformData(neighbors, lower, range);
        // Hopkins statistic of this round
        final double hopkins = uniform / (uniform + real);
        hStats.put(hopkins);
        uStats.put(uniform);
        wStats.put(real);
        round++;
    }
    final String prefix = this.getClass().getName();
    LOG.statistics(new LongStatistic(prefix + ".samplesize", sampleSize));
    LOG.statistics(new LongStatistic(prefix + ".dim", dim));
    LOG.statistics(new LongStatistic(prefix + ".hopkins.nearest-neighbor", k));
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.h.mean", hStats.getMean()));
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.u.mean", uStats.getMean()));
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.w.mean", wStats.getMean()));
    // Standard deviations are only reported for more than one repetition.
    if (rep > 1) {
        LOG.statistics(new DoubleStatistic(prefix + ".hopkins.h.std", hStats.getSampleStddev()));
        LOG.statistics(new DoubleStatistic(prefix + ".hopkins.u.std", uStats.getSampleStddev()));
        LOG.statistics(new DoubleStatistic(prefix + ".hopkins.w.std", wStats.getSampleStddev()));
    }
    // Evaluate:
    final double meanH = hStats.getMean();
    // See Hopkins for a proof that the statistic is supposedly Beta distributed.
    final double cdf = BetaDistribution.regularizedIncBeta(meanH, sampleSize, sampleSize);
    final double p;
    if (meanH > .5) {
        p = 1. - cdf;
    } else {
        p = cdf;
    }
    LOG.statistics(new DoubleStatistic(prefix + ".hopkins.p", p));
    return null;
}
Also used : DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) MeanVariance(de.lmu.ifi.dbs.elki.math.MeanVariance) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 35 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class KMeansMinusMinus method meansWithTreshhold.

/**
 * Returns the mean vectors of the given clusters in the given database,
 * ignoring every cluster member whose stored value is at or above the
 * threshold.
 * <p>
 * If no member of a cluster remains below the threshold, the previous mean
 * of that cluster is kept.
 *
 * @param clusters the clusters to compute the means
 * @param means the recent means, used as fallback for clusters without any
 *        retained member
 * @param database the database containing the vectors
 * @param tresh threshold; members with a value {@code >= tresh} are skipped
 * @return the mean vectors of the given clusters in the given database
 */
protected double[][] meansWithTreshhold(List<? extends ModifiableDoubleDBIDList> clusters, double[][] means, Relation<V> database, Double tresh) {
    // TODO: use Kahan summation for better numerical precision?
    double[][] newMeans = new double[k][];
    for (int i = 0; i < k; i++) {
        DoubleDBIDList list = clusters.get(i);
        double[] raw = null;
        int count = 0;
        // Update with remaining instances
        for (DoubleDBIDListIter iter = list.iter(); iter.valid(); iter.advance()) {
            if (iter.doubleValue() >= tresh) {
                continue;
            }
            NumberVector vec = database.get(iter);
            if (raw == null) {
                // Initialize the running sum with the first retained vector.
                // It must not be added a second time below, otherwise it
                // would carry double weight in the mean.
                // NOTE(review): relies on toArray() returning an array that
                // is safe to modify, as the previous code already did.
                raw = vec.toArray();
            } else {
                for (int j = 0; j < raw.length; j++) {
                    raw[j] += vec.doubleValue(j);
                }
            }
            count++;
        }
        // count > 0 whenever raw != null, so the division is safe.
        newMeans[i] = (raw != null) ? VMath.timesEquals(raw, 1.0 / count) : means[i];
    }
    return newMeans;
}
Also used : DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) DoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDList)

Aggregations

NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)85 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)40 ArrayList (java.util.ArrayList)16 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)9 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)8 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)8 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)8 Database (de.lmu.ifi.dbs.elki.database.Database)7 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)7 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)7 Random (java.util.Random)7 Test (org.junit.Test)7 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)5 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)5 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)5 EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult)5 MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup)5 List (java.util.List)5 SparseNumberVector (de.lmu.ifi.dbs.elki.data.SparseNumberVector)4 RandomProjectionFamily (de.lmu.ifi.dbs.elki.data.projection.random.RandomProjectionFamily)4