Search in sources :

Example 6 with Distribution

use of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in project elki by elki-project.

In class GeneratorXMLSpec, method writeClusters:

/**
 * Write the resulting clusters to an output stream.
 *
 * The model column of the bundle is located first. The objects are then
 * grouped by their model (models are listed in order of first appearance),
 * and each group is written as a block of {@code ##} header lines followed by
 * one line per object. The model column itself is not written; the remaining
 * columns are separated by a single space.
 *
 * @param outStream output stream
 * @param data Generated data
 * @throws IOException thrown on write errors
 * @throws AbortException if the bundle has no model column
 */
public void writeClusters(OutputStreamWriter outStream, MultipleObjectsBundle data) throws IOException {
    // Find model column
    int modelcol = -1;
    for (int i = 0; i < data.metaLength(); i++) {
        if (Model.TYPE.isAssignableFromType(data.meta(i))) {
            modelcol = i;
            break;
        }
    }
    if (modelcol < 0) {
        throw new AbortException("No model column found in bundle.");
    }
    // Build a map from model to the row indexes of the actual objects. The
    // list preserves the order in which the models were first seen.
    ArrayList<Model> models = new ArrayList<>();
    Map<Model, IntArrayList> modelMap = new HashMap<>();
    for (int i = 0; i < data.dataLength(); i++) {
        Model model = (Model) data.data(i, modelcol);
        IntArrayList modelids = modelMap.get(model);
        if (modelids == null) {
            models.add(model);
            modelids = new IntArrayList();
            modelMap.put(model, modelids);
        }
        modelids.add(i);
    }
    // compute global discard values
    int totalsize = 0;
    int totaldisc = 0;
    for (Entry<Model, IntArrayList> ent : modelMap.entrySet()) {
        totalsize += ent.getValue().size();
        if (ent.getKey() instanceof GeneratorSingleCluster) {
            totaldisc += ((GeneratorSingleCluster) ent.getKey()).getDiscarded();
        }
    }
    // Ratio of (written + discarded) objects to written objects, over all
    // models. Only used inside the per-model loop below, which does not run
    // when the bundle is empty.
    double globdens = (double) (totalsize + totaldisc) / totalsize;
    outStream.append("########################################################").append(LINE_SEPARATOR);
    outStream.append("## Number of clusters: " + models.size()).append(LINE_SEPARATOR);
    for (Model model : models) {
        IntArrayList ids = modelMap.get(model);
        outStream.append("########################################################").append(LINE_SEPARATOR);
        outStream.append("## Size: " + ids.size()).append(LINE_SEPARATOR);
        if (model instanceof GeneratorSingleCluster) {
            // Generator models carry extra metadata worth recording in the header.
            GeneratorSingleCluster cursclus = (GeneratorSingleCluster) model;
            outStream.append("########################################################").append(LINE_SEPARATOR);
            outStream.append("## Cluster: ").append(cursclus.getName()).append(LINE_SEPARATOR);
            double[] cmin = cursclus.getClipmin();
            double[] cmax = cursclus.getClipmax();
            if (cmin != null && cmax != null) {
                outStream.append("## Clipping: ").append(FormatUtil.format(cmin)).append(" - ").append(FormatUtil.format(cmax)).append(LINE_SEPARATOR);
            }
            outStream.append("## Density correction factor: " + cursclus.getDensityCorrection()).append(LINE_SEPARATOR);
            outStream.append("## Generators:").append(LINE_SEPARATOR);
            for (int i = 0; i < cursclus.getDim(); i++) {
                Distribution gen = cursclus.getDistribution(i);
                outStream.append("##   ").append(gen.toString()).append(LINE_SEPARATOR);
            }
            if (cursclus.getTransformation() != null && cursclus.getTransformation().getTransformation() != null) {
                outStream.append("## Affine transformation matrix:").append(LINE_SEPARATOR);
                outStream.append(FormatUtil.format(cursclus.getTransformation().getTransformation(), "## ")).append(LINE_SEPARATOR);
            }
            outStream.append("## Discards: " + cursclus.getDiscarded() + " Retries left: " + cursclus.getRetries()).append(LINE_SEPARATOR);
            // Per-cluster (size + discarded) / size ratio, relative to the global ratio.
            double corf = (double) (cursclus.getSize() + cursclus.getDiscarded()) / cursclus.getSize() / globdens;
            outStream.append("## Density correction factor estimation: " + corf).append(LINE_SEPARATOR);
        }
        outStream.append("########################################################").append(LINE_SEPARATOR);
        for (IntIterator iter = ids.iterator(); iter.hasNext(); ) {
            int num = iter.nextInt();
            // Track whether a column has been written on this line already, so
            // that no leading separator is produced when the model column
            // happens to be column 0.
            boolean first = true;
            for (int c = 0; c < data.metaLength(); c++) {
                if (c == modelcol) {
                    continue;
                }
                if (!first) {
                    outStream.append(' ');
                }
                first = false;
                outStream.append(data.data(num, c).toString());
            }
            outStream.append(LINE_SEPARATOR);
        }
    }
}
Also used : IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) GeneratorSingleCluster(de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorSingleCluster) Distribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution) Model(de.lmu.ifi.dbs.elki.data.model.Model) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 7 with Distribution

use of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in project elki by elki-project.

In class TrivialGeneratedOutlier, method run:

/**
 * Run the algorithm.
 *
 * Collects the distinct {@link GeneratorSingleCluster} models found in the
 * model relation and, for every object, takes the minimum over these
 * generators of the chi-squared CDF of the squared standardized distance to
 * the generator. Only normal distributions are supported.
 *
 * @param models Model relation
 * @param vecs Vector relation
 * @param labels Label relation (not read by this method)
 * @return Outlier result
 * @throws AbortException if a generator uses a non-normal distribution
 */
public OutlierResult run(Relation<Model> models, Relation<NumberVector> vecs, Relation<?> labels) {
    final WritableDoubleDataStore outlierScores = DataStoreUtil.makeDoubleStorage(models.getDBIDs(), DataStoreFactory.HINT_HOT);
    // Gather the distinct generator models occurring in the data.
    final HashSet<GeneratorSingleCluster> clusterGens = new HashSet<>();
    for (DBIDIter it = models.iterDBIDs(); it.valid(); it.advance()) {
        final Model m = models.get(it);
        if (m instanceof GeneratorSingleCluster) {
            clusterGens.add((GeneratorSingleCluster) m);
        }
    }
    if (clusterGens.isEmpty()) {
        LOG.warning("No generator models found for dataset - all points will be considered outliers.");
    }
    // Validate up front that every generator axis is normally distributed.
    for (GeneratorSingleCluster cluster : clusterGens) {
        for (int axis = 0; axis < cluster.getDim(); axis++) {
            final Distribution candidate = cluster.getDistribution(axis);
            if (candidate instanceof NormalDistribution) {
                continue;
            }
            throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + candidate);
        }
    }
    for (DBIDIter it = models.iterDBIDs(); it.valid(); it.advance()) {
        final double[] point = vecs.get(it).toArray();
        double best = 1.;
        for (GeneratorSingleCluster cluster : clusterGens) {
            // Map the point back into the generator's untransformed space, if needed.
            final double[] local = cluster.getTransformation() == null ? point : cluster.getTransformation().applyInverse(point);
            double sumsq = 0.0;
            int dof = 0;
            for (int axis = 0; axis < local.length; axis++) {
                final Distribution candidate = cluster.getDistribution(axis);
                if (!(candidate instanceof NormalDistribution)) {
                    throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + candidate);
                }
                final NormalDistribution normal = (NormalDistribution) candidate;
                final double z = (local[axis] - normal.getMean()) / normal.getStddev();
                sumsq += z * z;
                ++dof;
            }
            // The squared distances are ChiSquared distributed; without any
            // dimension the score collapses to 0.
            best = dof > 0 ? Math.min(best, ChiSquaredDistribution.cdf(sumsq, dof)) : 0.;
        }
        if (expect < 1) {
            best = expect * best / (1 - best + expect);
        }
        outlierScores.putDouble(it, best);
    }
    final DoubleRelation relation = new MaterializedDoubleRelation("Model outlier scores", "model-outlier", outlierScores, models.getDBIDs());
    final OutlierScoreMeta scoreMeta = new ProbabilisticOutlierScore(0., 1.);
    return new OutlierResult(scoreMeta, relation);
}
Also used : WritableDoubleDataStore(de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore) GeneratorSingleCluster(de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorSingleCluster) OutlierResult(de.lmu.ifi.dbs.elki.result.outlier.OutlierResult) ProbabilisticOutlierScore(de.lmu.ifi.dbs.elki.result.outlier.ProbabilisticOutlierScore) DoubleRelation(de.lmu.ifi.dbs.elki.database.relation.DoubleRelation) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) OutlierScoreMeta(de.lmu.ifi.dbs.elki.result.outlier.OutlierScoreMeta) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) NormalDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution) Distribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution) NormalDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution) ChiSquaredDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.ChiSquaredDistribution) Model(de.lmu.ifi.dbs.elki.data.model.Model) MaterializedDoubleRelation(de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation) HashSet(java.util.HashSet) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 8 with Distribution

use of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in project elki by elki-project.

In class AttributeWiseBetaNormalization, method filter:

/**
 * Normalize every number-vector column of the bundle in place.
 *
 * For each such column, a distribution is fitted per dimension via
 * {@code findBestFit}. Each value is then replaced by the quantile, under a
 * symmetric Beta(p, p) distribution with {@code p = alpha^(-1/sqrt(dim))}, of
 * its CDF value under the fitted distribution. Columns of other types are
 * left untouched, and an empty bundle is returned as is.
 *
 * @param objects Bundle to normalize
 * @return the same bundle instance, with its vector columns rewritten
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    if (objects.dataLength() == 0) {
        return objects;
    }
    for (int col = 0; col < objects.metaLength(); col++) {
        final SimpleTypeInformation<?> meta = (SimpleTypeInformation<?>) objects.meta(col);
        final List<?> rawColumn = (List<?>) objects.getColumn(col);
        if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(meta)) {
            // Not a vector field: nothing to normalize in this column.
            continue;
        }
        @SuppressWarnings("unchecked")
        final List<V> vectors = (List<V>) rawColumn;
        // Get the replacement type information
        @SuppressWarnings("unchecked")
        final VectorFieldTypeInformation<V> vectorType = (VectorFieldTypeInformation<V>) meta;
        factory = FilterUtil.guessFactory(vectorType);
        final int dim = vectorType.getDimensionality();
        // Fit one distribution per dimension.
        dists = new ArrayList<>(dim);
        // Scratch space, shared by all goodness-of-fit tests
        final double[] scratch = new double[vectors.size()];
        // We iterate over dimensions, this kind of filter needs fast random
        // access.
        final Adapter adapter = new Adapter();
        for (int axis = 0; axis < dim; axis++) {
            adapter.dim = axis;
            final Distribution fitted = findBestFit(vectors, adapter, axis, scratch);
            if (LOG.isVerbose()) {
                LOG.verbose("Best fit for dimension " + axis + ": " + fitted.toString());
            }
            dists.add(fitted);
        }
        // Beta distribution for projection
        final double shape = FastMath.pow(alpha, -1 / FastMath.sqrt(dim));
        final BetaDistribution beta = new BetaDistribution(shape, shape);
        // Normalization scan; the buffer is reused for every object.
        final double[] projected = new double[dim];
        for (int row = 0; row < objects.dataLength(); row++) {
            final V vec = vectors.get(row);
            for (int axis = 0; axis < dim; axis++) {
                // TODO: when available, use logspace for better numerical precision!
                final double cdf = dists.get(axis).cdf(vec.doubleValue(axis));
                projected[axis] = beta.quantile(cdf);
            }
            vectors.set(row, factory.newNumberVector(projected));
        }
    }
    return objects;
}
Also used : SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) BetaDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.BetaDistribution) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) BetaDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.BetaDistribution) Distribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution) ArrayList(java.util.ArrayList) List(java.util.List)

Example 9 with Distribution

use of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in project elki by elki-project.

In class AttributeWiseCDFNormalization, method findBestFit:

/**
 * Find the best fitting distribution.
 *
 * If exactly one estimator is configured, its estimate is returned directly
 * without any goodness-of-fit test. Otherwise every estimator is tried: fits
 * whose CDF yields NaN or infinite values on the data, or whose estimation
 * throws an {@link ArithmeticException}, are skipped. Of the remaining
 * candidates, the one with the smallest
 * {@code KolmogorovSmirnovTest.simpleTest} value is returned.
 *
 * @param col Column of table
 * @param adapter Adapter for accessing the data
 * @param d Dimension
 * @param test Scratch space for testing goodness of fit; overwritten
 * @return Best fit distribution, or {@code null} if several estimators are
 *         configured and none of them produced a usable fit
 */
protected Distribution findBestFit(final List<V> col, Adapter adapter, int d, double[] test) {
    if (estimators.size() == 1) {
        return estimators.get(0).estimate(col, adapter);
    }
    Distribution best = null;
    double bestq = Double.POSITIVE_INFINITY;
    trials: for (DistributionEstimator<?> est : estimators) {
        try {
            Distribution dist = est.estimate(col, adapter);
            // Evaluate the fitted CDF on every value; reject degenerate fits.
            for (int i = 0; i < test.length; i++) {
                test[i] = dist.cdf(col.get(i).doubleValue(d));
                if (Double.isNaN(test[i])) {
                    LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
                    continue trials;
                }
                if (Double.isInfinite(test[i])) {
                    LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
                    continue trials;
                }
            }
            // The test is applied to the sorted CDF values.
            Arrays.sort(test);
            double q = KolmogorovSmirnovTest.simpleTest(test);
            if (LOG.isVeryVerbose()) {
                LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
            }
            if (best == null || q < bestq) {
                best = dist;
                bestq = q;
            }
        } catch (ArithmeticException e) {
            // A failing estimator only disqualifies itself; try the next one.
            if (LOG.isVeryVerbose()) {
                LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
            }
        }
    }
    if (best == null) {
        // Every candidate was skipped. Report this instead of dereferencing
        // null in the verbose log below.
        LOG.warning("No distribution could be fitted for dimension " + d + ".");
    } else if (LOG.isVerbose()) {
        LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
    }
    return best;
}
Also used : Distribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution) UniformDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution) DistributionEstimator(de.lmu.ifi.dbs.elki.math.statistics.distribution.estimator.DistributionEstimator)

Example 10 with Distribution

use of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in project elki by elki-project.

In class BestFitEstimatorTest, method testInf:

/**
 * Estimating from a single positive-infinity sample is expected to raise an
 * {@link ArithmeticException}. The assertion is only reached if the
 * estimator returns normally, in which case the test fails anyway because
 * the expected exception was not thrown.
 */
@Test(expected = ArithmeticException.class)
public void testInf() {
    final BestFitEstimator estimator = init();
    final double[] sample = { Double.POSITIVE_INFINITY };
    final Distribution fitted = estimator.estimate(sample, DoubleArrayAdapter.STATIC);
    assertEquals("Wrong class of distribution", UniformDistribution.class, fitted.getClass());
}
Also used : UniformDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution) Distribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution) NormalDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution) Test(org.junit.Test)

Aggregations

Distribution (de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution)15 UniformDistribution (de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution)12 NormalDistribution (de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution)11 Test (org.junit.Test)6 Random (java.util.Random)5 GammaDistribution (de.lmu.ifi.dbs.elki.math.statistics.distribution.GammaDistribution)4 HaltonUniformDistribution (de.lmu.ifi.dbs.elki.math.statistics.distribution.HaltonUniformDistribution)4 XMLNodeIterator (de.lmu.ifi.dbs.elki.utilities.xml.XMLNodeIterator)4 Element (org.w3c.dom.Element)4 Node (org.w3c.dom.Node)4 ArrayList (java.util.ArrayList)3 Model (de.lmu.ifi.dbs.elki.data.model.Model)2 GeneratorSingleCluster (de.lmu.ifi.dbs.elki.data.synthetic.bymodel.GeneratorSingleCluster)2 SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)2 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)2 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)2 List (java.util.List)2 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)1 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)1 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)1