Usage example of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in the elki project (elki-project):
class GeneratorXMLSpec, method writeClusters.
/**
 * Write the resulting clusters to an output stream.
 *
 * @param outStream output stream
 * @param data Generated data
 * @throws IOException thrown on write errors
 */
public void writeClusters(OutputStreamWriter outStream, MultipleObjectsBundle data) throws IOException {
  // Locate the bundle column that carries the generator model.
  int modelcol = -1;
  for (int i = 0; i < data.metaLength(); i++) {
    if (Model.TYPE.isAssignableFromType(data.meta(i))) {
      modelcol = i;
      break;
    }
  }
  if (modelcol < 0) {
    throw new AbortException("No model column found in bundle.");
  }
  // Group object indexes by their model; 'models' preserves first-seen order
  // so the output order is deterministic.
  ArrayList<Model> models = new ArrayList<>();
  Map<Model, IntArrayList> modelMap = new HashMap<>();
  for (int i = 0; i < data.dataLength(); i++) {
    Model model = (Model) data.data(i, modelcol);
    IntArrayList modelids = modelMap.get(model);
    if (modelids == null) {
      models.add(model);
      modelids = new IntArrayList();
      modelMap.put(model, modelids);
    }
    modelids.add(i);
  }
  // Compute the global density correction from retained + discarded points.
  int totalsize = 0, totaldisc = 0;
  for (Entry<Model, IntArrayList> ent : modelMap.entrySet()) {
    totalsize += ent.getValue().size();
    if (ent.getKey() instanceof GeneratorSingleCluster) {
      totaldisc += ((GeneratorSingleCluster) ent.getKey()).getDiscarded();
    }
  }
  double globdens = (double) (totalsize + totaldisc) / totalsize;
  outStream.append("########################################################").append(LINE_SEPARATOR);
  outStream.append("## Number of clusters: " + models.size()).append(LINE_SEPARATOR);
  for (Model model : models) {
    IntArrayList ids = modelMap.get(model);
    outStream.append("########################################################").append(LINE_SEPARATOR);
    outStream.append("## Size: " + ids.size()).append(LINE_SEPARATOR);
    if (model instanceof GeneratorSingleCluster) {
      // Emit the extended per-cluster header with generator metadata.
      GeneratorSingleCluster cursclus = (GeneratorSingleCluster) model;
      outStream.append("########################################################").append(LINE_SEPARATOR);
      outStream.append("## Cluster: ").append(cursclus.getName()).append(LINE_SEPARATOR);
      double[] cmin = cursclus.getClipmin();
      double[] cmax = cursclus.getClipmax();
      if (cmin != null && cmax != null) {
        outStream.append("## Clipping: ").append(FormatUtil.format(cmin)).append(" - ").append(FormatUtil.format(cmax)).append(LINE_SEPARATOR);
      }
      outStream.append("## Density correction factor: " + cursclus.getDensityCorrection()).append(LINE_SEPARATOR);
      outStream.append("## Generators:").append(LINE_SEPARATOR);
      for (int i = 0; i < cursclus.getDim(); i++) {
        Distribution gen = cursclus.getDistribution(i);
        outStream.append("## ").append(gen.toString()).append(LINE_SEPARATOR);
      }
      if (cursclus.getTransformation() != null && cursclus.getTransformation().getTransformation() != null) {
        outStream.append("## Affine transformation matrix:").append(LINE_SEPARATOR);
        outStream.append(FormatUtil.format(cursclus.getTransformation().getTransformation(), "## ")).append(LINE_SEPARATOR);
      }
      outStream.append("## Discards: " + cursclus.getDiscarded() + " Retries left: " + cursclus.getRetries()).append(LINE_SEPARATOR);
      double corf = /* cursclus.overweight */
      (double) (cursclus.getSize() + cursclus.getDiscarded()) / cursclus.getSize() / globdens;
      outStream.append("## Density correction factor estimation: " + corf).append(LINE_SEPARATOR);
    }
    outStream.append("########################################################").append(LINE_SEPARATOR);
    for (IntIterator iter = ids.iterator(); iter.hasNext();) {
      int num = iter.nextInt();
      // BUGFIX: the old check 'c > 0' printed a spurious leading space when
      // the model column was column 0; track the first *printed* column.
      boolean first = true;
      for (int c = 0; c < data.metaLength(); c++) {
        if (c == modelcol) {
          continue; // the model column is metadata, not payload
        }
        if (!first) {
          outStream.append(' ');
        }
        first = false;
        outStream.append(data.data(num, c).toString());
      }
      outStream.append(LINE_SEPARATOR);
    }
  }
}
Usage example of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in the elki project (elki-project):
class TrivialGeneratedOutlier, method run.
/**
 * Run the algorithm
 *
 * @param models Model relation
 * @param vecs Vector relation
 * @param labels Label relation
 * @return Outlier result
 */
public OutlierResult run(Relation<Model> models, Relation<NumberVector> vecs, Relation<?> labels) {
  WritableDoubleDataStore scores = DataStoreUtil.makeDoubleStorage(models.getDBIDs(), DataStoreFactory.HINT_HOT);
  // Collect the distinct generator clusters referenced by the model relation.
  HashSet<GeneratorSingleCluster> generators = new HashSet<>();
  for (DBIDIter it = models.iterDBIDs(); it.valid(); it.advance()) {
    Model m = models.get(it);
    if (m instanceof GeneratorSingleCluster) {
      generators.add((GeneratorSingleCluster) m);
    }
  }
  if (generators.isEmpty()) {
    LOG.warning("No generator models found for dataset - all points will be considered outliers.");
  }
  // Fail early unless every dimension of every generator is Gaussian.
  for (GeneratorSingleCluster cluster : generators) {
    for (int d = 0; d < cluster.getDim(); d++) {
      Distribution dist = cluster.getDistribution(d);
      if (!(dist instanceof NormalDistribution)) {
        throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + dist);
      }
    }
  }
  // Score each object: the minimum chi-squared CDF value over all clusters.
  for (DBIDIter it = models.iterDBIDs(); it.valid(); it.advance()) {
    double score = 1.;
    double[] v = vecs.get(it).toArray();
    for (GeneratorSingleCluster cluster : generators) {
      // Undo the cluster's affine transformation, if one is configured.
      double[] proj = cluster.getTransformation() != null ? cluster.getTransformation().applyInverse(v) : v;
      double chisq = 0.0;
      int df = 0;
      for (int d = 0; d < proj.length; d++) {
        Distribution dist = cluster.getDistribution(d);
        if (!(dist instanceof NormalDistribution)) {
          throw new AbortException("TrivialGeneratedOutlier currently only supports normal distributions, got: " + dist);
        }
        NormalDistribution nd = (NormalDistribution) dist;
        double z = (proj[d] - nd.getMean()) / nd.getStddev();
        chisq += z * z;
        df++;
      }
      // The sum of squared standardized deviations is chi-squared distributed.
      score = df > 0 ? Math.min(score, ChiSquaredDistribution.cdf(chisq, df)) : 0.;
    }
    if (expect < 1) {
      score = expect * score / (1 - score + expect);
    }
    scores.putDouble(it, score);
  }
  DoubleRelation scoreres = new MaterializedDoubleRelation("Model outlier scores", "model-outlier", scores, models.getDBIDs());
  OutlierScoreMeta meta = new ProbabilisticOutlierScore(0., 1.);
  return new OutlierResult(meta, scoreres);
}
Usage example of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in the elki project (elki-project):
class AttributeWiseBetaNormalization, method filter.
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
// Empty bundle: nothing to normalize.
if (objects.dataLength() == 0) {
return objects;
}
// Process every number-vector-field column of the bundle in place.
for (int r = 0; r < objects.metaLength(); r++) {
SimpleTypeInformation<?> type = (SimpleTypeInformation<?>) objects.meta(r);
final List<?> column = (List<?>) objects.getColumn(r);
if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
continue;
}
// Safe: the type check above established this is a number vector field.
@SuppressWarnings("unchecked") final List<V> castColumn = (List<V>) column;
// Get the replacement type information
@SuppressWarnings("unchecked") final VectorFieldTypeInformation<V> castType = (VectorFieldTypeInformation<V>) type;
// NOTE(review): 'factory' and 'dists' are instance fields, mutated per
// column — this filter is not safe for concurrent use on one instance.
factory = FilterUtil.guessFactory(castType);
// Scan to find the best
final int dim = castType.getDimensionality();
dists = new ArrayList<>(dim);
// Scratch space for testing:
double[] test = new double[castColumn.size()];
// We iterate over dimensions, this kind of filter needs fast random
// access.
Adapter adapter = new Adapter();
// Fit the best-matching distribution independently for each dimension.
for (int d = 0; d < dim; d++) {
adapter.dim = d;
Distribution dist = findBestFit(castColumn, adapter, d, test);
if (LOG.isVerbose()) {
LOG.verbose("Best fit for dimension " + d + ": " + dist.toString());
}
dists.add(dist);
}
// Beta distribution for projection
double p = FastMath.pow(alpha, -1 / FastMath.sqrt(dim));
BetaDistribution beta = new BetaDistribution(p, p);
// Normalization scan
double[] buf = new double[dim];
for (int i = 0; i < objects.dataLength(); i++) {
final V obj = castColumn.get(i);
for (int d = 0; d < dim; d++) {
// TODO: when available, use logspace for better numerical precision!
// Map each value through its fitted CDF, then through the Beta quantile.
buf[d] = beta.quantile(dists.get(d).cdf(obj.doubleValue(d)));
}
// Replace the vector in the column with its normalized version.
castColumn.set(i, factory.newNumberVector(buf));
}
}
return objects;
}
Usage example of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in the elki project (elki-project):
class AttributeWiseCDFNormalization, method findBestFit.
/**
 * Find the best fitting distribution.
 *
 * @param col Column of table
 * @param adapter Adapter for accessing the data
 * @param d Dimension
 * @param test Scratch space for testing goodness of fit
 * @return Best fit distribution, or {@code null} if every estimator failed
 *         or was rejected
 */
protected Distribution findBestFit(final List<V> col, Adapter adapter, int d, double[] test) {
  // Shortcut: a single estimator needs no goodness-of-fit comparison.
  if (estimators.size() == 1) {
    return estimators.get(0).estimate(col, adapter);
  }
  Distribution best = null;
  double bestq = Double.POSITIVE_INFINITY;
  trials: for (DistributionEstimator<?> est : estimators) {
    try {
      Distribution dist = est.estimate(col, adapter);
      // Evaluate the fitted CDF on all data; reject fits producing NaN/Inf.
      for (int i = 0; i < test.length; i++) {
        test[i] = dist.cdf(col.get(i).doubleValue(d));
        if (Double.isNaN(test[i])) {
          LOG.warning("Got NaN after fitting " + est.toString() + ": " + dist.toString());
          continue trials;
        }
        if (Double.isInfinite(test[i])) {
          LOG.warning("Got infinite value after fitting " + est.toString() + ": " + dist.toString());
          continue trials;
        }
      }
      Arrays.sort(test);
      // Kolmogorov-Smirnov style deviation of the CDF values from uniform;
      // smaller is a better fit.
      double q = KolmogorovSmirnovTest.simpleTest(test);
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose("Estimator " + est.toString() + " (" + dist.toString() + ") has maximum deviation " + q + " for dimension " + d);
      }
      if (best == null || q < bestq) {
        best = dist;
        bestq = q;
      }
    } catch (ArithmeticException e) {
      // Estimation failed numerically; skip this estimator and keep trying.
      if (LOG.isVeryVerbose()) {
        LOG.veryverbose("Fitting distribution " + est + " failed: " + e.getMessage());
      }
      continue trials;
    }
  }
  // BUGFIX: 'best' can still be null here (all estimators failed/rejected);
  // the old code dereferenced it unconditionally and NPE'd in verbose mode.
  if (LOG.isVerbose() && best != null) {
    LOG.verbose("Best fit for dimension " + d + ": " + best.toString());
  }
  return best;
}
Usage example of de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution in the elki project (elki-project):
class BestFitEstimatorTest, method testInf.
@Test(expected = ArithmeticException.class)
public void testInf() {
  // Estimating on an all-infinity input is expected to throw an
  // ArithmeticException; the assertion below only executes if it does not.
  final BestFitEstimator estimator = init();
  final Distribution fitted = estimator.estimate(new double[] { Double.POSITIVE_INFINITY }, DoubleArrayAdapter.STATIC);
  assertEquals("Wrong class of distribution", UniformDistribution.class, fitted.getClass());
}
Aggregations