Search in sources :

Example 21 with VectorFieldTypeInformation

use of de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation in project elki by elki-project.

From the class AttributeWiseMADNormalization, method filter.

/**
 * Normalizes every numeric vector field column of the bundle in place.
 * <p>
 * For each matching column, a per-dimension median is stored in
 * {@code median} and a per-dimension scaling factor derived from the median
 * absolute deviation is stored in {@code imadsigma}. Afterwards each vector of
 * the column is replaced by a vector produced by {@code factory} from the
 * values returned by {@code normalize(d, value)}. Columns whose type is not a
 * number vector field are left untouched.
 * <p>
 * Note: {@code factory}, {@code median} and {@code imadsigma} are overwritten
 * for every matching column.
 *
 * @param objects bundle to process
 * @return the same bundle instance, with matching columns rewritten
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    // An empty bundle has nothing to analyze or rewrite.
    if (objects.dataLength() == 0) {
        return objects;
    }
    for (int col = 0; col < objects.metaLength(); col++) {
        SimpleTypeInformation<?> colType = (SimpleTypeInformation<?>) objects.meta(col);
        final List<?> rawColumn = (List<?>) objects.getColumn(col);
        if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(colType)) {
            // Not a fixed-dimensionality number vector column; skip it.
            continue;
        }
        @SuppressWarnings("unchecked") final List<V> vectors = (List<V>) rawColumn;
        @SuppressWarnings("unchecked") final VectorFieldTypeInformation<V> fieldType = (VectorFieldTypeInformation<V>) colType;
        factory = FilterUtil.guessFactory(fieldType);

        // Fresh statistics for this column.
        final int dim = fieldType.getDimensionality();
        median = new double[dim];
        imadsigma = new double[dim];

        // Reusable buffer: one attribute's values, then its absolute deviations.
        double[] sample = new double[vectors.size()];
        final int n = sample.length;

        FiniteProgress analysis = null;
        if (LOG.isVerbose()) {
            analysis = new FiniteProgress("Analyzing data", dim, LOG);
        }
        // Analysis scan: one dimension at a time, indexing into the column.
        for (int d = 0; d < dim; d++) {
            for (int i = 0; i < n; i++) {
                sample[i] = vectors.get(i).doubleValue(d);
            }
            final double center = QuickSelect.median(sample);
            median[d] = center;

            // Turn the buffer into absolute deviations, counting exact zeros.
            int zeros = 0;
            for (int i = 0; i < n; i++) {
                final double deviation = Math.abs(sample[i] - center);
                sample[i] = deviation;
                if (deviation == 0.) {
                    zeros++;
                }
            }

            final int half = n >>> 1;
            if (zeros == n) {
                // Every value equals the median; any scale works.
                LOG.warning("Constant attribute detected. Using MAD=1.");
                imadsigma[d] = 1.;
            } else if (zeros < half) {
                // Fewer than half of the deviations are zero: regular MAD,
                // rescaled to estimate the standard deviation.
                imadsigma[d] = NormalDistribution.PHIINV075 / QuickSelect.median(sample);
            } else {
                // At least half of the deviations are zero, so the plain MAD
                // estimate does not work. Use the deviation at the middle of
                // the non-zero part and the matching normal quantile instead.
                final int rank = zeros + ((n - zeros) >> 1);
                final double rel = .5 + rank * .5 / n;
                imadsigma[d] = NormalDistribution.quantile(0., 1., rel)
                    / QuickSelect.quickSelect(sample, rank);
                LOG.warning("Near-constant attribute detected. Using modified MAD.");
            }
            LOG.incrementProcessed(analysis);
        }
        LOG.ensureCompleted(analysis);

        FiniteProgress rewriting = null;
        if (LOG.isVerbose()) {
            rewriting = new FiniteProgress("Data normalization", objects.dataLength(), LOG);
        }
        // Normalization scan: replace each vector by its normalized version.
        double[] values = new double[dim];
        for (int i = 0; i < objects.dataLength(); i++) {
            final V vector = vectors.get(i);
            for (int d = 0; d < dim; d++) {
                values[d] = normalize(d, vector.doubleValue(d));
            }
            vectors.set(i, factory.newNumberVector(values));
            LOG.incrementProcessed(rewriting);
        }
        LOG.ensureCompleted(rewriting);
    }
    return objects;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) List(java.util.List)

Example 22 with VectorFieldTypeInformation

use of de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation in project elki by elki-project.

From the class VectorDimensionalityFilter, method updateMeta.

/**
 * Rebuild {@code meta} from the metadata of {@code source}.
 * <p>
 * The first entry assignable to a variable-length number vector type becomes
 * the filtered column: its index is stored in {@code column}, and its type is
 * replaced by a fixed-dimensionality field type when {@code dim} is known. If
 * {@code dim} is still {@code -1}, it is taken from the minimum dimensionality
 * of a vector field type where available. All other entries are copied over
 * unchanged.
 *
 * @throws AbortException if the requested dimensionality lies outside the
 *         dimensionality range reported by a vector field type
 */
private void updateMeta() {
    meta = new BundleMeta();
    BundleMeta sourceMeta = source.getMeta();
    for (int idx = 0; idx < sourceMeta.size(); idx++) {
        SimpleTypeInformation<?> current = sourceMeta.get(idx);
        // Copy through once a column has been chosen, or if the type does not match.
        if (column >= 0 || !TypeUtil.NUMBER_VECTOR_VARIABLE_LENGTH.isAssignableFromType(current)) {
            meta.add(current);
            continue;
        }
        if (current instanceof VectorFieldTypeInformation) {
            @SuppressWarnings("unchecked") final VectorFieldTypeInformation<V> fieldType = (VectorFieldTypeInformation<V>) current;
            if (dim != -1) {
                // A dimensionality was requested: it must lie within the reported range.
                if (fieldType.mindim() > dim) {
                    throw new AbortException("Would filter all vectors: minimum dimensionality " + fieldType.mindim() + " > desired dimensionality " + dim);
                }
                if (fieldType.maxdim() < dim) {
                    throw new AbortException("Would filter all vectors: maximum dimensionality " + fieldType.maxdim() + " < desired dimensionality " + dim);
                }
            } else {
                // Nothing requested yet: adopt the minimum dimensionality.
                dim = fieldType.mindim();
            }
            if (fieldType.mindim() == fieldType.maxdim()) {
                // Already a fixed dimensionality; reuse the type as-is.
                meta.add(fieldType);
                column = idx;
                continue;
            }
        }
        @SuppressWarnings("unchecked") final VectorTypeInformation<V> vectorType = (VectorTypeInformation<V>) current;
        if (dim == -1) {
            LOG.warning("No dimensionality yet for column " + idx);
            meta.add(vectorType);
        } else {
            // Advertise exactly the selected dimensionality.
            meta.add(new VectorFieldTypeInformation<>(
                FilterUtil.guessFactory(vectorType), dim, dim, vectorType.getSerializer()));
        }
        column = idx;
    }
}
Also used : BundleMeta(de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) VectorTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorTypeInformation) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Aggregations

VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)22 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)9 ArrayList (java.util.ArrayList)9 List (java.util.List)8 MaterializedRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation)7 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)7 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)6 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)6 SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)5 ProxyDatabase (de.lmu.ifi.dbs.elki.database.ProxyDatabase)4 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)4 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)3 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)3 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)2 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)2 Distribution (de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution)2 Random (java.util.Random)2 ExternalID (de.lmu.ifi.dbs.elki.data.ExternalID)1 IntegerVector (de.lmu.ifi.dbs.elki.data.IntegerVector)1 SimpleClassLabel (de.lmu.ifi.dbs.elki.data.SimpleClassLabel)1