Search in sources :

Example 11 with SimpleTypeInformation

use of de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation in project elki by elki-project.

the class AbstractConversionFilter method filter.

/**
 * Default filtering procedure for conversion filters.
 *
 * Every column of the input bundle is inspected in turn. A column whose type
 * is not accepted by the input type restriction is copied to the output
 * unchanged. For an accepted column, an optional read-only preparation pass is
 * run first (only if {@link #prepareStart} returns true); afterwards each
 * entry is replaced, in place, by the result of {@link #filterSingleObject}.
 *
 * @param objects Objects to filter
 * @return Filtered bundle (the input itself when it contains no data)
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    final int size = objects.dataLength();
    if (size == 0) {
        // Nothing to convert.
        return objects;
    }
    final MultipleObjectsBundle result = new MultipleObjectsBundle();
    final Logging log = getLogger();
    for (int col = 0; col < objects.metaLength(); col++) {
        @SuppressWarnings("unchecked") final SimpleTypeInformation<Object> colType = (SimpleTypeInformation<Object>) objects.meta(col);
        @SuppressWarnings("unchecked") final List<Object> data = (List<Object>) objects.getColumn(col);
        if (getInputTypeRestriction().isAssignableFromType(colType)) {
            // The column matches: view its type as the filter's input type.
            @SuppressWarnings("unchecked") final SimpleTypeInformation<I> inputType = (SimpleTypeInformation<I>) colType;
            // Optional first pass over the data, requested by the subclass.
            if (prepareStart(inputType)) {
                FiniteProgress prepProgress = null;
                if (log.isVerbose()) {
                    prepProgress = new FiniteProgress("Preparing normalization", size, log);
                }
                for (Object entry : data) {
                    @SuppressWarnings("unchecked") final I input = (I) entry;
                    prepareProcessInstance(input);
                    log.incrementProcessed(prepProgress);
                }
                log.ensureCompleted(prepProgress);
                prepareComplete();
            }
            // The same list is reused for the output; entries are overwritten below.
            @SuppressWarnings("unchecked") final List<O> converted = (List<O>) data;
            result.appendColumn(convertedType(inputType), converted);
            // Main pass: convert every entry.
            FiniteProgress mainProgress = null;
            if (log.isVerbose()) {
                mainProgress = new FiniteProgress("Data normalization", size, log);
            }
            for (int row = 0; row < size; row++) {
                @SuppressWarnings("unchecked") final I input = (I) data.get(row);
                converted.set(row, filterSingleObject(input));
                log.incrementProcessed(mainProgress);
            }
            log.ensureCompleted(mainProgress);
        } else {
            // Not a column this filter handles: pass it through untouched.
            result.appendColumn(colType, data);
        }
    }
    return result;
}
Also used : Logging(de.lmu.ifi.dbs.elki.logging.Logging) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) List(java.util.List)

Example 12 with SimpleTypeInformation

use of de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation in project elki by elki-project.

the class AbstractStreamConversionFilter method nextEvent.

/**
 * Forwards the next event of the wrapped source. On a meta data change, the
 * local meta data is extended by the columns that are new in the source; the
 * first new column accepted by the input type restriction (while no column
 * has been selected yet) is registered with its converted type and remembered
 * as the column to process.
 *
 * @return the event reported by the source
 */
@Override
public Event nextEvent() {
    final Event event = source.nextEvent();
    if (event != Event.META_CHANGED) {
        return event;
    }
    if (meta == null) {
        meta = new BundleMeta();
    }
    final BundleMeta upstream = source.getMeta();
    // Only columns not yet mirrored in the local meta data are handled.
    for (int idx = meta.size(); idx < upstream.size(); idx++) {
        // Look for a convertible column only while none has been chosen.
        if (column < 0) {
            @SuppressWarnings("unchecked") final SimpleTypeInformation<Object> candidate = (SimpleTypeInformation<Object>) upstream.get(idx);
            if (getInputTypeRestriction().isAssignableFromType(candidate)) {
                @SuppressWarnings("unchecked") final SimpleTypeInformation<I> inputType = (SimpleTypeInformation<I>) candidate;
                meta.add(convertedType(inputType));
                column = idx;
                continue;
            }
        }
        // Any other column keeps its original type information.
        meta.add(upstream.get(idx));
    }
    return event;
}
Also used : BundleMeta(de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)

Example 13 with SimpleTypeInformation

use of de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation in project elki by elki-project.

the class SplitNumberVectorFilter method filter.

/**
 * Splits every column accepted by the input type restriction into two vector
 * columns: the first holds the dimensions listed in {@code dims}, the second
 * holds all remaining dimensions in ascending order. Columns that are not
 * accepted are copied to the output unchanged.
 *
 * @param objects Objects to filter
 * @return Bundle with split columns (the input itself when it contains no data)
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    if (objects.dataLength() == 0) {
        return objects;
    }
    final MultipleObjectsBundle result = new MultipleObjectsBundle();
    for (int col = 0; col < objects.metaLength(); col++) {
        @SuppressWarnings("unchecked") final SimpleTypeInformation<Object> colType = (SimpleTypeInformation<Object>) objects.meta(col);
        @SuppressWarnings("unchecked") final List<Object> data = (List<Object>) objects.getColumn(col);
        if (!getInputTypeRestriction().isAssignableFromType(colType)) {
            // Not handled by this filter: pass through.
            result.appendColumn(colType, data);
            continue;
        }
        // The restriction check above is relied upon to make this a vector field type.
        @SuppressWarnings("unchecked") final VectorFieldTypeInformation<V> vectorType = VectorFieldTypeInformation.class.cast(colType);
        final NumberVector.Factory<V> factory = FilterUtil.guessFactory(vectorType);
        // Type information and storage for the two output columns.
        final VectorFieldTypeInformation<V> selectedType = new VectorFieldTypeInformation<>(factory, dims.length);
        final VectorFieldTypeInformation<V> remainingType = new VectorFieldTypeInformation<>(factory, vectorType.getDimensionality() - dims.length);
        final List<V> selectedColumn = new ArrayList<>(data.size());
        final List<V> remainingColumn = new ArrayList<>(data.size());
        result.appendColumn(selectedType, selectedColumn);
        result.appendColumn(remainingType, remainingColumn);
        // Collect the dimensions that are not listed in dims.
        final int[] rest = new int[vectorType.getDimensionality() - dims.length];
        int filled = 0;
        nextDimension:
        for (int d = 0; d < vectorType.getDimensionality(); d++) {
            for (int k = 0; k < dims.length; k++) {
                if (dims[k] == d) {
                    continue nextDimension;
                }
            }
            // More unlisted dimensions than expected.
            if (filled >= rest.length) {
                throw new AbortException("Dimensionalities not proper!");
            }
            rest[filled++] = d;
        }
        // Split every vector into its two parts.
        for (int row = 0; row < objects.dataLength(); row++) {
            @SuppressWarnings("unchecked") final V vec = (V) data.get(row);
            final double[] selected = new double[dims.length];
            final double[] remaining = new double[vec.getDimensionality() - dims.length];
            for (int k = 0; k < dims.length; k++) {
                selected[k] = vec.doubleValue(dims[k]);
            }
            for (int k = 0; k < rest.length; k++) {
                remaining[k] = vec.doubleValue(rest[k]);
            }
            selectedColumn.add(factory.newNumberVector(selected));
            remainingColumn.add(factory.newNumberVector(remaining));
        }
    }
    return result;
}
Also used : MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) List(java.util.List) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 14 with SimpleTypeInformation

use of de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation in project elki by elki-project.

the class AttributeWiseCDFNormalization method filter.

/**
 * Replaces, in place, every number vector column of the bundle by its
 * attribute-wise CDF values: for each dimension a distribution is chosen via
 * {@code findBestFit}, and each value is then mapped through the
 * {@code cdf} of that dimension's distribution. Other columns are left
 * untouched.
 *
 * @param objects Objects to filter
 * @return the input bundle, with matching columns overwritten
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    if (objects.dataLength() == 0) {
        return objects;
    }
    for (int col = 0; col < objects.metaLength(); col++) {
        final SimpleTypeInformation<?> colType = (SimpleTypeInformation<?>) objects.meta(col);
        final List<?> data = (List<?>) objects.getColumn(col);
        if (TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(colType)) {
            @SuppressWarnings("unchecked") final List<V> vectors = (List<V>) data;
            @SuppressWarnings("unchecked") final VectorFieldTypeInformation<V> vectorType = (VectorFieldTypeInformation<V>) colType;
            factory = FilterUtil.guessFactory(vectorType);
            final int dimensionality = vectorType.getDimensionality();
            dists = new ArrayList<>(dimensionality);
            // Scratch buffer, only allocated when several estimators are configured.
            double[] scratch = null;
            if (estimators.size() > 1) {
                scratch = new double[vectors.size()];
            }
            // The data is traversed dimension by dimension, which relies on
            // fast random access to the column.
            final Adapter adapter = new Adapter();
            for (int d = 0; d < dimensionality; d++) {
                adapter.dim = d;
                Distribution best = findBestFit(vectors, adapter, d, scratch);
                // Constant-zero attributes should stay 0 rather than - usually -
                // turning into a constant .5.
                if (best instanceof UniformDistribution && constantZero(vectors, adapter)) {
                    best = new UniformDistribution(0., 1.);
                }
                dists.add(best);
            }
            // Map every vector through the per-dimension CDFs.
            final double[] mapped = new double[dimensionality];
            for (int row = 0; row < objects.dataLength(); row++) {
                final V vec = vectors.get(row);
                for (int d = 0; d < dimensionality; d++) {
                    mapped[d] = dists.get(d).cdf(vec.doubleValue(d));
                }
                vectors.set(row, factory.newNumberVector(mapped));
            }
        }
    }
    return objects;
}
Also used : UniformDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.UniformDistribution) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) NumberArrayAdapter(de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.NumberArrayAdapter) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) Distribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution) ArrayList(java.util.ArrayList) List(java.util.List)

Example 15 with SimpleTypeInformation

use of de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation in project elki by elki-project.

the class AttributeWiseMADNormalization method filter.

/**
 * Normalizes, in place, every number vector column of the bundle using the
 * per-attribute median and a scale factor derived from the median absolute
 * deviation (MAD). Constant and near-constant attributes are handled by
 * dedicated fallbacks, each reported with a warning. Other columns are left
 * untouched.
 *
 * @param objects Objects to filter
 * @return the input bundle, with matching columns overwritten
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    if (objects.dataLength() == 0) {
        return objects;
    }
    for (int col = 0; col < objects.metaLength(); col++) {
        final SimpleTypeInformation<?> colType = (SimpleTypeInformation<?>) objects.meta(col);
        final List<?> data = (List<?>) objects.getColumn(col);
        if (TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(colType)) {
            @SuppressWarnings("unchecked") final List<V> vectors = (List<V>) data;
            @SuppressWarnings("unchecked") final VectorFieldTypeInformation<V> vectorType = (VectorFieldTypeInformation<V>) colType;
            factory = FilterUtil.guessFactory(vectorType);
            final int dimensionality = vectorType.getDimensionality();
            median = new double[dimensionality];
            imadsigma = new double[dimensionality];
            // Scratch buffer holding one attribute of all objects at a time.
            final double[] scratch = new double[vectors.size()];
            FiniteProgress analysis = null;
            if (LOG.isVerbose()) {
                analysis = new FiniteProgress("Analyzing data", dimensionality, LOG);
            }
            // The data is traversed dimension by dimension, which relies on
            // fast random access to the column.
            for (int d = 0; d < dimensionality; d++) {
                for (int i = 0; i < scratch.length; i++) {
                    scratch[i] = vectors.get(i).doubleValue(d);
                }
                final double center = QuickSelect.median(scratch);
                median[d] = center;
                // Replace the values by their absolute deviations; count exact zeros.
                int numZero = 0;
                for (int i = 0; i < scratch.length; i++) {
                    final double deviation = Math.abs(scratch[i] - center);
                    scratch[i] = deviation;
                    if (deviation == 0.) {
                        numZero++;
                    }
                }
                final int half = scratch.length >>> 1;
                if (numZero < half) {
                    // Rescale the true MAD for the best standard deviation estimate.
                    imadsigma[d] = NormalDistribution.PHIINV075 / QuickSelect.median(scratch);
                } else if (numZero == scratch.length) {
                    LOG.warning("Constant attribute detected. Using MAD=1.");
                    // The value does not matter for a constant distribution.
                    imadsigma[d] = 1.;
                } else {
                    // Too many zeros for the regular MAD estimate to work.
                    // Generalize the approach: use the middle one of the
                    // non-zero deviations and the matching normal quantile.
                    final int rank = numZero + ((scratch.length - numZero) >> 1);
                    final double rel = .5 + rank * .5 / scratch.length;
                    imadsigma[d] = NormalDistribution.quantile(0., 1., rel) / QuickSelect.quickSelect(scratch, rank);
                    LOG.warning("Near-constant attribute detected. Using modified MAD.");
                }
                LOG.incrementProcessed(analysis);
            }
            LOG.ensureCompleted(analysis);
            FiniteProgress normalization = null;
            if (LOG.isVerbose()) {
                normalization = new FiniteProgress("Data normalization", objects.dataLength(), LOG);
            }
            // Second pass: overwrite every vector with its normalized version.
            final double[] scaled = new double[dimensionality];
            for (int row = 0; row < objects.dataLength(); row++) {
                final V vec = vectors.get(row);
                for (int d = 0; d < dimensionality; d++) {
                    scaled[d] = normalize(d, vec.doubleValue(d));
                }
                vectors.set(row, factory.newNumberVector(scaled));
                LOG.incrementProcessed(normalization);
            }
            LOG.ensureCompleted(normalization);
        }
    }
    return objects;
}
Also used : FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) List(java.util.List)

Aggregations

SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)15 List (java.util.List)6 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)5 MaterializedRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedRelation)4 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)4 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)4 ArrayList (java.util.ArrayList)4 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)3 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)3 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)3 ParameterizationFunction (de.lmu.ifi.dbs.elki.algorithm.clustering.correlation.cash.ParameterizationFunction)2 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)2 ModifiableDBIDs (de.lmu.ifi.dbs.elki.database.ids.ModifiableDBIDs)2 Relation (de.lmu.ifi.dbs.elki.database.relation.Relation)2 BundleMeta (de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta)2 Index (de.lmu.ifi.dbs.elki.index.Index)2 IndexFactory (de.lmu.ifi.dbs.elki.index.IndexFactory)2 Distribution (de.lmu.ifi.dbs.elki.math.statistics.distribution.Distribution)2 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)1 ExternalID (de.lmu.ifi.dbs.elki.data.ExternalID)1