Search in sources :

Example 1 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ExternalIDFilter method filter.

/**
 * Splits the first label-list column of the bundle into an external ID
 * column and a column holding the remaining labels.
 *
 * All other columns (including any further label-list columns) are copied
 * to the result unchanged. The label at position {@code externalIdIndex} of
 * each label list becomes the external ID; a negative index is interpreted
 * relative to the end of the list (-1 is the last label).
 *
 * @param objects Input bundle
 * @return New bundle with the external ID column split off
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    MultipleObjectsBundle bundle = new MultipleObjectsBundle();
    // Find a labellist column
    boolean done = false;
    boolean keeplabelcol = false;
    for (int i = 0; i < objects.metaLength(); i++) {
        SimpleTypeInformation<?> meta = objects.meta(i);
        // Skip non-labellist columns - or if we already had a labellist
        if (done || !LabelList.class.equals(meta.getRestrictionClass())) {
            bundle.appendColumn(meta, objects.getColumn(i));
            continue;
        }
        done = true;
        // We split the label column into two parts
        List<ExternalID> eidcol = new ArrayList<>(objects.dataLength());
        List<LabelList> lblcol = new ArrayList<>(objects.dataLength());
        // Reusable buffer for the labels that remain after removing the ID.
        ArrayList<String> lbuf = new ArrayList<>();
        for (Object obj : objects.getColumn(i)) {
            if (obj == null) {
                // Keep both columns aligned with the input rows.
                eidcol.add(null);
                lblcol.add(null);
                continue;
            }
            LabelList ll = (LabelList) obj;
            // A negative index counts from the end, so it must be ADDED to the
            // size; subtracting it would always point past the last label.
            int off = externalIdIndex >= 0 ? externalIdIndex : (ll.size() + externalIdIndex);
            eidcol.add(new ExternalID(ll.get(off)));
            lbuf.clear();
            for (int j = 0; j < ll.size(); j++) {
                if (j != off) {
                    lbuf.add(ll.get(j));
                }
            }
            lblcol.add(LabelList.make(lbuf));
            // NOTE(review): this tests the size of the original list, not of the
            // remaining labels, so a column of ID-only lists is still kept -
            // confirm whether that is intended.
            if (ll.size() > 0) {
                keeplabelcol = true;
            }
        }
        bundle.appendColumn(TypeUtil.EXTERNALID, eidcol);
        // Only add the label column when it's not empty.
        if (keeplabelcol) {
            bundle.appendColumn(meta, lblcol);
        }
    }
    return bundle;
}
Also used : ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) LabelList(de.lmu.ifi.dbs.elki.data.LabelList) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList)

Example 2 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ArffParser method parse.

/**
 * Reads an ARFF input stream into a bundle.
 *
 * The header and attribute statements are read first, then each data line
 * is loaded either as a dense or as a sparse instance (sparse lines start
 * with '{'). The first data line decides the mode; mixing both in one file
 * is rejected.
 *
 * @param instream Stream to parse
 * @return Bundle holding the parsed instances
 * @throws AbortException on I/O errors, or when dense and sparse lines are mixed
 */
@Override
public MultipleObjectsBundle parse(InputStream instream) {
    try (InputStreamReader reader = new InputStreamReader(instream);
        BufferedReader buffered = new BufferedReader(reader)) {
        ArrayList<String> names = new ArrayList<>();
        ArrayList<String> types = new ArrayList<>();
        readHeader(buffered);
        parseAttributeStatements(buffered, names, types);
        // Convert into column mapping. Prepare arrays to fill
        final int numAttributes = names.size();
        int[] targ = new int[numAttributes];
        TypeInformation[] elkitypes = new TypeInformation[numAttributes];
        int[] dimsize = new int[numAttributes];
        processColumnTypes(names, types, targ, elkitypes, dimsize);
        // Prepare bundle; the headers are only set up once the first data
        // line tells us whether the file is dense or sparse.
        MultipleObjectsBundle bundle = new MultipleObjectsBundle();
        StreamTokenizer tokens = makeArffTokenizer(buffered);
        final int modeUndecided = 0, modeDense = 1, modeSparse = 2;
        int mode = modeUndecided;
        for (nextToken(tokens); tokens.ttype != StreamTokenizer.TT_EOF; nextToken(tokens)) {
            if (tokens.ttype == StreamTokenizer.TT_EOL) {
                // ignore empty lines
                continue;
            }
            final boolean sparse = tokens.ttype == '{';
            final int lineMode = sparse ? modeSparse : modeDense;
            if (mode == modeUndecided) {
                setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, sparse);
                mode = lineMode;
            } else if (mode != lineMode) {
                throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
            }
            if (sparse) {
                bundle.appendSimple(loadSparseInstance(tokens, targ, dimsize, elkitypes, bundle.metaLength()));
            } else {
                bundle.appendSimple(loadDenseInstance(tokens, dimsize, elkitypes, bundle.metaLength()));
            }
        }
        return bundle;
    } catch (IOException e) {
        throw new AbortException("IO error in parser", e);
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) ArrayList(java.util.ArrayList) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) IOException(java.io.IOException) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) TypeInformation(de.lmu.ifi.dbs.elki.data.type.TypeInformation) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) BufferedReader(java.io.BufferedReader) StreamTokenizer(java.io.StreamTokenizer) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 3 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ReplaceNaNWithRandomFilterTest method parameters.

/**
 * Test with standard normal distribution as parameter.
 *
 * Loads the same file once with and once without the filter, records the
 * positions of all NaN values in the unfiltered data, and verifies that
 * none of these positions is NaN in the filtered data.
 */
@Test
public void parameters() {
    String filename = UNITTEST + "nan-test-1.csv";
    ReplaceNaNWithRandomFilter filter = new ELKIBuilder<>(ReplaceNaNWithRandomFilter.class) //
        .with(ReplaceNaNWithRandomFilter.Parameterizer.REPLACEMENT_DISTRIBUTION, new NormalDistribution(0, 1, new Random(0L))) //
        .build();
    MultipleObjectsBundle filteredBundle = readBundle(filename, filter);
    // Load the test data again without a filter.
    MultipleObjectsBundle unfilteredBundle = readBundle(filename);
    // Ensure the first column are the vectors.
    assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(filteredBundle.meta(0)));
    assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(unfilteredBundle.meta(0)));
    // This cast is now safe (vector field). Each dimensionality must come
    // from its own bundle, otherwise the comparison below is vacuous.
    int dimFiltered = ((FieldTypeInformation) filteredBundle.meta(0)).getDimensionality();
    int dimUnfiltered = ((FieldTypeInformation) unfilteredBundle.meta(0)).getDimensionality();
    assertEquals("Dimensionality expected equal", dimFiltered, dimUnfiltered);
    // Note the (row, column) indices of the NaN(s) in the data.
    List<IntegerVector> nanPositions = new ArrayList<>();
    for (int row = 0; row < unfilteredBundle.dataLength(); row++) {
        Object obj = unfilteredBundle.data(row, 0);
        assertEquals("Unexpected data type", DoubleVector.class, obj.getClass());
        DoubleVector d = (DoubleVector) obj;
        for (int col = 0; col < dimUnfiltered; col++) {
            final double v = d.doubleValue(col);
            if (Double.isNaN(v)) {
                nanPositions.add(new IntegerVector(new int[] { row, col }));
            }
        }
    }
    // Verify that at least a single NaN exists in the unfiltered bundle.
    assertTrue("NaN expected in unfiltered data", nanPositions.size() > 0);
    // Every former NaN position must hold a regular value after filtering.
    for (IntegerVector iv : nanPositions) {
        Object obj = filteredBundle.data(iv.intValue(0), 0);
        assertEquals("Unexpected data type", DoubleVector.class, obj.getClass());
        DoubleVector d = (DoubleVector) obj;
        final double v = d.doubleValue(iv.intValue(1));
        assertFalse("NaN not expected", Double.isNaN(v));
    }
}
Also used : ELKIBuilder(de.lmu.ifi.dbs.elki.utilities.ELKIBuilder) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList) FieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.FieldTypeInformation) IntegerVector(de.lmu.ifi.dbs.elki.data.IntegerVector) Random(java.util.Random) NormalDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.NormalDistribution) ReplaceNaNWithRandomFilter(de.lmu.ifi.dbs.elki.datasource.filter.cleaning.ReplaceNaNWithRandomFilter) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) Test(org.junit.Test) AbstractDataSourceTest(de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)

Example 4 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class AttributeWiseBetaNormalizationTest method parameters.

/**
 * Test with parameter p as alpha.
 *
 * After normalization, the fraction of values per column that lie below
 * the p-quantile of a Beta(p, p) distribution is expected to be p, within
 * a tolerance of 0.05.
 */
@Test
public void parameters() {
    final double p = .88;
    final String filename = UNITTEST + "normally-distributed-data-1.csv";
    AttributeWiseBetaNormalization<DoubleVector> filter = new ELKIBuilder<AttributeWiseBetaNormalization<DoubleVector>>(AttributeWiseBetaNormalization.class) //
        .with(AttributeWiseBetaNormalization.Parameterizer.ALPHA_ID, p) //
        .with(AttributeWiseBetaNormalization.Parameterizer.DISTRIBUTIONS_ID, Arrays.asList(NormalMOMEstimator.STATIC, UniformMinMaxEstimator.STATIC)) //
        .build();
    MultipleObjectsBundle bundle = readBundle(filename, filter);
    final int dim = getFieldDimensionality(bundle, 0, TypeUtil.NUMBER_VECTOR_FIELD);
    final double quantile = new BetaDistribution(p, p).quantile(p);
    // Count, per column, the finite values strictly below the quantile.
    final int[] below = new int[dim];
    final int rows = bundle.dataLength();
    for (int r = 0; r < rows; r++) {
        DoubleVector vec = get(bundle, r, 0, DoubleVector.class);
        for (int c = 0; c < dim; c++) {
            final double value = vec.doubleValue(c);
            // The range comparisons also reject NaN, as every comparison with NaN is false.
            final boolean finite = value > Double.NEGATIVE_INFINITY && value < Double.POSITIVE_INFINITY;
            if (finite && value < quantile) {
                below[c]++;
            }
        }
    }
    // The fraction is taken over all rows, including those skipped above.
    for (int c = 0; c < dim; c++) {
        final double fraction = below[c] / (double) bundle.dataLength();
        assertEquals("p% of the values should be under the quantile", p, fraction, .05);
    }
}
Also used : BetaDistribution(de.lmu.ifi.dbs.elki.math.statistics.distribution.BetaDistribution) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) AbstractDataSourceTest(de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest) Test(org.junit.Test)

Example 5 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class AttributeWiseMinMaxNormalizationTest method testNaNParameters.

/**
 * Test with default parameters and for correct handling of NaN and Inf.
 *
 * Non-finite values are ignored when collecting the per-column extrema;
 * the remaining values must span exactly the range 0 to 1 in every column.
 */
@Test
public void testNaNParameters() {
    final String filename = UNITTEST + "nan-test-1.csv";
    AttributeWiseMinMaxNormalization<DoubleVector> filter = //
        new ELKIBuilder<AttributeWiseMinMaxNormalization<DoubleVector>>(AttributeWiseMinMaxNormalization.class).build();
    MultipleObjectsBundle bundle = readBundle(filename, filter);
    // The first column must contain the vectors...
    assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(bundle.meta(0)));
    // ...which makes this cast to a field type safe.
    FieldTypeInformation fieldType = (FieldTypeInformation) bundle.meta(0);
    final int dim = fieldType.getDimensionality();
    // Collect minimum and maximum of the finite values in each column.
    DoubleMinMax[] extrema = DoubleMinMax.newArray(dim);
    for (int r = 0; r < bundle.dataLength(); r++) {
        DoubleVector vec = get(bundle, r, 0, DoubleVector.class);
        for (int c = 0; c < dim; c++) {
            final double value = vec.doubleValue(c);
            // Skip infinities and NaN (every comparison with NaN is false).
            if (!(value > Double.NEGATIVE_INFINITY && value < Double.POSITIVE_INFINITY)) {
                continue;
            }
            extrema[c].put(value);
        }
    }
    // Each column must be scaled to exactly [0, 1].
    for (DoubleMinMax columnRange : extrema) {
        assertEquals("Minimum not as expected", 0., columnRange.getMin(), 0.);
        assertEquals("Maximum not as expected", 1., columnRange.getMax(), 0.);
    }
}
Also used : DoubleMinMax(de.lmu.ifi.dbs.elki.math.DoubleMinMax) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) FieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.FieldTypeInformation) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) AbstractDataSourceTest(de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest) Test(org.junit.Test)

Aggregations

MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)72 AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)37 Test (org.junit.Test)37 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)27 ArrayList (java.util.ArrayList)19 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)13 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)10 ELKIBuilder (de.lmu.ifi.dbs.elki.utilities.ELKIBuilder)10 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)9 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)8 List (java.util.List)7 LabelList (de.lmu.ifi.dbs.elki.data.LabelList)5 SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)5 InputStreamDatabaseConnection (de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection)5 InputStream (java.io.InputStream)5 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)4 TypeInformation (de.lmu.ifi.dbs.elki.data.type.TypeInformation)4 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)4 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)4 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)4