Search in sources :

Example 51 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ShuffleObjectsFilterTest method defaultParameters.

/**
 * Test with default parameters.
 *
 * Loads the same sorted data file once through the shuffle filter and once
 * without any filter, checks that dimensionality and length are unchanged,
 * that the unfiltered data is sorted on the first vector component, and that
 * the filtered data contains at least one descent between neighboring rows.
 */
@Test
public void defaultParameters() {
    String filename = UNITTEST + "sorted-data-1.csv";
    ShuffleObjectsFilter filter = new ELKIBuilder<>(ShuffleObjectsFilter.class).build();
    MultipleObjectsBundle filteredBundle = readBundle(filename, filter);
    // Load the test data again without a filter.
    MultipleObjectsBundle unfilteredBundle = readBundle(filename);
    // Ensure the first column are the vectors.
    assertEquals("Dimensionality", getFieldDimensionality(unfilteredBundle, 0, TypeUtil.NUMBER_VECTOR_FIELD), getFieldDimensionality(filteredBundle, 0, TypeUtil.NUMBER_VECTOR_FIELD));
    assertEquals("Length changed", unfilteredBundle.dataLength(), filteredBundle.dataLength());
    // Verify that the elements of the unfiltered bundle are in sorted order.
    double prev = get(unfilteredBundle, 0, 0, DoubleVector.class).doubleValue(0);
    for (int row = 1; row < unfilteredBundle.dataLength(); row++) {
        final double next = get(unfilteredBundle, row, 0, DoubleVector.class).doubleValue(0);
        assertTrue("Values are expected to be in sorted order", prev <= next);
        prev = next;
    }
    // Verify that the elements of the filtered bundle are not in sorted order.
    // By verifying this, we can ascertain that the vectors have been shuffled.
    prev = get(filteredBundle, 0, 0, DoubleVector.class).doubleValue(0);
    boolean shuffled = false;
    for (int row = 1; row < filteredBundle.dataLength(); row++) {
        final double next = get(filteredBundle, row, 0, DoubleVector.class).doubleValue(0);
        if (prev > next) {
            shuffled = true;
            break;
        }
        // Advance the reference value so that neighboring rows are compared;
        // without this every row would only be compared against row 0, and a
        // shuffle that happens to keep the minimum first would go undetected.
        prev = next;
    }
    assertTrue("Elements are not shuffled.", shuffled);
}
Also used : MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) AbstractDataSourceTest(de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest) Test(org.junit.Test)

Example 52 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ShuffleObjectsFilter method filter.

/**
 * Build a new bundle that contains the rows of the input bundle in a random
 * order. The same random permutation is applied to every column, so rows
 * stay aligned across columns.
 *
 * @param objects bundle to shuffle
 * @return new bundle with the same column meta data and permuted rows
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    if (LOG.isDebugging()) {
        LOG.debug("Shuffling the data set");
    }
    final Random random = rnd.getSingleThreadedRandom();
    final int size = objects.dataLength();
    // Start from the identity permutation.
    final int[] perm = new int[size];
    for (int pos = 0; pos < size; pos++) {
        perm[pos] = pos;
    }
    // Fisher-Yates shuffle, walking from the last slot towards the front:
    // each slot receives a random element from the not-yet-fixed prefix.
    for (int last = size - 1; last > 0; last--) {
        final int pick = random.nextInt(last + 1);
        final int held = perm[pick];
        perm[pick] = perm[last];
        perm[last] = held;
    }
    final MultipleObjectsBundle result = new MultipleObjectsBundle();
    for (int col = 0; col < objects.metaLength(); col++) {
        // Copy this column in permuted row order.
        final List<?> source = objects.getColumn(col);
        final List<Object> permuted = new ArrayList<>(size);
        for (final int from : perm) {
            permuted.add(source.get(from));
        }
        result.appendColumn(objects.meta(col), permuted);
    }
    return result;
}
Also used : Random(java.util.Random) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList)

Example 53 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class AbstractSupervisedProjectionVectorFilter method filter.

/**
 * Apply a supervised projection to every number vector column of the bundle.
 *
 * The first class label column found is used as supervision. Columns that
 * are not number vector fields are passed through unchanged. For each vector
 * column a projection matrix is computed and every vector is replaced by its
 * projection; the replacement is written into the list obtained from the
 * input bundle (NOTE(review): if {@code getColumn} returns the backing list,
 * the input bundle is modified as well -- confirm). If the configured target
 * dimensionality exceeds a column's dimensionality, the field {@code tdim}
 * is lowered to that dimensionality.
 *
 * @param objects bundle to process
 * @return the bundle with projected vector columns, or the input bundle
 *         itself when it is empty, has no class label column, or no column
 *         could be projected
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    final int dataLength = objects.dataLength();
    if (dataLength == 0) {
        return objects;
    }
    // Step 1: find the first column that carries class labels.
    List<? extends ClassLabel> labels = null;
    for (int c = 0; c < objects.metaLength(); c++) {
        final SimpleTypeInformation<?> colType = objects.meta(c);
        final List<?> candidate = objects.getColumn(c);
        if (TypeUtil.CLASSLABEL.isAssignableFromType(colType)) {
            @SuppressWarnings("unchecked")
            final List<? extends ClassLabel> typed = (List<? extends ClassLabel>) candidate;
            labels = typed;
            break;
        }
    }
    if (labels == null) {
        getLogger().warning("No class label column found (try " + ClassLabelFilter.class.getSimpleName() + ") -- cannot run " + this.getClass().getSimpleName());
        return objects;
    }
    // Step 2: project each vector column, copy all other columns as they are.
    final MultipleObjectsBundle result = new MultipleObjectsBundle();
    boolean projectedAny = false;
    for (int c = 0; c < objects.metaLength(); c++) {
        final SimpleTypeInformation<?> colType = objects.meta(c);
        final List<?> raw = objects.getColumn(c);
        if (TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(colType)) {
            @SuppressWarnings("unchecked")
            final List<V> vectors = (List<V>) raw;
            final VectorFieldTypeInformation<?> vecType = (VectorFieldTypeInformation<?>) colType;
            @SuppressWarnings("unchecked")
            final NumberVector.Factory<V> factory = (NumberVector.Factory<V>) vecType.getFactory();
            final int dim = vecType.getDimensionality();
            // The target dimensionality cannot exceed the input dimensionality.
            if (tdim > dim) {
                if (getLogger().isVerbose()) {
                    getLogger().verbose("Setting projection dimension to original dimension: projection dimension: " + tdim + " larger than original dimension: " + dim);
                }
                tdim = dim;
            }
            try {
                final double[][] matrix = computeProjectionMatrix(vectors, labels, dim);
                for (int row = 0; row < dataLength; row++) {
                    final double[] original = vectors.get(row).toArray();
                    final double[] mapped = times(matrix, original);
                    vectors.set(row, factory.newNumberVector(mapped));
                }
                result.appendColumn(convertedType(colType, factory), raw);
                projectedAny = true;
            } catch (Exception e) {
                // Keep the column under its original type when projection fails.
                getLogger().error("Projection failed -- continuing with unprojected data!", e);
                result.appendColumn(colType, raw);
            }
        } else {
            result.appendColumn(colType, raw);
        }
    }
    if (projectedAny) {
        return result;
    }
    getLogger().warning("No vector field of fixed dimensionality found.");
    return objects;
}
Also used : MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) List(java.util.List) IntList(it.unimi.dsi.fastutil.ints.IntList) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList)

Example 54 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ConcatenateFilesDatabaseConnection method loadData.

/**
 * Load all configured files and concatenate their objects into one bundle.
 *
 * The resulting bundle has a leading string column that holds, for every
 * object, the path of the file it was read from; the columns delivered by
 * the parser follow, shifted by one position. Columns of later files must be
 * type-compatible with the columns already present. After all files have
 * been read, the parser is cleaned up and the bundle filters are invoked.
 *
 * @return the concatenated and filtered bundle
 * @throws AbortException if a file cannot be read, if the columns of the
 *         files are not compatible, or if a stream delivers an object before
 *         announcing its meta data
 */
@Override
public MultipleObjectsBundle loadData() {
    MultipleObjectsBundle objects = new MultipleObjectsBundle();
    // Column 0 records the source file of each object.
    objects.appendColumn(TypeUtil.STRING, new ArrayList<>());
    for (File file : files) {
        String filestr = file.getPath();
        try (InputStream inputStream = FileUtil.tryGzipInput(new BufferedInputStream(new FileInputStream(file)))) {
            final BundleStreamSource source;
            if (parser instanceof StreamingParser) {
                final StreamingParser streamParser = (StreamingParser) parser;
                streamParser.initStream(inputStream);
                source = streamParser;
            } else {
                MultipleObjectsBundle parsingResult = parser.parse(inputStream);
                // normalize objects and transform labels
                source = parsingResult.asStream();
            }
            // Stays null until the source has announced its meta data.
            BundleMeta meta = null;
            loop: for (Event e = source.nextEvent(); ; e = source.nextEvent()) {
                switch(e) {
                    case END_OF_STREAM:
                        break loop;
                    case META_CHANGED:
                        meta = source.getMeta();
                        for (int i = 0; i < meta.size(); i++) {
                            // Parser column i maps to bundle column i + 1.
                            if (i + 1 >= objects.metaLength()) {
                                objects.appendColumn(meta.get(i), new ArrayList<>());
                            } else {
                                // Ensure compatibility:
                                if (!objects.meta(i + 1).isAssignableFromType(meta.get(i))) {
                                    throw new AbortException("Incompatible files loaded. Cannot concatenate with unaligned columns, please preprocess manually.");
                                }
                            }
                        }
                        // switch
                        break;
                    case NEXT_OBJECT:
                        if (meta == null) {
                            // Fail with a descriptive error instead of a bare
                            // NullPointerException on invalid streams.
                            throw new AbortException("Invalid stream for file " + filestr + ": received an object before any meta data.");
                        }
                        Object[] o = new Object[objects.metaLength()];
                        o[0] = filestr;
                        for (int i = 0; i < meta.size(); i++) {
                            o[i + 1] = source.data(i);
                        }
                        objects.appendSimple(o);
                        // switch
                        break;
                }
            }
        } catch (IOException e) {
            throw new AbortException("Loading file " + filestr + " failed: " + e.toString(), e);
        }
    }
    parser.cleanup();
    // Invoke filters
    if (LOG.isDebugging()) {
        LOG.debugFine("Invoking filters.");
    }
    return invokeBundleFilters(objects);
}
Also used : StreamingParser(de.lmu.ifi.dbs.elki.datasource.parser.StreamingParser) BundleMeta(de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BufferedInputStream(java.io.BufferedInputStream) Event(de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource.Event) File(java.io.File) BundleStreamSource(de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 55 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class LabelJoinDatabaseConnection method loadData.

/**
 * Load all sources and join them on their label columns.
 *
 * The bundle of the first source serves as the join target: each of its
 * labels is mapped to its row number. For every further source, all columns
 * except the label column are appended to the first bundle (pre-filled with
 * {@code null}) and then filled for each row whose label is known. Rows
 * without a label, duplicate labels, unmatched labels and rows that still
 * contain {@code null} after the join are reported as warnings.
 *
 * @return the first bundle, extended by the joined columns
 * @throws AbortException if a source has no label column
 */
@Override
public MultipleObjectsBundle loadData() {
    final List<MultipleObjectsBundle> bundles = new ArrayList<>(sources.size());
    for (DatabaseConnection dbc : sources) {
        bundles.add(dbc.loadData());
    }
    final MultipleObjectsBundle first = bundles.get(0);
    final Object2IntOpenHashMap<String> labelmap = new Object2IntOpenHashMap<>(first.dataLength());
    // -1 marks "label not present".
    labelmap.defaultReturnValue(-1);

    // Index the labels of the first bundle.
    final int firstLabelCol = FilterUtil.findLabelColumn(first);
    if (firstLabelCol == -1) {
        throw new AbortException("No label column found in first source, cannot join (do you want to use " + ExternalIDJoinDatabaseConnection.class.getSimpleName() + " instead?)");
    }
    for (int r = 0; r < first.dataLength(); r++) {
        final Object label = first.data(r, firstLabelCol);
        if (label == null) {
            LOG.warning("Object without label encountered.");
            continue;
        }
        if (label instanceof LabelList) {
            // Every entry of a label list identifies the row.
            final LabelList list = (LabelList) label;
            for (int k = 0; k < list.size(); k++) {
                final String lbl = list.get(k);
                final int old = labelmap.put(lbl, r);
                if (old != -1) {
                    LOG.warning("Duplicate label encountered: " + lbl + " in rows " + old + " and " + r);
                }
            }
            continue;
        }
        // Strings and all other label objects are keyed by their string form.
        final String lbl = label.toString();
        final int old = labelmap.put(lbl, r);
        if (old != -1) {
            LOG.warning("Duplicate label encountered: " + lbl + " in rows " + old + " and " + r);
        }
    }

    // Join the remaining sources onto the first bundle.
    for (int s = 1; s < sources.size(); s++) {
        final MultipleObjectsBundle cur = bundles.get(s);
        final int lblcol = FilterUtil.findLabelColumn(cur);
        if (lblcol == -1) {
            throw new AbortException("No label column found in source " + (s + 1) + ", cannot join (do you want to use " + ExternalIDJoinDatabaseConnection.class.getSimpleName() + " instead?)");
        }
        // One destination column per source column; null for the label column.
        final List<ArrayList<Object>> targets = new ArrayList<>(cur.metaLength());
        for (int c = 0; c < cur.metaLength(); c++) {
            if (c == lblcol) {
                targets.add(null);
                continue;
            }
            final ArrayList<Object> fresh = new ArrayList<>(first.dataLength());
            // Pre-fill with nulls so rows can be set by index later.
            for (int n = 0; n < first.dataLength(); n++) {
                fresh.add(null);
            }
            first.appendColumn(cur.meta(c), fresh);
            targets.add(fresh);
        }
        for (int r = 0; r < cur.dataLength(); r++) {
            final Object label = cur.data(r, lblcol);
            if (label == null) {
                LOG.warning("Object without label encountered.");
                continue;
            }
            // Resolve the row in the first bundle that carries this label.
            int target = -1;
            if (label instanceof LabelList) {
                final LabelList list = (LabelList) label;
                // Use the first entry of the list that is known.
                for (int k = 0; target < 0 && k < list.size(); k++) {
                    target = labelmap.getInt(list.get(k));
                }
            } else {
                target = labelmap.getInt(label.toString());
            }
            if (target < 0) {
                LOG.warning("Label not found for join: " + label + " in row " + r);
                continue;
            }
            for (int c = 0; c < cur.metaLength(); c++) {
                if (c != lblcol) {
                    final List<Object> col = targets.get(c);
                    assert (col != null);
                    col.set(target, cur.data(r, c));
                }
            }
        }
    }

    // Report every row that still has a missing value (once per row).
    for (int r = 0; r < first.dataLength(); r++) {
        for (int c = 0; c < first.metaLength(); c++) {
            if (first.data(r, c) != null) {
                continue;
            }
            final StringBuilder buf = new StringBuilder();
            for (int k = 0; k < first.metaLength(); k++) {
                if (buf.length() > 0) {
                    buf.append(", ");
                }
                final Object value = first.data(r, k);
                buf.append(value == null ? "null" : value);
            }
            LOG.warning("null value in joined data, row " + r + " column " + c + FormatUtil.NEWLINE + "[" + buf.toString() + "]");
            break;
        }
    }
    return first;
}
Also used : LabelList(de.lmu.ifi.dbs.elki.data.LabelList) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList) Object2IntOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Aggregations

MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)72 AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)37 Test (org.junit.Test)37 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)27 ArrayList (java.util.ArrayList)19 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)13 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)10 ELKIBuilder (de.lmu.ifi.dbs.elki.utilities.ELKIBuilder)10 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)9 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)8 List (java.util.List)7 LabelList (de.lmu.ifi.dbs.elki.data.LabelList)5 SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)5 InputStreamDatabaseConnection (de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection)5 InputStream (java.io.InputStream)5 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)4 TypeInformation (de.lmu.ifi.dbs.elki.data.type.TypeInformation)4 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)4 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)4 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)4