Search in sources :

Example 51 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ShuffleObjectsFilterTest method defaultParameters.

 * Test with default parameters.
public void defaultParameters() {
    // Loads the same CSV file twice -- once through a default-built ShuffleObjectsFilter and once
    // without any filter -- and compares the two resulting bundles.
    // NOTE(review): no closing braces for the method, the loops or the `if` appear in this listing;
    // they seem to have been dropped during extraction -- confirm against the upstream source.
    String filename = UNITTEST + "sorted-data-1.csv";
    ShuffleObjectsFilter filter = new ELKIBuilder<>(ShuffleObjectsFilter.class).build();
    MultipleObjectsBundle filteredBundle = readBundle(filename, filter);
    // Load the test data again without a filter.
    MultipleObjectsBundle unfilteredBundle = readBundle(filename);
    // Column 0 is checked as a number vector field in both bundles; the dimensionality must match.
    assertEquals("Dimensionality", getFieldDimensionality(unfilteredBundle, 0, TypeUtil.NUMBER_VECTOR_FIELD), getFieldDimensionality(filteredBundle, 0, TypeUtil.NUMBER_VECTOR_FIELD));
    // The filter must neither add nor drop rows.
    assertEquals("Length changed", unfilteredBundle.dataLength(), filteredBundle.dataLength());
    // Verify that the elements of the unfiltered bundle are in sorted order.
    // Only the first component (index 0) of each vector in column 0 is compared.
    double prev = get(unfilteredBundle, 0, 0, DoubleVector.class).doubleValue(0);
    for (int row = 1; row < unfilteredBundle.dataLength(); row++) {
        final double next = get(unfilteredBundle, row, 0, DoubleVector.class).doubleValue(0);
        // Non-strict comparison: equal neighbours still count as sorted.
        assertTrue("Values are expected to be in sorted order", prev <= next);
        prev = next;
    // Verify that the elements of the filtered bundle are not in sorted order.
    // By verifying this, we can ascertain that the vectors have been shuffled.
    prev = get(filteredBundle, 0, 0, DoubleVector.class).doubleValue(0);
    boolean shuffled = false;
    for (int row = 1; row < filteredBundle.dataLength(); row++) {
        final double next = get(filteredBundle, row, 0, DoubleVector.class).doubleValue(0);
        // A single descending neighbour pair is enough to flag the data as shuffled.
        // NOTE(review): no `prev = next` update (or early exit) is visible in this loop -- presumably
        // lost in extraction; verify against the upstream test.
        if (prev > next) {
            shuffled = true;
    assertTrue("Elements are not shuffled.", shuffled);
Also used : MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) AbstractDataSourceTest(de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest) Test(org.junit.Test)

Example 52 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ShuffleObjectsFilter method filter.

public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    // Builds a new bundle whose columns carry the same meta information as the input, with rows
    // rearranged according to one random permutation that is shared by all columns.
    // NOTE(review): closing braces are missing throughout this listing (apparently lost in
    // extraction) -- confirm the exact structure against the upstream source.
    if (LOG.isDebugging()) {
        LOG.debug("Shuffling the data set");
    final Random random = rnd.getSingleThreadedRandom();
    final int size = objects.dataLength();
    // Start from the identity permutation 0..size-1.
    final int[] offsets = new int[size];
    for (int i = 0; i < size; i++) {
        offsets[i] = i;
    // Randomize the offset array
    // Fisher-Yates shuffle: walk from the end, swapping slot i - 1 with a random slot in [0, i).
    for (int i = size; i > 1; i--) {
        final int j = random.nextInt(i);
        // Swap the elements at positions j and i - 1:
        final int temp = offsets[j];
        offsets[j] = offsets[i - 1];
        offsets[i - 1] = temp;
    MultipleObjectsBundle bundle = new MultipleObjectsBundle();
    for (int j = 0; j < objects.metaLength(); j++) {
        // Reorder column accordingly
        List<?> in = objects.getColumn(j);
        List<Object> data = new ArrayList<>(size);
        // NOTE(review): the body of this loop is not visible here; presumably it appends
        // in.get(offsets[i]) to `data` -- verify against the upstream source.
        for (int i = 0; i < size; i++) {
        bundle.appendColumn(objects.meta(j), data);
    return bundle;
Also used : Random(java.util.Random) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList)

Example 53 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class AbstractSupervisedProjectionVectorFilter method filter.

public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    // Projects the number-vector columns of the bundle using a matrix obtained from
    // computeProjectionMatrix(...), which is given the vectors and a class label column.
    // Returns the input bundle itself when it is empty, when no class label column is found,
    // or when no column could be projected; otherwise returns a newly assembled bundle.
    // NOTE(review): closing braces (and possibly `break`/`continue` statements) are missing from
    // this listing, apparently lost in extraction -- confirm against the upstream source.
    final int dataLength = objects.dataLength();
    if (dataLength == 0) {
        return objects;
    List<? extends ClassLabel> classcolumn = null;
    // First of all, identify a class label column.
    for (int r = 0; r < objects.metaLength(); r++) {
        SimpleTypeInformation<?> type = objects.meta(r);
        List<?> column = objects.getColumn(r);
        if (TypeUtil.CLASSLABEL.isAssignableFromType(type)) {
            // Cast is guarded by the type check on the column's meta information just above.
            @SuppressWarnings("unchecked") final List<? extends ClassLabel> castcolumn = (List<? extends ClassLabel>) column;
            classcolumn = castcolumn;
    if (classcolumn == null) {
        getLogger().warning("No class label column found (try " + ClassLabelFilter.class.getSimpleName() + ") -- cannot run " + this.getClass().getSimpleName());
        return objects;
    boolean somesuccess = false;
    MultipleObjectsBundle bundle = new MultipleObjectsBundle();
    // Secondly, look for columns to train the projection on.
    for (int r = 0; r < objects.metaLength(); r++) {
        SimpleTypeInformation<?> type = objects.meta(r);
        List<?> column = objects.getColumn(r);
        // Columns that are not number-vector fields are passed through unchanged.
        // NOTE(review): a `continue` presumably followed the append below -- not visible here.
        if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
            bundle.appendColumn(type, column);
        // `vectorcolumn` is the same list object as `column`, viewed as List<V>.
        @SuppressWarnings("unchecked") List<V> vectorcolumn = (List<V>) column;
        final VectorFieldTypeInformation<?> vtype = (VectorFieldTypeInformation<?>) type;
        @SuppressWarnings("unchecked") NumberVector.Factory<V> factory = (NumberVector.Factory<V>) vtype.getFactory();
        int dim = vtype.getDimensionality();
        // `tdim` is declared outside this block; it is lowered to the column's dimensionality
        // when it exceeds it, and that assignment persists beyond this column.
        if (tdim > dim) {
            if (getLogger().isVerbose()) {
                getLogger().verbose("Setting projection dimension to original dimension: projection dimension: " + tdim + " larger than original dimension: " + dim);
            tdim = dim;
        try {
            double[][] proj = computeProjectionMatrix(vectorcolumn, classcolumn, dim);
            // Replace each vector in place with its projection; since `vectorcolumn` aliases
            // `column`, the list appended below holds the projected vectors.
            for (int i = 0; i < dataLength; i++) {
                double[] pv = times(proj, vectorcolumn.get(i).toArray());
                vectorcolumn.set(i, factory.newNumberVector(pv));
            bundle.appendColumn(convertedType(type, factory), column);
            somesuccess = true;
        } catch (Exception e) {
            // On failure the column is appended with its original type information.
            // NOTE(review): entries already overwritten by the loop above before the exception
            // would stay overwritten -- confirm whether that can occur in practice.
            getLogger().error("Projection failed -- continuing with unprojected data!", e);
            bundle.appendColumn(type, column);
    if (!somesuccess) {
        getLogger().warning("No vector field of fixed dimensionality found.");
        return objects;
    return bundle;
Also used : MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) List(java.util.List) IntList(it.unimi.dsi.fastutil.ints.IntList) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList)

Example 54 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class ConcatenateFilesDatabaseConnection method loadData.

public MultipleObjectsBundle loadData() {
    // Reads every file in `files` with `parser` and collects the objects into one bundle whose
    // column 0 (type STRING) holds the path of the file each row came from. The parsed columns
    // are therefore stored one position to the right (parser column i -> bundle column i + 1).
    // The assembled bundle is finally passed through invokeBundleFilters(...).
    // NOTE(review): closing braces and several statements are missing from this listing
    // (apparently lost in extraction) -- confirm against the upstream source.
    MultipleObjectsBundle objects = new MultipleObjectsBundle();
    objects.appendColumn(TypeUtil.STRING, new ArrayList<>());
    for (File file : files) {
        String filestr = file.getPath();
        // try-with-resources: the stream is closed when the block is left.
        try (InputStream inputStream = // 
        FileUtil.tryGzipInput(new BufferedInputStream(new FileInputStream(file)))) {
            final BundleStreamSource source;
            // A streaming parser is used as the event source directly; otherwise the input is
            // parsed into a bundle first and that bundle is exposed as a stream.
            // NOTE(review): no call attaching `inputStream` to the streaming parser is visible
            // here -- presumably dropped from the listing; verify.
            if (parser instanceof StreamingParser) {
                final StreamingParser streamParser = (StreamingParser) parser;
                source = streamParser;
            } else {
                MultipleObjectsBundle parsingResult = parser.parse(inputStream);
                // normalize objects and transform labels
                source = parsingResult.asStream();
            // NullPointerException on invalid streams
            // (`meta` stays null until a META_CHANGED event has been seen.)
            BundleMeta meta = null;
            loop: for (Event e = source.nextEvent(); ; e = source.nextEvent()) {
                switch(e) {
                    case END_OF_STREAM:
                        break loop;
                    case META_CHANGED:
                        meta = source.getMeta();
                        for (int i = 0; i < meta.size(); i++) {
                            // Columns not yet present are appended; existing ones must be
                            // type-compatible with what this file delivers.
                            if (i + 1 >= objects.metaLength()) {
                                objects.appendColumn(meta.get(i), new ArrayList<>());
                            } else {
                                // Ensure compatibility:
                                if (!objects.meta(i + 1).isAssignableFromType(meta.get(i))) {
                                    throw new AbortException("Incompatible files loaded. Cannot concatenate with unaligned columns, please preprocess manually.");
                        // switch
                    case NEXT_OBJECT:
                        Object[] o = new Object[objects.metaLength()];
                        o[0] = filestr;
                        for (int i = 0; i < meta.size(); i++) {
                            // NOTE(review): the right-hand side is truncated in this listing;
                            // presumably `source.data(i)` -- verify against the upstream source.
                            o[i + 1] =;
                        // NOTE(review): appending row `o` to `objects` and the `break` for this
                        // case are not visible here -- presumably dropped; verify.
                        // switch
        } catch (IOException e) {
            throw new AbortException("Loading file " + filestr + " failed: " + e.toString(), e);
    // Invoke filters
    if (LOG.isDebugging()) {
        LOG.debugFine("Invoking filters.");
    return invokeBundleFilters(objects);
Also used : StreamingParser(de.lmu.ifi.dbs.elki.datasource.parser.StreamingParser) BundleMeta(de.lmu.ifi.dbs.elki.datasource.bundle.BundleMeta) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) BufferedInputStream(java.io.BufferedInputStream) Event(de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource.Event) File(java.io.File) BundleStreamSource(de.lmu.ifi.dbs.elki.datasource.bundle.BundleStreamSource) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 55 with MultipleObjectsBundle

use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.

the class LabelJoinDatabaseConnection method loadData.

public MultipleObjectsBundle loadData() {
    // Joins the bundles of all `sources` on their label column: the first bundle is the base,
    // its labels are indexed by row, and the non-label columns of every further bundle are
    // appended to it, with values placed in the row whose label matches. Returns the first
    // bundle, extended in place.
    // NOTE(review): this listing is heavily truncated -- closing braces, several loop bodies and
    // parts of expressions are missing (apparently lost in extraction). The hedged notes below
    // mark the affected spots; confirm all of them against the upstream source.
    List<MultipleObjectsBundle> bundles = new ArrayList<>(sources.size());
    // NOTE(review): loop body not visible; presumably loads each source's data into `bundles`.
    for (DatabaseConnection dbc : sources) {
    MultipleObjectsBundle first = bundles.get(0);
    // Maps a label to the row index of the first bundle that carries it.
    // NOTE(review): the `!= -1` and `< 0` checks below assume lookups of unknown keys yield -1;
    // the call configuring that default is not visible here -- verify.
    Object2IntOpenHashMap<String> labelmap = new Object2IntOpenHashMap<>(first.dataLength());
    // Process first bundle
        // Identify a label column
        final int lblcol = FilterUtil.findLabelColumn(first);
        if (lblcol == -1) {
            throw new AbortException("No label column found in first source, cannot join (do you want to use " + ExternalIDJoinDatabaseConnection.class.getSimpleName() + " instead?)");
        for (int i = 0; i < first.dataLength(); i++) {
            // NOTE(review): expression truncated; presumably `first.data(i, lblcol)` -- verify.
            Object data =, lblcol);
            if (data == null) {
                LOG.warning("Object without label encountered.");
            // Three label shapes are handled: a plain String, a LabelList (each entry is
            // registered for the row), and anything else via toString().
            if (data instanceof String) {
                int old = labelmap.put((String) data, i);
                if (old != -1) {
                    LOG.warning("Duplicate label encountered: " + data + " in rows " + old + " and " + i);
            } else if (data instanceof LabelList) {
                final LabelList ll = (LabelList) data;
                for (int j = 0; j < ll.size(); j++) {
                    String lbl = ll.get(j);
                    int old = labelmap.put(lbl, i);
                    if (old != -1) {
                        LOG.warning("Duplicate label encountered: " + lbl + " in rows " + old + " and " + i);
            } else {
                String lbl = data.toString();
                int old = labelmap.put(lbl, i);
                if (old != -1) {
                    LOG.warning("Duplicate label encountered: " + lbl + " in rows " + old + " and " + i);
    // Process additional columns
    for (int c = 1; c < sources.size(); c++) {
        MultipleObjectsBundle cur = bundles.get(c);
        final int lblcol = FilterUtil.findLabelColumn(cur);
        if (lblcol == -1) {
            throw new AbortException("No label column found in source " + (c + 1) + ", cannot join (do you want to use " + ExternalIDJoinDatabaseConnection.class.getSimpleName() + " instead?)");
        // Destination columns
        List<ArrayList<Object>> dcol = new ArrayList<>(cur.metaLength());
        for (int i = 0; i < cur.metaLength(); i++) {
            // Skip the label columns
            // NOTE(review): body of this `if` not visible; verify what is stored in `dcol` for
            // the label column and that the rest of the iteration is skipped.
            if (i == lblcol) {
            ArrayList<Object> newcol = new ArrayList<>(first.dataLength());
            // Pre-fill with nulls.
            // NOTE(review): loop body and the registration of `newcol` in `dcol` are not visible.
            for (int j = 0; j < first.dataLength(); j++) {
            first.appendColumn(cur.meta(i), newcol);
        for (int i = 0; i < cur.dataLength(); i++) {
            // NOTE(review): expression truncated; presumably `cur.data(i, lblcol)` -- verify.
            Object data =, lblcol);
            if (data == null) {
                LOG.warning("Object without label encountered.");
            // Resolve the target row in the first bundle from this object's label.
            int row = -1;
            if (data instanceof String) {
                row = labelmap.getInt(data);
            } else if (data instanceof LabelList) {
                final LabelList ll = (LabelList) data;
                for (int j = 0; j < ll.size(); j++) {
                    row = labelmap.getInt(ll.get(j));
                    // NOTE(review): body not visible; presumably stops at the first label found.
                    if (row >= 0) {
            } else {
                row = labelmap.getInt(data.toString());
            if (row < 0) {
                LOG.warning("Label not found for join: " + data + " in row " + i);
            // Copy this object's non-label values into the matching row of the new columns.
            for (int d = 0; d < cur.metaLength(); d++) {
                // NOTE(review): body not visible; presumably skips the label column.
                if (d == lblcol) {
                List<Object> col = dcol.get(d);
                assert (col != null);
                // NOTE(review): second argument truncated; presumably `cur.data(i, d)` -- verify.
                col.set(row,, d));
    // Final pass over the joined bundle: log a warning, including a dump of the row, for each
    // null cell that remains.
    for (int i = 0; i < first.dataLength(); i++) {
        for (int d = 0; d < first.metaLength(); d++) {
            // NOTE(review): expression truncated; presumably `first.data(i, d)` -- verify.
            if (, d) == null) {
                StringBuilder buf = new StringBuilder();
                for (int d2 = 0; d2 < first.metaLength(); d2++) {
                    if (buf.length() > 0) {
                        buf.append(", ");
                    // NOTE(review): expressions truncated here and two lines below (presumably
                    // `first.data(i, d2)`); the null branch's body is not visible either.
                    if (, d2) == null) {
                    } else {
                        buf.append(, d2));
                LOG.warning("null value in joined data, row " + i + " column " + d + FormatUtil.NEWLINE + "[" + buf.toString() + "]");
    return first;
Also used : LabelList(de.lmu.ifi.dbs.elki.data.LabelList) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList) Object2IntOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)


MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)72 AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)37 Test (org.junit.Test)37 DoubleVector ( ArrayList (java.util.ArrayList)19 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)13 NumberVector ( ELKIBuilder (de.lmu.ifi.dbs.elki.utilities.ELKIBuilder)10 VectorFieldTypeInformation ( MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)8 List (java.util.List)7 LabelList ( SimpleTypeInformation ( InputStreamDatabaseConnection (de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection)5 InputStream ( ClassLabel ( TypeInformation ( DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)4 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)4 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)4