Search in sources:

Example 1 with ExternalID

use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.

Class ExternalIDFilter, method filter.

/**
 * Split the first label-list column of the bundle into an external ID column
 * and a column with the remaining labels.
 *
 * All other columns (including any further label-list columns) are copied to
 * the output unchanged. The label at position {@code externalIdIndex} becomes
 * the {@link ExternalID} of the row; a negative index is resolved relative to
 * the end of the label list (-1 is the last label).
 *
 * @param objects Input bundle
 * @return New bundle with the external ID column inserted in place of the
 *         label-list column, followed by the remaining labels
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
    MultipleObjectsBundle bundle = new MultipleObjectsBundle();
    // Find a labellist column
    boolean done = false;
    boolean keeplabelcol = false;
    for (int i = 0; i < objects.metaLength(); i++) {
        SimpleTypeInformation<?> meta = objects.meta(i);
        // Skip non-labellist columns - or if we already had a labellist
        if (done || !LabelList.class.equals(meta.getRestrictionClass())) {
            bundle.appendColumn(meta, objects.getColumn(i));
            continue;
        }
        done = true;
        // We split the label column into two parts
        List<ExternalID> eidcol = new ArrayList<>(objects.dataLength());
        List<LabelList> lblcol = new ArrayList<>(objects.dataLength());
        // Reusable buffer for the labels that remain after removing the ID
        ArrayList<String> lbuf = new ArrayList<>();
        for (Object obj : objects.getColumn(i)) {
            if (obj != null) {
                LabelList ll = (LabelList) obj;
                // Negative indexes count from the end of the list. The previous
                // "size - index" form always pointed past the end for negative
                // values and therefore could never succeed.
                int off = externalIdIndex >= 0 ? externalIdIndex : (ll.size() + externalIdIndex);
                eidcol.add(new ExternalID(ll.get(off)));
                lbuf.clear();
                for (int j = 0; j < ll.size(); j++) {
                    if (j == off) {
                        continue;
                    }
                    lbuf.add(ll.get(j));
                }
                lblcol.add(LabelList.make(lbuf));
                // NOTE(review): this tests the original list, which is non-empty
                // whenever ll.get(off) succeeded - presumably the remaining
                // labels (lbuf) were meant. Left unchanged; confirm intent.
                if (ll.size() > 0) {
                    keeplabelcol = true;
                }
            } else {
                // Keep rows aligned: a missing label list yields no ID and no labels
                eidcol.add(null);
                lblcol.add(null);
            }
        }
        bundle.appendColumn(TypeUtil.EXTERNALID, eidcol);
        // Only add the label column when it's not empty.
        if (keeplabelcol) {
            bundle.appendColumn(meta, lblcol);
        }
    }
    return bundle;
}
Also used : ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) LabelList(de.lmu.ifi.dbs.elki.data.LabelList) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList)

Example 2 with ExternalID

use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.

Class ArffParser, method loadDenseInstance.

/**
 * Read one dense instance from the tokenizer, starting at its current token.
 *
 * For every output column the tokens belonging to it are consumed and turned
 * into the matching object: a number vector, a label list, an external ID or
 * a class label.
 *
 * @param tokenizer Tokenizer positioned at the first token of the instance
 * @param dimsize Number of tokens to consume per output column (used for
 *        vector and label list columns)
 * @param etyp Type of each output column
 * @param outdim Number of output columns
 * @return One object per output column
 * @throws IOException on read errors of the tokenizer
 */
private Object[] loadDenseInstance(StreamTokenizer tokenizer, int[] dimsize, TypeInformation[] etyp, int outdim) throws IOException {
    final Object[] row = new Object[outdim];
    for (int col = 0; col < outdim; col++) {
        final TypeInformation type = etyp[col];
        if (TypeUtil.NUMBER_VECTOR_FIELD.equals(type)) {
            // A vector column spans several successive tokens
            final double[] values = new double[dimsize[col]];
            int k = 0;
            while (k < dimsize[col]) {
                if (tokenizer.ttype == '?') {
                    // ARFF marker for a missing value
                    values[k] = Double.NaN;
                } else {
                    if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
                        throw new AbortException("Expected word token, got: " + tokenizer.toString());
                    }
                    try {
                        values[k] = ParseUtil.parseDouble(tokenizer.sval);
                    } catch (NumberFormatException e) {
                        throw new AbortException("Expected number value, got: " + tokenizer.sval);
                    }
                }
                nextToken(tokenizer);
                k++;
            }
            row[col] = denseFactory.newNumberVector(values);
            continue;
        }
        if (TypeUtil.LABELLIST.equals(type)) {
            // Collect successive tokens into the shared label buffer
            labels.clear();
            int k = 0;
            while (k < dimsize[col]) {
                if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
                    throw new AbortException("Expected word token, got: " + tokenizer.toString());
                }
                labels.add(tokenizer.sval);
                nextToken(tokenizer);
                k++;
            }
            row[col] = LabelList.make(labels);
            continue;
        }
        if (TypeUtil.EXTERNALID.equals(type)) {
            if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
                throw new AbortException("Expected word token, got: " + tokenizer.toString());
            }
            row[col] = new ExternalID(tokenizer.sval);
            nextToken(tokenizer);
            continue;
        }
        if (TypeUtil.CLASSLABEL.equals(type)) {
            if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
                throw new AbortException("Expected word token, got: " + tokenizer.toString());
            }
            // TODO: support other class label types.
            final ClassLabel classLabel = new SimpleClassLabel(tokenizer.sval);
            row[col] = classLabel;
            nextToken(tokenizer);
            continue;
        }
        // None of the supported column types matched
        final String typeName;
        if (type != null) {
            typeName = type.toString();
        } else {
            typeName = "null";
        }
        throw new AbortException("Unsupported type for column " + "->" + col + ": " + typeName);
    }
    return row;
}
Also used : SimpleClassLabel(de.lmu.ifi.dbs.elki.data.SimpleClassLabel) ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) SimpleClassLabel(de.lmu.ifi.dbs.elki.data.SimpleClassLabel) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 3 with ExternalID

use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.

Class ArffParser, method setupBundleHeaders.

/**
 * Setup the headers for the object bundle.
 *
 * Consecutive input attributes that map to the same target column are merged
 * into one output column. The attribute names of that run are used as the
 * dimension labels (vectors), as a space-joined label (label lists), or as
 * the column label (external ID, class label).
 *
 * @param names Attribute names, one per input attribute (parallel to targ)
 * @param targ Target columns, one per input attribute
 * @param etyp ELKI type information, one per output column
 * @param dimsize Number of dimensions in the individual types, one per output
 *        column
 * @param bundle Output bundle
 * @param sparse Flag to create sparse vectors
 */
private void setupBundleHeaders(ArrayList<String> names, int[] targ, TypeInformation[] etyp, int[] dimsize, MultipleObjectsBundle bundle, boolean sparse) {
    for (int in = 0, out = 0; in < targ.length; out++) {
        // Find the end of the run of input attributes sharing this target column
        int nin = in + 1;
        for (; nin < targ.length; nin++) {
            if (targ[nin] != targ[in]) {
                break;
            }
        }
        // Names are indexed by input attribute, so they must be addressed with
        // "in", not with the output column counter "out": the two diverge as
        // soon as an earlier output column spans more than one attribute.
        if (TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) {
            String[] labels = new String[dimsize[out]];
            // Collect labels:
            for (int i = 0; i < dimsize[out]; i++) {
                labels[i] = names.get(in + i);
            }
            if (!sparse) {
                VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dimsize[out], labels);
                bundle.appendColumn(type, new ArrayList<DoubleVector>());
            } else {
                VectorFieldTypeInformation<SparseDoubleVector> type = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dimsize[out], labels);
                bundle.appendColumn(type, new ArrayList<SparseDoubleVector>());
            }
        } else if (TypeUtil.LABELLIST.equals(etyp[out])) {
            // Join the names of all merged attributes into a single column label
            StringBuilder label = new StringBuilder(names.get(in));
            for (int i = 1; i < dimsize[out]; i++) {
                label.append(' ').append(names.get(in + i));
            }
            bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>());
        } else if (TypeUtil.EXTERNALID.equals(etyp[out])) {
            bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(in)), new ArrayList<ExternalID>());
        } else if (TypeUtil.CLASSLABEL.equals(etyp[out])) {
            bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(in)), new ArrayList<ClassLabel>());
        } else {
            throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null"));
        }
        // Exactly one bundle column must have been added per output column
        assert (out == bundle.metaLength() - 1);
        in = nin;
    }
}
Also used : ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) ArrayList(java.util.ArrayList) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 4 with ExternalID

use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.

Class ArffParser, method loadSparseInstance.

/**
 * Read one sparse instance ("index value" pairs up to the closing brace) and
 * convert it into one object per output column.
 *
 * The pairs are first collected into a map keyed by input attribute index;
 * values of number vector attributes are parsed to {@link Double}, all others
 * are kept as strings. Afterwards each output column is assembled from the
 * entries in its range of input attributes.
 *
 * @param tokenizer Tokenizer; the next token read is the first of the instance
 * @param targ Target column of each input attribute
 * @param dimsize Number of input attributes merged into each output column
 * @param elkitypes Type of each output column
 * @param metaLength Number of output columns
 * @return One object per output column
 * @throws IOException on read errors of the tokenizer
 */
private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
    Int2ObjectOpenHashMap<Object> map = new Int2ObjectOpenHashMap<>();
    while (true) {
        nextToken(tokenizer);
        assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
        if (tokenizer.ttype == '}') {
            nextToken(tokenizer);
            assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL);
            break;
        } else {
            // sparse token
            if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
                throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
            }
            int dim = ParseUtil.parseIntBase10(tokenizer.sval);
            if (map.containsKey(dim)) {
                throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
            }
            nextToken(tokenizer);
            if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
                // Numeric attributes are stored as Double, everything else as String
                map.put(dim, //
                TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]]) ? (Double) ParseUtil.parseDouble(tokenizer.sval) : tokenizer.sval);
            } else {
                throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
            }
        }
    }
    Object[] data = new Object[metaLength];
    for (int out = 0; out < metaLength; out++) {
        // Find the first index
        int s = -1;
        for (int i = 0; i < targ.length; i++) {
            if (targ[i] == out && s < 0) {
                s = i;
                break;
            }
        }
        assert (s >= 0);
        if (TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
            Int2DoubleOpenHashMap f = new Int2DoubleOpenHashMap(dimsize[out]);
            for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
                Int2ObjectMap.Entry<Object> entry = iter.next();
                int i = entry.getIntKey();
                // Only entries within this column's attribute range belong to the vector
                if (i < s || i >= s + dimsize[out]) {
                    continue;
                }
                double v = ((Double) entry.getValue()).doubleValue();
                f.put(i - s, v);
            }
            data[out] = new SparseDoubleVector(f, dimsize[out]);
        } else if (TypeUtil.LABELLIST.equals(elkitypes[out])) {
            // Build a label list out of successive labels.
            // The hash map has no defined iteration order, so scanning its
            // entries and stopping at the first key beyond the range could
            // drop labels and add them in arbitrary order. Look up the keys of
            // the range in ascending order instead.
            labels.clear();
            for (int i = s, end = s + dimsize[out]; i < end; i++) {
                Object val = map.get(i);
                if (val == null) {
                    continue;
                }
                // A gap before this label means positions shift in the result
                if (labels.size() < i - s) {
                    LOG.warning("Sparse consecutive labels are currently not correctly supported.");
                }
                labels.add((String) val);
            }
            data[out] = LabelList.make(labels);
        } else if (TypeUtil.EXTERNALID.equals(elkitypes[out])) {
            String val = (String) map.get(s);
            if (val == null) {
                throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
            }
            data[out] = new ExternalID(val);
        } else if (TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
            Object val = map.get(s);
            if (val == null) {
                throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString());
            }
            // TODO: support other class label types.
            ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
            data[out] = lbl;
        } else {
            throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null"));
        }
    }
    return data;
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) Int2ObjectMap(it.unimi.dsi.fastutil.ints.Int2ObjectMap) SimpleClassLabel(de.lmu.ifi.dbs.elki.data.SimpleClassLabel) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) ObjectIterator(it.unimi.dsi.fastutil.objects.ObjectIterator) SimpleClassLabel(de.lmu.ifi.dbs.elki.data.SimpleClassLabel) ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) Int2DoubleOpenHashMap(it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 5 with ExternalID

use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.

Class ExternalIDJoinDatabaseConnection, method loadData.

/**
 * Load all sources and join them onto the first one by external ID.
 *
 * The rows of the first bundle define the result rows. For every further
 * source, each of its columns except the external ID column is appended to
 * the first bundle (pre-filled with nulls) and filled for the rows whose
 * external ID is found in the first bundle. Rows that still contain a null
 * afterwards are reported with a warning.
 *
 * @return The first bundle, extended by the joined columns
 */
@Override
public MultipleObjectsBundle loadData() {
    List<MultipleObjectsBundle> bundles = new ArrayList<>(sources.size());
    for (DatabaseConnection dbc : sources) {
        bundles.add(dbc.loadData());
    }
    MultipleObjectsBundle first = bundles.get(0);
    // Maps each external ID of the primary source to its row; -1 means "unknown"
    Object2IntOpenHashMap<ExternalID> labelmap = new Object2IntOpenHashMap<>(first.dataLength());
    labelmap.defaultReturnValue(-1);

    // Index the primary source
    final int primaryIdCol = findExternalIdColumn(first);
    if (primaryIdCol < 0) {
        throw new AbortException("No external ID column found in primary source.");
    }
    for (int row = 0; row < first.dataLength(); row++) {
        ExternalID id = (ExternalID) first.data(row, primaryIdCol);
        if (id == null) {
            LOG.debug("Object without ID encountered.");
            continue;
        }
        int previous = labelmap.put(id, row);
        if (previous != -1) {
            LOG.debug("Duplicate id encountered: " + id + " in rows " + previous + " and " + row);
        }
    }

    // Join every additional source onto the primary one
    for (int c = 1; c < sources.size(); c++) {
        MultipleObjectsBundle cur = bundles.get(c);
        final int idCol = findExternalIdColumn(cur);
        if (idCol < 0) {
            // Report the column types we did find to ease debugging
            StringBuilder found = new StringBuilder();
            for (int i = 0; i < cur.metaLength(); i++) {
                if (found.length() > 0) {
                    found.append(',');
                }
                found.append(cur.meta(i));
            }
            throw new AbortException("No external ID column found in source " + (c + 1) + " to join with. Got: " + found.toString());
        }
        // Destination columns, indexed like the columns of cur (null at idCol)
        List<ArrayList<Object>> targets = new ArrayList<>(cur.metaLength());
        for (int i = 0; i < cur.metaLength(); i++) {
            if (i != idCol) {
                ArrayList<Object> filler = new ArrayList<>(first.dataLength());
                // Pre-fill with nulls so rows can be set by index later
                for (int j = 0; j < first.dataLength(); j++) {
                    filler.add(null);
                }
                first.appendColumn(cur.meta(i), filler);
                targets.add(filler);
            } else {
                // The ID column itself is not copied
                targets.add(null);
            }
        }
        for (int i = 0; i < cur.dataLength(); i++) {
            ExternalID id = (ExternalID) cur.data(i, idCol);
            if (id == null) {
                LOG.warning("Object without label encountered.");
                continue;
            }
            int row = labelmap.getInt(id);
            if (row == -1) {
                LOG.debug("ID not found for join: " + id + " in row " + i);
                continue;
            }
            for (int d = 0; d < cur.metaLength(); d++) {
                if (d != idCol) {
                    List<Object> target = targets.get(d);
                    assert (target != null);
                    target.set(row, cur.data(i, d));
                }
            }
        }
    }

    // Warn once per row that still has a missing value after joining
    for (int i = 0; i < first.dataLength(); i++) {
        for (int d = 0; d < first.metaLength(); d++) {
            if (first.data(i, d) != null) {
                continue;
            }
            StringBuilder rowtext = new StringBuilder();
            for (int d2 = 0; d2 < first.metaLength(); d2++) {
                if (rowtext.length() > 0) {
                    rowtext.append(", ");
                }
                Object cell = first.data(i, d2);
                if (cell != null) {
                    rowtext.append(cell);
                } else {
                    rowtext.append("null");
                }
            }
            LOG.warning("null value in joined data, row " + i + " column " + d + FormatUtil.NEWLINE + "[" + rowtext.toString() + "]");
            break;
        }
    }
    return first;
}

/**
 * Find the first column of a bundle that holds external IDs.
 *
 * @param bundle Bundle to inspect
 * @return Index of the first external ID column, or -1 if there is none
 */
private static int findExternalIdColumn(MultipleObjectsBundle bundle) {
    for (int i = 0; i < bundle.metaLength(); i++) {
        if (TypeUtil.EXTERNALID.isAssignableFromType(bundle.meta(i))) {
            return i;
        }
    }
    return -1;
}
Also used : ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) ArrayList(java.util.ArrayList) Object2IntOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Aggregations

ExternalID (de.lmu.ifi.dbs.elki.data.ExternalID)6 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)4 ArrayList (java.util.ArrayList)4 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)2 SimpleClassLabel (de.lmu.ifi.dbs.elki.data.SimpleClassLabel)2 SparseDoubleVector (de.lmu.ifi.dbs.elki.data.SparseDoubleVector)2 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)2 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)1 LabelList (de.lmu.ifi.dbs.elki.data.LabelList)1 Polygon (de.lmu.ifi.dbs.elki.data.spatial.Polygon)1 PolygonsObject (de.lmu.ifi.dbs.elki.data.spatial.PolygonsObject)1 SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)1 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)1 Int2DoubleOpenHashMap (it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap)1 Int2ObjectMap (it.unimi.dsi.fastutil.ints.Int2ObjectMap)1 Int2ObjectOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap)1 Object2IntOpenHashMap (it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap)1 ObjectIterator (it.unimi.dsi.fastutil.objects.ObjectIterator)1 Matcher (java.util.regex.Matcher)1