Search in sources :

Example 1 with SparseDoubleVector

use of de.lmu.ifi.dbs.elki.data.SparseDoubleVector in project elki by elki-project.

In the class ArffParser, method setupBundleHeaders:

/**
 * Setup the headers for the object bundle.
 * <p>
 * The input attributes are processed in runs of consecutive columns that share
 * the same target column; each run yields exactly one column in the bundle,
 * typed according to {@code etyp} for that target column.
 *
 * @param names Attribute names, one per input column (parallel to {@code targ})
 * @param targ Target columns, one entry per input column
 * @param etyp ELKI type information, one entry per output column
 * @param dimsize Number of dimensions in the individual types, one entry per output column
 * @param bundle Output bundle
 * @param sparse Flag to create sparse vectors
 * @throws AbortException if the type of an output column is not supported
 */
private void setupBundleHeaders(ArrayList<String> names, int[] targ, TypeInformation[] etyp, int[] dimsize, MultipleObjectsBundle bundle, boolean sparse) {
    for (int in = 0, out = 0; in < targ.length; out++) {
        // Advance nin to the first input column that maps to a different target.
        int nin = in + 1;
        while (nin < targ.length && targ[nin] == targ[in]) {
            nin++;
        }
        // Note: names are looked up relative to the first *input* column of this
        // run (in), not the output index (out). The two only coincide as long as
        // no earlier output column has consumed more than one input attribute.
        if (TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) {
            // Collect per-dimension labels:
            String[] labels = new String[dimsize[out]];
            for (int i = 0; i < dimsize[out]; i++) {
                labels[i] = names.get(in + i);
            }
            if (!sparse) {
                VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dimsize[out], labels);
                bundle.appendColumn(type, new ArrayList<DoubleVector>());
            } else {
                VectorFieldTypeInformation<SparseDoubleVector> type = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dimsize[out], labels);
                bundle.appendColumn(type, new ArrayList<SparseDoubleVector>());
            }
        } else if (TypeUtil.LABELLIST.equals(etyp[out])) {
            // Join the names of all merged attributes into a single column label.
            StringBuilder label = new StringBuilder(names.get(in));
            for (int i = 1; i < dimsize[out]; i++) {
                label.append(' ').append(names.get(in + i));
            }
            bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>());
        } else if (TypeUtil.EXTERNALID.equals(etyp[out])) {
            bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(in)), new ArrayList<ExternalID>());
        } else if (TypeUtil.CLASSLABEL.equals(etyp[out])) {
            bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(in)), new ArrayList<ClassLabel>());
        } else {
            throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null"));
        }
        // Every iteration must have appended exactly one column.
        assert (out == bundle.metaLength() - 1);
        in = nin;
    }
}
Also used : ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) ArrayList(java.util.ArrayList) SimpleTypeInformation(de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) VectorFieldTypeInformation(de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 2 with SparseDoubleVector

use of de.lmu.ifi.dbs.elki.data.SparseDoubleVector in project elki by elki-project.

In the class ArffParser, method loadSparseInstance:

/**
 * Read the remainder of a sparse instance and convert it into one object per
 * output column.
 * <p>
 * The tokenizer is expected to deliver a sequence of "index value" word pairs
 * terminated by a closing brace. The pairs are buffered by input column index
 * first, then distributed to the output columns according to {@code targ}.
 *
 * @param tokenizer Tokenizer to read from
 * @param targ Target (output) column for each input column
 * @param dimsize Number of input columns merged into each output column
 * @param elkitypes Type information for each output column
 * @param metaLength Number of output columns
 * @return Array of length {@code metaLength} with one object per output column
 * @throws IOException passed on from reading tokens
 * @throws AbortException on unexpected tokens, duplicate or out-of-range
 *         attribute indexes, missing external id / class label, or an
 *         unsupported column type
 */
private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
    // Phase 1: buffer all "index value" pairs, keyed by input column index.
    Int2ObjectOpenHashMap<Object> map = new Int2ObjectOpenHashMap<>();
    while (true) {
        nextToken(tokenizer);
        assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
        if (tokenizer.ttype == '}') {
            // End of the sparse instance; nothing but a line end may follow.
            nextToken(tokenizer);
            assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL);
            break;
        }
        // sparse token
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
            throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
        }
        int dim = ParseUtil.parseIntBase10(tokenizer.sval);
        // The index comes straight from the input; reject it before it is used
        // to index targ below.
        if (dim < 0 || dim >= targ.length) {
            throw new AbortException("Attribute index out of range in sparse vector: " + tokenizer.toString());
        }
        if (map.containsKey(dim)) {
            throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
        }
        nextToken(tokenizer);
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
            throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
        }
        // Numeric columns are stored as Double, everything else as the raw string.
        map.put(dim, //
        TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]]) ? (Double) ParseUtil.parseDouble(tokenizer.sval) : tokenizer.sval);
    }
    // Phase 2: assemble one object per output column.
    Object[] data = new Object[metaLength];
    for (int out = 0; out < metaLength; out++) {
        // Find the first input index mapped to this output column.
        int s = -1;
        for (int i = 0; i < targ.length; i++) {
            if (targ[i] == out) {
                s = i;
                break;
            }
        }
        assert (s >= 0);
        if (TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
            Int2DoubleOpenHashMap f = new Int2DoubleOpenHashMap(dimsize[out]);
            for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
                Int2ObjectMap.Entry<Object> entry = iter.next();
                int i = entry.getIntKey();
                // Skip entries that belong to other output columns.
                if (i < s || i >= s + dimsize[out]) {
                    continue;
                }
                double v = ((Double) entry.getValue()).doubleValue();
                // Re-base the key to the dimension within this vector.
                f.put(i - s, v);
            }
            data[out] = new SparseDoubleVector(f, dimsize[out]);
        } else if (TypeUtil.LABELLIST.equals(elkitypes[out])) {
            // Build a label list out of successive labels.
            // The hash map has no defined iteration order, so walk the index range
            // of this column explicitly; this keeps labels in attribute order and
            // cannot skip entries.
            labels.clear();
            for (int i = s, end = s + dimsize[out]; i < end; i++) {
                Object val = map.get(i);
                if (val == null) {
                    continue;
                }
                // A gap before this label means positions cannot be preserved.
                if (labels.size() < i - s) {
                    LOG.warning("Sparse consecutive labels are currently not correctly supported.");
                }
                labels.add((String) val);
            }
            data[out] = LabelList.make(labels);
        } else if (TypeUtil.EXTERNALID.equals(elkitypes[out])) {
            String val = (String) map.get(s);
            if (val == null) {
                throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
            }
            data[out] = new ExternalID(val);
        } else if (TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
            Object val = map.get(s);
            if (val == null) {
                throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString());
            }
            // TODO: support other class label types.
            ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
            data[out] = lbl;
        } else {
            throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null"));
        }
    }
    return data;
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) ExternalID(de.lmu.ifi.dbs.elki.data.ExternalID) Int2ObjectMap(it.unimi.dsi.fastutil.ints.Int2ObjectMap) SimpleClassLabel(de.lmu.ifi.dbs.elki.data.SimpleClassLabel) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) ObjectIterator(it.unimi.dsi.fastutil.objects.ObjectIterator) SimpleClassLabel(de.lmu.ifi.dbs.elki.data.SimpleClassLabel) ClassLabel(de.lmu.ifi.dbs.elki.data.ClassLabel) Int2DoubleOpenHashMap(it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap) AbortException(de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)

Example 3 with SparseDoubleVector

use of de.lmu.ifi.dbs.elki.data.SparseDoubleVector in project elki by elki-project.

In the class TermFrequencyParserTest, method testDBLPData:

/**
 * Load the DBLP test data with the term frequency parser into a database and
 * compare dense Euclidean, sparse Euclidean and arc cosine distances on the
 * first three vectors: self-distance, symmetry, and stored reference values.
 *
 * @throws IOException if the test data cannot be opened or closed
 */
@Test
public void testDBLPData() throws IOException {
    // Close the input stream once the test is done, also on assertion failure.
    try (InputStream is = AbstractSimpleAlgorithmTest.open(DBLP_DATA)) {
        // Setup parser and data loading
        TermFrequencyParser<SparseDoubleVector> parser = new TermFrequencyParser<>(false, SparseDoubleVector.FACTORY);
        InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, null, parser);
        ListParameterization config = new ListParameterization();
        config.addParameter(AbstractDatabase.Parameterizer.DATABASE_CONNECTION_ID, dbc);
        Database db = ClassGenericsUtil.parameterizeOrAbort(StaticArrayDatabase.class, config);
        if (config.hasUnusedParameters()) {
            fail("Unused parameters: " + config.getRemainingParameters());
        }
        if (config.hasErrors()) {
            config.logAndClearReportedErrors();
            fail("Parameterization errors.");
        }
        db.initialize();
        Relation<SparseNumberVector> rel = db.getRelation(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH);
        // Get first three objects:
        DBIDIter iter = rel.iterDBIDs();
        SparseNumberVector v1 = rel.get(iter);
        iter.advance();
        SparseNumberVector v2 = rel.get(iter);
        iter.advance();
        SparseNumberVector v3 = rel.get(iter);
        // "Dense" euclidean distance:
        double euclid1_12 = EuclideanDistanceFunction.STATIC.distance(v1, v2);
        double euclid1_13 = EuclideanDistanceFunction.STATIC.distance(v1, v3);
        double euclid1_23 = EuclideanDistanceFunction.STATIC.distance(v2, v3);
        double euclid1_21 = EuclideanDistanceFunction.STATIC.distance(v2, v1);
        // Sparse euclidean distance:
        double euclid2_12 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v2);
        double euclid2_13 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v3);
        double euclid2_23 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v3);
        double euclid2_21 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v1);
        // (Auto-switching) angular distance:
        double arccos_12 = ArcCosineDistanceFunction.STATIC.distance(v1, v2);
        double arccos_13 = ArcCosineDistanceFunction.STATIC.distance(v1, v3);
        double arccos_23 = ArcCosineDistanceFunction.STATIC.distance(v2, v3);
        double arccos_21 = ArcCosineDistanceFunction.STATIC.distance(v2, v1);
        // Note: the tolerances used below (Double.MIN_VALUE, 1e-20) are smaller than
        // the spacing of doubles at these magnitudes, so these checks effectively
        // require exact equality.
        // Self-distance must be zero:
        assertEquals("Euclidean self-distance is not 0.", 0., EuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
        assertEquals("Sparse Euclidean self-distance is not 0.", 0., SparseEuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
        assertEquals("Arccos self-distance is not 0.", 0., ArcCosineDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
        // Distances must be symmetric:
        assertEquals("Euclidean distance not symmetric.", euclid1_12, euclid1_21, Double.MIN_VALUE);
        assertEquals("Sparse Euclidean distance not symmetric.", euclid2_12, euclid2_21, Double.MIN_VALUE);
        assertEquals("Arccos distance not symmetric.", arccos_12, arccos_21, Double.MIN_VALUE);
        // Reference values; dense and sparse Euclidean must agree:
        assertEquals("Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid1_12, 1e-20);
        assertEquals("Sparse Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid2_12, 1e-20);
        assertEquals("Arccos distance 1-2 not as expected.", 0.1901934493141418, arccos_12, 1e-20);
        assertEquals("Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid1_13, 1e-20);
        assertEquals("Sparse Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid2_13, 1e-20);
        assertEquals("Arccos distance 1-3 not as expected.", 0.18654347641726046, arccos_13, 1e-20);
        assertEquals("Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid1_23, 1e-20);
        assertEquals("Sparse Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid2_23, 1e-20);
        assertEquals("Arccos distance 2-3 not as expected.", 0.11138352337990569, arccos_23, 1e-20);
    }
}
Also used : InputStream(java.io.InputStream) Database(de.lmu.ifi.dbs.elki.database.Database) AbstractDatabase(de.lmu.ifi.dbs.elki.database.AbstractDatabase) StaticArrayDatabase(de.lmu.ifi.dbs.elki.database.StaticArrayDatabase) SparseNumberVector(de.lmu.ifi.dbs.elki.data.SparseNumberVector) InputStreamDatabaseConnection(de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) ListParameterization(de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Test(org.junit.Test) AbstractSimpleAlgorithmTest(de.lmu.ifi.dbs.elki.algorithm.AbstractSimpleAlgorithmTest)

Aggregations

SparseDoubleVector (de.lmu.ifi.dbs.elki.data.SparseDoubleVector)3 ExternalID (de.lmu.ifi.dbs.elki.data.ExternalID)2 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)2 AbstractSimpleAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.AbstractSimpleAlgorithmTest)1 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)1 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)1 SimpleClassLabel (de.lmu.ifi.dbs.elki.data.SimpleClassLabel)1 SparseNumberVector (de.lmu.ifi.dbs.elki.data.SparseNumberVector)1 SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation)1 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)1 AbstractDatabase (de.lmu.ifi.dbs.elki.database.AbstractDatabase)1 Database (de.lmu.ifi.dbs.elki.database.Database)1 StaticArrayDatabase (de.lmu.ifi.dbs.elki.database.StaticArrayDatabase)1 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)1 InputStreamDatabaseConnection (de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection)1 ListParameterization (de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization)1 Int2DoubleOpenHashMap (it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap)1 Int2ObjectMap (it.unimi.dsi.fastutil.ints.Int2ObjectMap)1 Int2ObjectOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap)1 ObjectIterator (it.unimi.dsi.fastutil.objects.ObjectIterator)1