Use of de.lmu.ifi.dbs.elki.data.SparseDoubleVector in project elki by elki-project.
Class ArffParser, method setupBundleHeaders.
/**
 * Setup the headers for the object bundle.
 *
 * Runs of consecutive input attributes that share the same target column are
 * treated as one output column; exactly one bundle column is appended per
 * output column.
 *
 * @param names Attribute names (one per input attribute)
 * @param targ Target columns (output column of each input attribute)
 * @param etyp ELKI type information (per output column)
 * @param dimsize Number of dimensions in the individual types (per output
 *        column)
 * @param bundle Output bundle
 * @param sparse Flag to create sparse vectors
 * @throws AbortException if an output column has an unsupported type
 */
private void setupBundleHeaders(ArrayList<String> names, int[] targ, TypeInformation[] etyp, int[] dimsize, MultipleObjectsBundle bundle, boolean sparse) {
  for (int in = 0, out = 0; in < targ.length; out++) {
    // Advance to the first input attribute that maps to a different target.
    int nin = in + 1;
    while (nin < targ.length && targ[nin] == targ[in]) {
      nin++;
    }
    final TypeInformation type = etyp[out];
    final int dim = dimsize[out];
    if (TypeUtil.NUMBER_VECTOR_FIELD.equals(type)) {
      // Collect labels. The names are per input attribute, so they have to be
      // addressed relative to "in"; using "out" picks the wrong names as soon
      // as an earlier output column spans more than one input attribute.
      String[] labels = new String[dim];
      for (int i = 0; i < dim; i++) {
        labels[i] = names.get(in + i);
      }
      if (!sparse) {
        VectorFieldTypeInformation<DoubleVector> vtype = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dim, labels);
        bundle.appendColumn(vtype, new ArrayList<DoubleVector>());
      } else {
        VectorFieldTypeInformation<SparseDoubleVector> vtype = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dim, labels);
        bundle.appendColumn(vtype, new ArrayList<SparseDoubleVector>());
      }
    } else if (TypeUtil.LABELLIST.equals(type)) {
      // Join the names of all merged label attributes with spaces.
      StringBuilder label = new StringBuilder(names.get(in));
      for (int i = 1; i < dim; i++) {
        label.append(' ').append(names.get(in + i));
      }
      bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>());
    } else if (TypeUtil.EXTERNALID.equals(type)) {
      bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(in)), new ArrayList<ExternalID>());
    } else if (TypeUtil.CLASSLABEL.equals(type)) {
      bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(in)), new ArrayList<ClassLabel>());
    } else {
      throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null"));
    }
    // Every iteration must have appended exactly one column.
    assert (out == bundle.metaLength() - 1);
    in = nin;
  }
}
Use of de.lmu.ifi.dbs.elki.data.SparseDoubleVector in project elki by elki-project.
Class ArffParser, method loadSparseInstance.
/**
 * Read one sparse instance from the tokenizer and convert it into one object
 * per output column.
 *
 * Index/value token pairs are consumed until a closing '}' is seen; the
 * collected values are then distributed onto the output columns.
 *
 * @param tokenizer Tokenizer to read from
 * @param targ Target (output) column of each input attribute
 * @param dimsize Number of dimensions per output column
 * @param elkitypes ELKI type information per output column
 * @param metaLength Number of output columns
 * @return Array with one entry per output column
 * @throws IOException passed on from reading tokens
 * @throws AbortException on unexpected tokens, duplicate or out-of-range
 *         attribute indexes, missing external id / class label, or
 *         unsupported column types
 */
private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
  // Values by input attribute index: Double for number vector columns,
  // the raw String otherwise.
  Int2ObjectOpenHashMap<Object> map = new Int2ObjectOpenHashMap<>();
  while (true) {
    nextToken(tokenizer);
    assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
    if (tokenizer.ttype == '}') {
      nextToken(tokenizer);
      assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL);
      break;
    }
    // sparse token: attribute index first
    if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
      throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
    }
    int dim = ParseUtil.parseIntBase10(tokenizer.sval);
    // The index comes straight from the input file; reject values that would
    // otherwise fail with an ArrayIndexOutOfBoundsException on targ[dim].
    if (dim < 0 || dim >= targ.length) {
      throw new AbortException("Attribute index out of range in sparse vector: " + tokenizer.toString());
    }
    if (map.containsKey(dim)) {
      throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
    }
    // ... followed by the value
    nextToken(tokenizer);
    if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
      throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
    }
    map.put(dim, //
        TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]]) ? (Double) ParseUtil.parseDouble(tokenizer.sval) : tokenizer.sval);
  }
  Object[] data = new Object[metaLength];
  for (int out = 0; out < metaLength; out++) {
    // Find the first input attribute that maps to this output column
    int s = -1;
    for (int i = 0; i < targ.length; i++) {
      if (targ[i] == out) {
        s = i;
        break;
      }
    }
    assert (s >= 0);
    if (TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
      // Copy all values in [s, s + dimsize[out]) into a vector-local map;
      // iteration order is irrelevant here.
      Int2DoubleOpenHashMap f = new Int2DoubleOpenHashMap(dimsize[out]);
      for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
        Int2ObjectMap.Entry<Object> entry = iter.next();
        int i = entry.getIntKey();
        if (i < s || i >= s + dimsize[out]) {
          continue;
        }
        double v = ((Double) entry.getValue()).doubleValue();
        f.put(i - s, v);
      }
      data[out] = new SparseDoubleVector(f, dimsize[out]);
    } else if (TypeUtil.LABELLIST.equals(elkitypes[out])) {
      // Build a label list out of successive labels. Walk the index range in
      // ascending order instead of iterating the hash map: hash iteration is
      // not ordered by key, so stopping at the first key beyond the range
      // could drop labels and would add them in arbitrary order.
      labels.clear();
      for (int i = s, end = s + dimsize[out]; i < end; i++) {
        Object val = map.get(i);
        if (val == null) {
          continue; // attribute not present in this sparse instance
        }
        if (labels.size() < i - s) {
          LOG.warning("Sparse consecutive labels are currently not correctly supported.");
        }
        labels.add((String) val);
      }
      data[out] = LabelList.make(labels);
    } else if (TypeUtil.EXTERNALID.equals(elkitypes[out])) {
      String val = (String) map.get(s);
      if (val == null) {
        throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
      }
      data[out] = new ExternalID(val);
    } else if (TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
      Object val = map.get(s);
      if (val == null) {
        throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString());
      }
      // TODO: support other class label types.
      ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
      data[out] = lbl;
    } else {
      throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null"));
    }
  }
  return data;
}
Use of de.lmu.ifi.dbs.elki.data.SparseDoubleVector in project elki by elki-project.
Class TermFrequencyParserTest, method testDBLPData.
/**
 * Load the DBLP test data with the term frequency parser and compare the
 * dense Euclidean, sparse Euclidean and arc cosine distances of the first
 * three vectors against reference values.
 *
 * @throws IOException on errors opening or closing the test data
 */
@Test
public void testDBLPData() throws IOException {
  // try-with-resources: the stream is closed even when an assertion fails;
  // nothing else in this test closes it.
  try (InputStream is = AbstractSimpleAlgorithmTest.open(DBLP_DATA)) {
    // Setup parser and data loading
    TermFrequencyParser<SparseDoubleVector> parser = new TermFrequencyParser<>(false, SparseDoubleVector.FACTORY);
    InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, null, parser);
    ListParameterization config = new ListParameterization();
    config.addParameter(AbstractDatabase.Parameterizer.DATABASE_CONNECTION_ID, dbc);
    Database db = ClassGenericsUtil.parameterizeOrAbort(StaticArrayDatabase.class, config);
    if (config.hasUnusedParameters()) {
      fail("Unused parameters: " + config.getRemainingParameters());
    }
    if (config.hasErrors()) {
      config.logAndClearReportedErrors();
      fail("Parameterization errors.");
    }
    db.initialize();
    Relation<SparseNumberVector> rel = db.getRelation(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH);
    // Get first three objects:
    DBIDIter iter = rel.iterDBIDs();
    SparseNumberVector v1 = rel.get(iter);
    iter.advance();
    SparseNumberVector v2 = rel.get(iter);
    iter.advance();
    SparseNumberVector v3 = rel.get(iter);
    // "Dense" euclidean distance:
    double euclid1_12 = EuclideanDistanceFunction.STATIC.distance(v1, v2);
    double euclid1_13 = EuclideanDistanceFunction.STATIC.distance(v1, v3);
    double euclid1_23 = EuclideanDistanceFunction.STATIC.distance(v2, v3);
    double euclid1_21 = EuclideanDistanceFunction.STATIC.distance(v2, v1);
    // Sparse euclidean distance:
    double euclid2_12 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v2);
    double euclid2_13 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v3);
    double euclid2_23 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v3);
    double euclid2_21 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v1);
    // (Auto-switching) angular distance:
    double arccos_12 = ArcCosineDistanceFunction.STATIC.distance(v1, v2);
    double arccos_13 = ArcCosineDistanceFunction.STATIC.distance(v1, v3);
    double arccos_23 = ArcCosineDistanceFunction.STATIC.distance(v2, v3);
    double arccos_21 = ArcCosineDistanceFunction.STATIC.distance(v2, v1);
    // Self-distances must be zero:
    assertEquals("Euclidean self-distance is not 0.", 0., EuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
    assertEquals("Sparse Euclidean self-distance is not 0.", 0., SparseEuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
    assertEquals("Arccos self-distance is not 0.", 0., ArcCosineDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
    // Symmetry:
    assertEquals("Euclidean distance not symmetric.", euclid1_12, euclid1_21, Double.MIN_VALUE);
    assertEquals("Sparse Euclidean distance not symmetric.", euclid2_12, euclid2_21, Double.MIN_VALUE);
    assertEquals("Arccos distance not symmetric.", arccos_12, arccos_21, Double.MIN_VALUE);
    // Reference values. NOTE: the 1e-20 delta is below the double resolution
    // at these magnitudes, so these are effectively exact comparisons.
    assertEquals("Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid1_12, 1e-20);
    assertEquals("Sparse Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid2_12, 1e-20);
    assertEquals("Arccos distance 1-2 not as expected.", 0.1901934493141418, arccos_12, 1e-20);
    assertEquals("Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid1_13, 1e-20);
    assertEquals("Sparse Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid2_13, 1e-20);
    assertEquals("Arccos distance 1-3 not as expected.", 0.18654347641726046, arccos_13, 1e-20);
    assertEquals("Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid1_23, 1e-20);
    assertEquals("Sparse Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid2_23, 1e-20);
    assertEquals("Arccos distance 2-3 not as expected.", 0.11138352337990569, arccos_23, 1e-20);
  }
}
Aggregations