use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.
the class ExternalIDFilter method filter.
/**
 * Splits the first label list column of the bundle into an external ID column
 * and a column holding the remaining labels.
 *
 * All other columns (including any further label list columns) are copied
 * unchanged. For each row, the label at position {@code externalIdIndex}
 * becomes the external ID; a negative index counts from the end of the label
 * list ({@code -1} is the last label). Rows whose label list is {@code null}
 * yield {@code null} in both new columns. The remaining-labels column is only
 * appended if at least one row still has labels after the ID was removed.
 *
 * @param objects Input bundle
 * @return New bundle with the external ID column split off
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
  MultipleObjectsBundle bundle = new MultipleObjectsBundle();
  // Only the first label list column is split; "done" marks that it was seen.
  boolean done = false;
  boolean keeplabelcol = false;
  for (int i = 0; i < objects.metaLength(); i++) {
    SimpleTypeInformation<?> meta = objects.meta(i);
    // Copy non-labellist columns - and any labellist after the first - as is.
    if (done || !LabelList.class.equals(meta.getRestrictionClass())) {
      bundle.appendColumn(meta, objects.getColumn(i));
      continue;
    }
    done = true;
    // We split the label column into two parts
    List<ExternalID> eidcol = new ArrayList<>(objects.dataLength());
    List<LabelList> lblcol = new ArrayList<>(objects.dataLength());
    // Reusable buffer for the labels that remain after removing the ID.
    ArrayList<String> lbuf = new ArrayList<>();
    for (Object obj : objects.getColumn(i)) {
      if (obj == null) {
        eidcol.add(null);
        lblcol.add(null);
        continue;
      }
      LabelList ll = (LabelList) obj;
      // Negative indexes are relative to the end of the list, so they must be
      // added to the size (subtracting always produced an out-of-range index).
      int off = externalIdIndex >= 0 ? externalIdIndex : (ll.size() + externalIdIndex);
      eidcol.add(new ExternalID(ll.get(off)));
      lbuf.clear();
      for (int j = 0; j < ll.size(); j++) {
        if (j != off) {
          lbuf.add(ll.get(j));
        }
      }
      lblcol.add(LabelList.make(lbuf));
      // Judge emptiness on what is left, not on the list that contained the ID.
      if (!lbuf.isEmpty()) {
        keeplabelcol = true;
      }
    }
    bundle.appendColumn(TypeUtil.EXTERNALID, eidcol);
    // Only add the label column when it's not empty.
    if (keeplabelcol) {
      bundle.appendColumn(meta, lblcol);
    }
  }
  return bundle;
}
use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.
the class ArffParser method loadDenseInstance.
/**
 * Reads one dense instance from the tokenizer, starting at its current token.
 *
 * For every output column the matching number of tokens is consumed: vector
 * columns read {@code dimsize[col]} numbers ({@code ?} becomes NaN), label
 * list columns read {@code dimsize[col]} words, external ID and class label
 * columns read a single word each.
 *
 * @param tokenizer Tokenizer positioned on the first value of the instance
 * @param dimsize Number of input tokens per output column
 * @param etyp Type information per output column
 * @param outdim Number of output columns
 * @return One object per output column
 * @throws IOException on read errors of the underlying tokenizer
 */
private Object[] loadDenseInstance(StreamTokenizer tokenizer, int[] dimsize, TypeInformation[] etyp, int outdim) throws IOException {
  final Object[] row = new Object[outdim];
  for (int col = 0; col < outdim; col++) {
    final TypeInformation type = etyp[col];

    if (TypeUtil.NUMBER_VECTOR_FIELD.equals(type)) {
      // A vector column spans several successive input tokens.
      final double[] values = new double[dimsize[col]];
      int k = 0;
      while (k < dimsize[col]) {
        final int tt = tokenizer.ttype;
        if (tt == '?') {
          // Missing value marker.
          values[k] = Double.NaN;
        } else if (tt != StreamTokenizer.TT_WORD) {
          throw new AbortException("Expected word token, got: " + tokenizer.toString());
        } else {
          try {
            values[k] = ParseUtil.parseDouble(tokenizer.sval);
          } catch (NumberFormatException e) {
            throw new AbortException("Expected number value, got: " + tokenizer.sval);
          }
        }
        nextToken(tokenizer);
        k++;
      }
      row[col] = denseFactory.newNumberVector(values);
      continue;
    }

    if (TypeUtil.LABELLIST.equals(type)) {
      // Collect successive words into the shared label buffer.
      labels.clear();
      int k = 0;
      while (k < dimsize[col]) {
        if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
          throw new AbortException("Expected word token, got: " + tokenizer.toString());
        }
        labels.add(tokenizer.sval);
        nextToken(tokenizer);
        k++;
      }
      row[col] = LabelList.make(labels);
      continue;
    }

    if (TypeUtil.EXTERNALID.equals(type)) {
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        throw new AbortException("Expected word token, got: " + tokenizer.toString());
      }
      row[col] = new ExternalID(tokenizer.sval);
      nextToken(tokenizer);
      continue;
    }

    if (TypeUtil.CLASSLABEL.equals(type)) {
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        throw new AbortException("Expected word token, got: " + tokenizer.toString());
      }
      // TODO: support other class label types.
      row[col] = new SimpleClassLabel(tokenizer.sval);
      nextToken(tokenizer);
      continue;
    }

    // None of the supported column types matched.
    final String typename = (type != null) ? type.toString() : "null";
    throw new AbortException("Unsupported type for column " + "->" + col + ": " + typename);
  }
  return row;
}
use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.
the class ArffParser method setupBundleHeaders.
/**
 * Setup the headers for the object bundle.
 *
 * Consecutive input attributes that share the same target in {@code targ} are
 * merged into one output column. Attribute names belong to the input
 * attributes, so they are looked up by the input index {@code in}, not by the
 * output column index {@code out} (the two diverge as soon as one output
 * column covers more than one attribute).
 *
 * @param names Attribute names
 * @param targ Target columns
 * @param etyp ELKI type information
 * @param dimsize Number of dimensions in the individual types
 * @param bundle Output bundle
 * @param sparse Flag to create sparse vectors
 */
private void setupBundleHeaders(ArrayList<String> names, int[] targ, TypeInformation[] etyp, int[] dimsize, MultipleObjectsBundle bundle, boolean sparse) {
  for (int in = 0, out = 0; in < targ.length; out++) {
    // Find the first input attribute that maps to a different target.
    int nin = in + 1;
    while (nin < targ.length && targ[nin] == targ[in]) {
      nin++;
    }
    if (TypeUtil.NUMBER_VECTOR_FIELD.equals(etyp[out])) {
      String[] labels = new String[dimsize[out]];
      // Collect labels:
      for (int i = 0; i < dimsize[out]; i++) {
        labels[i] = names.get(in + i);
      }
      if (!sparse) {
        VectorFieldTypeInformation<DoubleVector> type = new VectorFieldTypeInformation<>(DoubleVector.FACTORY, dimsize[out], labels);
        bundle.appendColumn(type, new ArrayList<DoubleVector>());
      } else {
        VectorFieldTypeInformation<SparseDoubleVector> type = new VectorFieldTypeInformation<>(SparseDoubleVector.FACTORY, dimsize[out], labels);
        bundle.appendColumn(type, new ArrayList<SparseDoubleVector>());
      }
    } else if (TypeUtil.LABELLIST.equals(etyp[out])) {
      // Column label is the space-separated list of the merged attribute names.
      StringBuilder label = new StringBuilder(names.get(in));
      for (int i = 1; i < dimsize[out]; i++) {
        label.append(' ').append(names.get(in + i));
      }
      bundle.appendColumn(new SimpleTypeInformation<>(LabelList.class, label.toString()), new ArrayList<LabelList>());
    } else if (TypeUtil.EXTERNALID.equals(etyp[out])) {
      bundle.appendColumn(new SimpleTypeInformation<>(ExternalID.class, names.get(in)), new ArrayList<ExternalID>());
    } else if (TypeUtil.CLASSLABEL.equals(etyp[out])) {
      bundle.appendColumn(new SimpleTypeInformation<>(ClassLabel.class, names.get(in)), new ArrayList<ClassLabel>());
    } else {
      throw new AbortException("Unsupported type for column " + in + "->" + out + ": " + ((etyp[out] != null) ? etyp[out].toString() : "null"));
    }
    // Exactly one column must have been appended per output index.
    assert (out == bundle.metaLength() - 1);
    in = nin;
  }
}
use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.
the class ArffParser method loadSparseInstance.
/**
 * Reads one sparse instance (index/value pairs up to the closing brace) and
 * converts it into one object per output column.
 *
 * The pairs are first collected into a map keyed by the input attribute
 * index; values of attributes that map to a number vector column are parsed
 * as doubles, all others are kept as strings. Then each output column is
 * assembled from the attributes in its index range.
 *
 * @param tokenizer Tokenizer positioned before the first sparse pair
 * @param targ Target output column for each input attribute
 * @param dimsize Number of input attributes per output column
 * @param elkitypes Type information per output column
 * @param metaLength Number of output columns
 * @return One object per output column
 * @throws IOException on read errors of the underlying tokenizer
 */
private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
  Int2ObjectOpenHashMap<Object> map = new Int2ObjectOpenHashMap<>();
  while (true) {
    nextToken(tokenizer);
    assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
    if (tokenizer.ttype == '}') {
      nextToken(tokenizer);
      assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL);
      break;
    }
    // sparse token
    if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
      throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
    }
    int dim = ParseUtil.parseIntBase10(tokenizer.sval);
    // Reject indexes that do not refer to a declared attribute; otherwise
    // the targ[dim] lookup below fails with a bare array index error.
    if (dim < 0 || dim >= targ.length) {
      throw new AbortException("Attribute index out of range in sparse instance: " + tokenizer.toString());
    }
    if (map.containsKey(dim)) {
      throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
    }
    nextToken(tokenizer);
    if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
      throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
    }
    map.put(dim, //
        TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]]) ? (Double) ParseUtil.parseDouble(tokenizer.sval) : tokenizer.sval);
  }
  Object[] data = new Object[metaLength];
  for (int out = 0; out < metaLength; out++) {
    // Find the first input attribute that maps to this output column.
    int s = -1;
    for (int i = 0; i < targ.length; i++) {
      if (targ[i] == out) {
        s = i;
        break;
      }
    }
    assert (s >= 0);
    if (TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
      Int2DoubleOpenHashMap f = new Int2DoubleOpenHashMap(dimsize[out]);
      // Iteration order is irrelevant here: entries go into another map.
      for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
        Int2ObjectMap.Entry<Object> entry = iter.next();
        int i = entry.getIntKey();
        if (i < s || i >= s + dimsize[out]) {
          continue;
        }
        double v = ((Double) entry.getValue()).doubleValue();
        f.put(i - s, v);
      }
      data[out] = new SparseDoubleVector(f, dimsize[out]);
    } else if (TypeUtil.LABELLIST.equals(elkitypes[out])) {
      // Build a label list out of successive labels. The hash map has no
      // defined iteration order, so walk the index range explicitly to keep
      // the labels in attribute order and to not miss any of them.
      labels.clear();
      for (int k = 0; k < dimsize[out]; k++) {
        Object val = map.get(s + k);
        if (val == null) {
          continue;
        }
        // A gap before this label shifts its position in the label list.
        if (labels.size() < k) {
          LOG.warning("Sparse consecutive labels are currently not correctly supported.");
        }
        labels.add((String) val);
      }
      data[out] = LabelList.make(labels);
    } else if (TypeUtil.EXTERNALID.equals(elkitypes[out])) {
      String val = (String) map.get(s);
      if (val == null) {
        throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
      }
      data[out] = new ExternalID(val);
    } else if (TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
      Object val = map.get(s);
      if (val == null) {
        throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString());
      }
      // TODO: support other class label types.
      ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
      data[out] = lbl;
    } else {
      throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null"));
    }
  }
  return data;
}
use of de.lmu.ifi.dbs.elki.data.ExternalID in project elki by elki-project.
the class ExternalIDJoinDatabaseConnection method loadData.
/**
 * Loads all sources and joins them on their external ID column.
 *
 * The first source is the primary one: its rows define the rows of the
 * result. Every non-ID column of each further source is appended to the
 * primary bundle, pre-filled with {@code null}, and then filled in for the
 * rows whose external ID also occurs in the primary source. Rows that still
 * contain a {@code null} afterwards are reported with a warning.
 *
 * @return The primary bundle, extended by the joined columns
 * @throws AbortException if there is no source, or a source has no external
 *         ID column
 */
@Override
public MultipleObjectsBundle loadData() {
  List<MultipleObjectsBundle> bundles = new ArrayList<>(sources.size());
  for (DatabaseConnection dbc : sources) {
    bundles.add(dbc.loadData());
  }
  // Fail with a clear message instead of an index error on bundles.get(0).
  if (bundles.isEmpty()) {
    throw new AbortException("No data sources given to join.");
  }
  MultipleObjectsBundle first = bundles.get(0);
  // Maps each external ID of the primary source to its row number.
  Object2IntOpenHashMap<ExternalID> labelmap = new Object2IntOpenHashMap<>(first.dataLength());
  labelmap.defaultReturnValue(-1);
  // Process first bundle
  {
    final int lblcol = findExternalIdColumn(first);
    if (lblcol == -1) {
      throw new AbortException("No external ID column found in primary source.");
    }
    for (int i = 0; i < first.dataLength(); i++) {
      ExternalID data = (ExternalID) first.data(i, lblcol);
      if (data == null) {
        LOG.debug("Object without ID encountered.");
        continue;
      }
      // On duplicates the later row replaces the earlier one in the map.
      int old = labelmap.put(data, i);
      if (old != -1) {
        LOG.debug("Duplicate id encountered: " + data + " in rows " + old + " and " + i);
      }
    }
  }
  // Process additional columns
  for (int c = 1; c < sources.size(); c++) {
    MultipleObjectsBundle cur = bundles.get(c);
    final int lblcol = findExternalIdColumn(cur);
    if (lblcol == -1) {
      StringBuilder buf = new StringBuilder();
      for (int i = 0; i < cur.metaLength(); i++) {
        if (buf.length() > 0) {
          buf.append(',');
        }
        buf.append(cur.meta(i));
      }
      throw new AbortException("No external ID column found in source " + (c + 1) + " to join with. Got: " + buf.toString());
    }
    // Destination columns, indexed like the columns of cur (null for the ID).
    List<ArrayList<Object>> dcol = new ArrayList<>(cur.metaLength());
    for (int i = 0; i < cur.metaLength(); i++) {
      // Skip the label columns
      if (i == lblcol) {
        dcol.add(null);
        continue;
      }
      ArrayList<Object> newcol = new ArrayList<>(first.dataLength());
      // Pre-fill with nulls.
      for (int j = 0; j < first.dataLength(); j++) {
        newcol.add(null);
      }
      first.appendColumn(cur.meta(i), newcol);
      dcol.add(newcol);
    }
    for (int i = 0; i < cur.dataLength(); i++) {
      ExternalID data = (ExternalID) cur.data(i, lblcol);
      if (data == null) {
        LOG.warning("Object without label encountered.");
        continue;
      }
      int row = labelmap.getInt(data);
      if (row == -1) {
        LOG.debug("ID not found for join: " + data + " in row " + i);
        continue;
      }
      for (int d = 0; d < cur.metaLength(); d++) {
        if (d == lblcol) {
          continue;
        }
        List<Object> col = dcol.get(d);
        assert (col != null);
        col.set(row, cur.data(i, d));
      }
    }
  }
  // Report every row that has at least one unfilled cell (once per row).
  for (int i = 0; i < first.dataLength(); i++) {
    for (int d = 0; d < first.metaLength(); d++) {
      if (first.data(i, d) == null) {
        StringBuilder buf = new StringBuilder();
        for (int d2 = 0; d2 < first.metaLength(); d2++) {
          if (buf.length() > 0) {
            buf.append(", ");
          }
          if (first.data(i, d2) == null) {
            buf.append("null");
          } else {
            buf.append(first.data(i, d2));
          }
        }
        LOG.warning("null value in joined data, row " + i + " column " + d + FormatUtil.NEWLINE + "[" + buf.toString() + "]");
        break;
      }
    }
  }
  return first;
}

/**
 * Finds the first column of a bundle that can hold external IDs.
 *
 * @param bundle Bundle to search
 * @return Index of the first external ID column, or {@code -1} if none
 */
private static int findExternalIdColumn(MultipleObjectsBundle bundle) {
  for (int i = 0; i < bundle.metaLength(); i++) {
    if (TypeUtil.EXTERNALID.isAssignableFromType(bundle.meta(i))) {
      return i;
    }
  }
  return -1;
}
Aggregations