Use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the project elki by elki-project.
From the class ShuffleObjectsFilterTest, the method defaultParameters.
/**
 * Test with default parameters.
 *
 * Reads the same file once through a default-built ShuffleObjectsFilter and
 * once without any filter. The test then checks that both bundles agree in
 * dimensionality and length, that the first vector component of the
 * unfiltered rows is in ascending order, and that the filtered rows contain
 * at least one descent between neighboring rows.
 */
@Test
public void defaultParameters() {
  String filename = UNITTEST + "sorted-data-1.csv";
  ShuffleObjectsFilter filter = new ELKIBuilder<>(ShuffleObjectsFilter.class).build();
  MultipleObjectsBundle filteredBundle = readBundle(filename, filter);
  // Load the test data again without a filter.
  MultipleObjectsBundle unfilteredBundle = readBundle(filename);
  // Ensure the first column are the vectors.
  assertEquals("Dimensionality", getFieldDimensionality(unfilteredBundle, 0, TypeUtil.NUMBER_VECTOR_FIELD), getFieldDimensionality(filteredBundle, 0, TypeUtil.NUMBER_VECTOR_FIELD));
  assertEquals("Length changed", unfilteredBundle.dataLength(), filteredBundle.dataLength());
  // Verify that the elements of the unfiltered bundle are in sorted order.
  double prev = get(unfilteredBundle, 0, 0, DoubleVector.class).doubleValue(0);
  for (int row = 1; row < unfilteredBundle.dataLength(); row++) {
    final double next = get(unfilteredBundle, row, 0, DoubleVector.class).doubleValue(0);
    assertTrue("Values are expected to be in sorted order", prev <= next);
    prev = next;
  }
  // Verify that the elements of the filtered bundle are not in sorted order.
  // By verifying this, we can ascertain that the vectors have been shuffled.
  prev = get(filteredBundle, 0, 0, DoubleVector.class).doubleValue(0);
  boolean shuffled = false;
  for (int row = 1; row < filteredBundle.dataLength(); row++) {
    final double next = get(filteredBundle, row, 0, DoubleVector.class).doubleValue(0);
    if (prev > next) {
      shuffled = true;
      break;
    }
    // Advance the reference value so that neighboring rows are compared.
    // Without this, every row is compared against row 0 only, and a shuffle
    // that happens to put the smallest value first would go undetected.
    prev = next;
  }
  assertTrue("Elements are not shuffled.", shuffled);
}
Use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the project elki by elki-project.
From the class ShuffleObjectsFilter, the method filter.
/**
 * Build a new bundle that holds the rows of the given bundle in a random
 * order. The same permutation is applied to every column, so the values of a
 * row stay together. The input bundle itself is not modified.
 *
 * @param objects bundle to shuffle
 * @return new bundle with the same column metadata and permuted rows
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
  if (LOG.isDebugging()) {
    LOG.debug("Shuffling the data set");
  }
  final Random random = rnd.getSingleThreadedRandom();
  final int size = objects.dataLength();

  // Start from the identity permutation.
  final int[] permutation = new int[size];
  for (int pos = 0; pos < size; pos++) {
    permutation[pos] = pos;
  }

  // Shrink the unshuffled prefix from the back: draw a random position inside
  // the prefix and exchange it with the last position of the prefix.
  int remaining = size;
  while (remaining > 1) {
    final int pick = random.nextInt(remaining);
    remaining--;
    final int held = permutation[pick];
    permutation[pick] = permutation[remaining];
    permutation[remaining] = held;
  }

  // Copy every column in permuted row order.
  final MultipleObjectsBundle shuffledBundle = new MultipleObjectsBundle();
  final int columns = objects.metaLength();
  for (int col = 0; col < columns; col++) {
    final List<?> source = objects.getColumn(col);
    final List<Object> reordered = new ArrayList<>(size);
    for (int sourceRow : permutation) {
      reordered.add(source.get(sourceRow));
    }
    shuffledBundle.appendColumn(objects.meta(col), reordered);
  }
  return shuffledBundle;
}
Use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the project elki by elki-project.
From the class AbstractSupervisedProjectionVectorFilter, the method filter.
/**
 * Project all number vector columns of the bundle with a projection matrix
 * that is computed from the column and the first class label column found.
 *
 * The input bundle is returned unchanged when it is empty, when it has no
 * class label column, or when no column could be projected. Otherwise a new
 * bundle is returned that contains all columns in their original order;
 * successfully projected columns carry the converted type information.
 * Projected vectors are written into the existing column list, so the
 * column of the input bundle is modified as well.
 *
 * Note that {@code tdim} is lowered permanently when a column has fewer
 * dimensions than requested; the reduced value is kept for later columns.
 *
 * @param objects bundle to process
 * @return bundle with projected vector columns, or {@code objects} itself
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
  final int dataLength = objects.dataLength();
  if (dataLength == 0) {
    return objects;
  }
  List<? extends ClassLabel> classcolumn = null;
  // First of all, identify a class label column.
  for (int r = 0; r < objects.metaLength(); r++) {
    if (TypeUtil.CLASSLABEL.isAssignableFromType(objects.meta(r))) {
      @SuppressWarnings("unchecked") final List<? extends ClassLabel> castcolumn = (List<? extends ClassLabel>) objects.getColumn(r);
      classcolumn = castcolumn;
      break;
    }
  }
  if (classcolumn == null) {
    getLogger().warning("No class label column found (try " + ClassLabelFilter.class.getSimpleName() + ") -- cannot run " + this.getClass().getSimpleName());
    return objects;
  }
  boolean somesuccess = false;
  MultipleObjectsBundle bundle = new MultipleObjectsBundle();
  // Secondly, look for columns to train the projection on.
  for (int r = 0; r < objects.metaLength(); r++) {
    SimpleTypeInformation<?> type = objects.meta(r);
    List<?> column = objects.getColumn(r);
    if (!TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(type)) {
      // Not a vector field: pass the column through untouched.
      bundle.appendColumn(type, column);
      continue;
    }
    @SuppressWarnings("unchecked") List<V> vectorcolumn = (List<V>) column;
    final VectorFieldTypeInformation<?> vtype = (VectorFieldTypeInformation<?>) type;
    @SuppressWarnings("unchecked") NumberVector.Factory<V> factory = (NumberVector.Factory<V>) vtype.getFactory();
    int dim = vtype.getDimensionality();
    if (tdim > dim) {
      if (getLogger().isVerbose()) {
        getLogger().verbose("Setting projection dimension to original dimension: projection dimension: " + tdim + " larger than original dimension: " + dim);
      }
      tdim = dim;
    }
    try {
      double[][] proj = computeProjectionMatrix(vectorcolumn, classcolumn, dim);
      // Stage all projected coordinates before touching the column. If the
      // matrix product fails for some row, the column is still completely
      // unprojected, which is what the error message below promises.
      final double[][] projected = new double[dataLength][];
      for (int i = 0; i < dataLength; i++) {
        projected[i] = times(proj, vectorcolumn.get(i).toArray());
      }
      // Write back in place; the column list is shared with the input bundle.
      for (int i = 0; i < dataLength; i++) {
        vectorcolumn.set(i, factory.newNumberVector(projected[i]));
      }
      bundle.appendColumn(convertedType(type, factory), column);
      somesuccess = true;
    } catch (Exception e) {
      getLogger().error("Projection failed -- continuing with unprojected data!", e);
      bundle.appendColumn(type, column);
    }
  }
  if (!somesuccess) {
    getLogger().warning("No vector field of fixed dimensionality found.");
    return objects;
  }
  return bundle;
}
Use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the project elki by elki-project.
From the class ConcatenateFilesDatabaseConnection, the method loadData.
/**
 * Parse all configured files and concatenate their objects into one bundle.
 *
 * The first column of the result holds the path of the file each object was
 * read from; the columns delivered by the parser follow, shifted by one.
 * Columns of later files must be assignable to the columns already present,
 * otherwise loading is aborted. After all files have been read, the parser
 * is cleaned up and the bundle filters are applied.
 *
 * @return the concatenated bundle, after applying the bundle filters
 * @throws AbortException if a file cannot be read, if the columns of the
 *         files are incompatible, or if a stream delivers an object before
 *         any column metadata
 */
@Override
public MultipleObjectsBundle loadData() {
  MultipleObjectsBundle objects = new MultipleObjectsBundle();
  // Column 0: name of the file the object originates from.
  objects.appendColumn(TypeUtil.STRING, new ArrayList<>());
  for (File file : files) {
    String filestr = file.getPath();
    try (InputStream inputStream = //
    FileUtil.tryGzipInput(new BufferedInputStream(new FileInputStream(file)))) {
      final BundleStreamSource source;
      if (parser instanceof StreamingParser) {
        final StreamingParser streamParser = (StreamingParser) parser;
        streamParser.initStream(inputStream);
        source = streamParser;
      } else {
        MultipleObjectsBundle parsingResult = parser.parse(inputStream);
        // normalize objects and transform labels
        source = parsingResult.asStream();
      }
      // Stays null until the stream announces its columns.
      BundleMeta meta = null;
      loop: for (Event e = source.nextEvent(); ; e = source.nextEvent()) {
        switch(e) {
          case END_OF_STREAM:
            break loop;
          case META_CHANGED:
            meta = source.getMeta();
            for (int i = 0; i < meta.size(); i++) {
              // Stream column i maps to bundle column i + 1.
              if (i + 1 >= objects.metaLength()) {
                objects.appendColumn(meta.get(i), new ArrayList<>());
              } else {
                // Ensure compatibility:
                if (!objects.meta(i + 1).isAssignableFromType(meta.get(i))) {
                  throw new AbortException("Incompatible files loaded. Cannot concatenate with unaligned columns, please preprocess manually.");
                }
              }
            }
            // switch
            break;
          case NEXT_OBJECT:
            if (meta == null) {
              // Report the broken stream explicitly instead of failing with a
              // NullPointerException on meta.size() below.
              throw new AbortException("Invalid stream in file " + filestr + ": object encountered before any column metadata.");
            }
            Object[] o = new Object[objects.metaLength()];
            o[0] = filestr;
            for (int i = 0; i < meta.size(); i++) {
              o[i + 1] = source.data(i);
            }
            objects.appendSimple(o);
            // switch
            break;
        }
      }
    } catch (IOException e) {
      throw new AbortException("Loading file " + filestr + " failed: " + e.toString(), e);
    }
  }
  parser.cleanup();
  // Invoke filters
  if (LOG.isDebugging()) {
    LOG.debugFine("Invoking filters.");
  }
  return invokeBundleFilters(objects);
}
Use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the project elki by elki-project.
From the class LabelJoinDatabaseConnection, the method loadData.
/**
 * Load all sources and join them row-wise on their label columns.
 *
 * The bundle of the first source is the join target. Its labels are indexed
 * by row; for every further source, each non-label column is appended to the
 * target (pre-filled with nulls) and filled at the row whose label matches.
 * Rows without a label, labels without a match, duplicate labels and nulls
 * remaining in the joined data are reported as warnings only.
 *
 * @return the bundle of the first source, extended by the joined columns
 * @throws AbortException if one of the sources has no label column
 */
@Override
public MultipleObjectsBundle loadData() {
  final List<MultipleObjectsBundle> loaded = new ArrayList<>(sources.size());
  for (DatabaseConnection connection : sources) {
    loaded.add(connection.loadData());
  }
  final MultipleObjectsBundle joined = loaded.get(0);
  final Object2IntOpenHashMap<String> rowOfLabel = new Object2IntOpenHashMap<>(joined.dataLength());
  rowOfLabel.defaultReturnValue(-1);

  // Index the labels of the first bundle by row number.
  final int firstLabelColumn = FilterUtil.findLabelColumn(joined);
  if (firstLabelColumn == -1) {
    throw new AbortException("No label column found in first source, cannot join (do you want to use " + ExternalIDJoinDatabaseConnection.class.getSimpleName() + " instead?)");
  }
  for (int r = 0; r < joined.dataLength(); r++) {
    final Object label = joined.data(r, firstLabelColumn);
    if (label == null) {
      LOG.warning("Object without label encountered.");
    } else if (label instanceof LabelList) {
      // Every label of the list points to this row.
      final LabelList labels = (LabelList) label;
      for (int k = 0; k < labels.size(); k++) {
        final String key = labels.get(k);
        final int previous = rowOfLabel.put(key, r);
        if (previous != -1) {
          LOG.warning("Duplicate label encountered: " + key + " in rows " + previous + " and " + r);
        }
      }
    } else {
      // Strings are their own string representation, so a single branch
      // serves both plain strings and arbitrary label objects.
      final String key = label.toString();
      final int previous = rowOfLabel.put(key, r);
      if (previous != -1) {
        LOG.warning("Duplicate label encountered: " + key + " in rows " + previous + " and " + r);
      }
    }
  }

  // Join the remaining sources into the first bundle.
  for (int s = 1; s < sources.size(); s++) {
    final MultipleObjectsBundle other = loaded.get(s);
    final int labelColumn = FilterUtil.findLabelColumn(other);
    if (labelColumn == -1) {
      throw new AbortException("No label column found in source " + (s + 1) + ", cannot join (do you want to use " + ExternalIDJoinDatabaseConnection.class.getSimpleName() + " instead?)");
    }
    // One destination per source column; the label column gets a null slot.
    final List<ArrayList<Object>> destinations = new ArrayList<>(other.metaLength());
    for (int col = 0; col < other.metaLength(); col++) {
      if (col == labelColumn) {
        destinations.add(null);
      } else {
        final ArrayList<Object> fresh = new ArrayList<>(joined.dataLength());
        // Pre-fill with nulls, so unmatched rows remain null.
        for (int filled = 0; filled < joined.dataLength(); filled++) {
          fresh.add(null);
        }
        joined.appendColumn(other.meta(col), fresh);
        destinations.add(fresh);
      }
    }
    for (int r = 0; r < other.dataLength(); r++) {
      final Object label = other.data(r, labelColumn);
      if (label == null) {
        LOG.warning("Object without label encountered.");
        continue;
      }
      int target = -1;
      if (label instanceof LabelList) {
        // Use the first label of the list that is known.
        final LabelList labels = (LabelList) label;
        for (int k = 0; k < labels.size() && target < 0; k++) {
          target = rowOfLabel.getInt(labels.get(k));
        }
      } else {
        target = rowOfLabel.getInt(label.toString());
      }
      if (target < 0) {
        LOG.warning("Label not found for join: " + label + " in row " + r);
        continue;
      }
      for (int col = 0; col < other.metaLength(); col++) {
        if (col != labelColumn) {
          final List<Object> destination = destinations.get(col);
          assert (destination != null);
          destination.set(target, other.data(r, col));
        }
      }
    }
  }

  // Report each row that still contains a null, once per row.
  for (int r = 0; r < joined.dataLength(); r++) {
    for (int col = 0; col < joined.metaLength(); col++) {
      if (joined.data(r, col) != null) {
        continue;
      }
      final StringBuilder rowText = new StringBuilder();
      for (int c2 = 0; c2 < joined.metaLength(); c2++) {
        if (rowText.length() > 0) {
          rowText.append(", ");
        }
        final Object cell = joined.data(r, c2);
        if (cell != null) {
          rowText.append(cell);
        } else {
          rowText.append("null");
        }
      }
      LOG.warning("null value in joined data, row " + r + " column " + col + FormatUtil.NEWLINE + "[" + rowText.toString() + "]");
      break;
    }
  }
  return joined;
}
Aggregations