use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.
the class ExternalIDFilter method filter.
/**
 * Split the first label-list column of a bundle into an external ID column
 * and a column holding the remaining labels.
 *
 * Columns whose restriction class is not {@code LabelList}, and any
 * label-list column after the first one, are copied over unchanged. For the
 * first label-list column, the label at position {@code externalIdIndex} of
 * each row becomes the row's {@code ExternalID}; a negative index is counted
 * from the end of the label list (-1 is the last label). Rows with a
 * {@code null} label list yield {@code null} in both resulting columns.
 *
 * @param objects input bundle
 * @return new bundle with the external ID column inserted
 */
@Override
public MultipleObjectsBundle filter(MultipleObjectsBundle objects) {
  MultipleObjectsBundle bundle = new MultipleObjectsBundle();
  // Find a labellist column
  boolean done = false;
  boolean keeplabelcol = false;
  for (int i = 0; i < objects.metaLength(); i++) {
    SimpleTypeInformation<?> meta = objects.meta(i);
    // Skip non-labellist columns - or if we already had a labellist
    if (done || !LabelList.class.equals(meta.getRestrictionClass())) {
      bundle.appendColumn(meta, objects.getColumn(i));
      continue;
    }
    done = true;
    // We split the label column into two parts
    List<ExternalID> eidcol = new ArrayList<>(objects.dataLength());
    List<LabelList> lblcol = new ArrayList<>(objects.dataLength());
    // Scratch buffer for the labels remaining after the ID was removed;
    // reused for every row.
    ArrayList<String> lbuf = new ArrayList<>();
    for (Object obj : objects.getColumn(i)) {
      if (obj != null) {
        LabelList ll = (LabelList) obj;
        // Negative indexes address labels from the end of the list, so the
        // (negative) index must be added to the size. Subtracting it would
        // always point past the last label.
        int off = externalIdIndex >= 0 ? externalIdIndex : (ll.size() + externalIdIndex);
        eidcol.add(new ExternalID(ll.get(off)));
        lbuf.clear();
        for (int j = 0; j < ll.size(); j++) {
          if (j == off) {
            continue;
          }
          lbuf.add(ll.get(j));
        }
        lblcol.add(LabelList.make(lbuf));
        // NOTE(review): this tests the original list, not the remaining
        // labels, so the label column is kept as soon as one non-null row
        // exists, even if every remaining label list is empty. Confirm
        // whether the check was meant to be on lbuf instead.
        if (ll.size() > 0) {
          keeplabelcol = true;
        }
      } else {
        eidcol.add(null);
        lblcol.add(null);
      }
    }
    bundle.appendColumn(TypeUtil.EXTERNALID, eidcol);
    // Only add the label column when it's not empty.
    if (keeplabelcol) {
      bundle.appendColumn(meta, lblcol);
    }
  }
  return bundle;
}
use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.
the class ArffParser method parse.
/**
 * Parse an ARFF stream into a bundle.
 *
 * The header and attribute statements are read first and converted into a
 * column mapping. Afterwards, every instance is loaded either as dense or as
 * sparse record; the first instance decides which of the two formats the
 * bundle headers are set up for.
 *
 * @param instream stream to read from
 * @return bundle with all parsed instances
 * @throws AbortException on IO errors, or when dense and sparse instances
 *         are mixed
 */
@Override
public MultipleObjectsBundle parse(InputStream instream) {
  try (InputStreamReader ir = new InputStreamReader(instream);
      BufferedReader br = new BufferedReader(ir)) {
    ArrayList<String> names = new ArrayList<>();
    ArrayList<String> types = new ArrayList<>();
    readHeader(br);
    parseAttributeStatements(br, names, types);

    // Derive the column mapping; the arrays are filled by the helper.
    final int numAttributes = names.size();
    int[] targ = new int[numAttributes];
    TypeInformation[] elkitypes = new TypeInformation[numAttributes];
    int[] dimsize = new int[numAttributes];
    processColumnTypes(names, types, targ, elkitypes, dimsize);

    // The bundle headers are only set up once the first instance is seen,
    // because they depend on the instance format.
    MultipleObjectsBundle bundle = new MultipleObjectsBundle();
    StreamTokenizer tokenizer = makeArffTokenizer(br);
    // null: undecided; TRUE: sparse instances; FALSE: dense instances.
    Boolean sparseFormat = null;
    for (nextToken(tokenizer); tokenizer.ttype != StreamTokenizer.TT_EOF; nextToken(tokenizer)) {
      if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
        // ignore empty lines
        continue;
      }
      // Sparse instances are introduced by an opening brace.
      final boolean sparse = tokenizer.ttype == '{';
      if (sparseFormat == null) {
        setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, sparse);
        sparseFormat = sparse;
      } else if (sparseFormat.booleanValue() != sparse) {
        throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
      }
      if (sparse) {
        bundle.appendSimple(loadSparseInstance(tokenizer, targ, dimsize, elkitypes, bundle.metaLength()));
      } else {
        bundle.appendSimple(loadDenseInstance(tokenizer, dimsize, elkitypes, bundle.metaLength()));
      }
    }
    return bundle;
  } catch (IOException e) {
    throw new AbortException("IO error in parser", e);
  }
}
use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.
the class ReplaceNaNWithRandomFilterTest method parameters.
/**
 * Test with standard normal distribution as parameter.
 *
 * The test file is loaded once through the filter and once without it. Every
 * position that holds a NaN in the unfiltered data must hold a non-NaN value
 * in the filtered data, and both bundles must have the same dimensionality.
 */
@Test
public void parameters() {
  String filename = UNITTEST + "nan-test-1.csv";
  ReplaceNaNWithRandomFilter filter = //
      new ELKIBuilder<>(ReplaceNaNWithRandomFilter.class).with(//
          ReplaceNaNWithRandomFilter.Parameterizer.REPLACEMENT_DISTRIBUTION, new NormalDistribution(0, 1, new Random(0L))).build();
  MultipleObjectsBundle filteredBundle = readBundle(filename, filter);
  // Load the test data again without a filter.
  MultipleObjectsBundle unfilteredBundle = readBundle(filename);
  // Ensure the first column are the vectors.
  assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(filteredBundle.meta(0)));
  assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(unfilteredBundle.meta(0)));
  // This cast is now safe (vector field).
  // The filtered dimensionality must come from the filtered bundle,
  // otherwise the comparison below is trivially true.
  int dimFiltered = ((FieldTypeInformation) filteredBundle.meta(0)).getDimensionality();
  int dimUnfiltered = ((FieldTypeInformation) unfilteredBundle.meta(0)).getDimensionality();
  assertEquals("Dimensionality expected equal", dimFiltered, dimUnfiltered);
  // Note the (row, column) indices of the NaN(s) in the data.
  List<IntegerVector> nanPositions = new ArrayList<>();
  for (int row = 0; row < unfilteredBundle.dataLength(); row++) {
    Object obj = unfilteredBundle.data(row, 0);
    assertEquals("Unexpected data type", DoubleVector.class, obj.getClass());
    DoubleVector d = (DoubleVector) obj;
    for (int col = 0; col < dimUnfiltered; col++) {
      final double v = d.doubleValue(col);
      if (Double.isNaN(v)) {
        nanPositions.add(new IntegerVector(new int[] { row, col }));
      }
    }
  }
  // Verify that at least a single NaN exists in the unfiltered bundle.
  assertTrue("NaN expected in unfiltered data", nanPositions.size() > 0);
  // Every recorded position must have been replaced in the filtered bundle.
  for (IntegerVector iv : nanPositions) {
    Object obj = filteredBundle.data(iv.intValue(0), 0);
    assertEquals("Unexpected data type", DoubleVector.class, obj.getClass());
    DoubleVector d = (DoubleVector) obj;
    final double v = d.doubleValue(iv.intValue(1));
    assertFalse("NaN not expected", Double.isNaN(v));
  }
}
use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.
the class AttributeWiseBetaNormalizationTest method parameters.
/**
 * Test with parameter p as alpha.
 *
 * After normalization, the fraction of finite values below the p-quantile of
 * a Beta(p, p) distribution is expected to be p in every attribute, up to a
 * tolerance of .05.
 */
@Test
public void parameters() {
  final double p = .88;
  final String filename = UNITTEST + "normally-distributed-data-1.csv";
  AttributeWiseBetaNormalization<DoubleVector> filter = new ELKIBuilder<AttributeWiseBetaNormalization<DoubleVector>>(AttributeWiseBetaNormalization.class) //
      .with(AttributeWiseBetaNormalization.Parameterizer.ALPHA_ID, p) //
      .with(AttributeWiseBetaNormalization.Parameterizer.DISTRIBUTIONS_ID, Arrays.asList(NormalMOMEstimator.STATIC, UniformMinMaxEstimator.STATIC)) //
      .build();
  MultipleObjectsBundle bundle = readBundle(filename, filter);
  int dim = getFieldDimensionality(bundle, 0, TypeUtil.NUMBER_VECTOR_FIELD);
  BetaDistribution dist = new BetaDistribution(p, p);
  final double quantile = dist.quantile(p);

  // Per attribute, count the finite values that fall below the quantile.
  int[] below = new int[dim];
  for (int r = 0; r < bundle.dataLength(); r++) {
    DoubleVector vec = get(bundle, r, 0, DoubleVector.class);
    for (int c = 0; c < dim; c++) {
      final double value = vec.doubleValue(c);
      // False for NaN and for both infinities.
      final boolean finite = value > Double.NEGATIVE_INFINITY && value < Double.POSITIVE_INFINITY;
      if (finite && value < quantile) {
        below[c]++;
      }
    }
  }

  // Compare the observed fraction of each attribute against p.
  for (int c = 0; c < dim; c++) {
    double fraction = below[c] / (double) bundle.dataLength();
    assertEquals("p% of the values should be under the quantile", p, fraction, .05);
  }
}
use of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in project elki by elki-project.
the class AttributeWiseMinMaxNormalizationTest method testNaNParameters.
/**
 * Test with default parameters and for correct handling of NaN and Inf.
 *
 * Only finite values are taken into account when verifying that each
 * attribute was scaled to the range 0 to 1.
 */
@Test
public void testNaNParameters() {
  final String filename = UNITTEST + "nan-test-1.csv";
  AttributeWiseMinMaxNormalization<DoubleVector> filter = //
      new ELKIBuilder<AttributeWiseMinMaxNormalization<DoubleVector>>(AttributeWiseMinMaxNormalization.class).build();
  MultipleObjectsBundle bundle = readBundle(filename, filter);

  // The first column must contain the vectors, then the cast is safe.
  SimpleTypeInformation<?> firstMeta = bundle.meta(0);
  assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(firstMeta));
  int dim = ((FieldTypeInformation) firstMeta).getDimensionality();

  // Collect the value range of every attribute.
  DoubleMinMax[] ranges = DoubleMinMax.newArray(dim);
  for (int r = 0; r < bundle.dataLength(); r++) {
    DoubleVector vec = get(bundle, r, 0, DoubleVector.class);
    for (int c = 0; c < dim; c++) {
      final double value = vec.doubleValue(c);
      // False for NaN and for both infinities.
      final boolean finite = value > Double.NEGATIVE_INFINITY && value < Double.POSITIVE_INFINITY;
      if (!finite) {
        continue;
      }
      ranges[c].put(value);
    }
  }

  // Minimum and maximum of each attribute must be exactly 0 and 1.
  for (int c = 0; c < dim; c++) {
    assertEquals("Minimum not as expected", 0., ranges[c].getMin(), 0.);
    assertEquals("Maximum not as expected", 1., ranges[c].getMax(), 0.);
  }
}
Aggregations