Usage example of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the elki project (by elki-project):
the class ArffParserTest, method sparse().
/**
 * Unit test for reading a sparse ARFF file with the ArffParser.
 *
 * Checks the resulting bundle layout, the vector dimensionality, the default
 * value of omitted sparse entries, and the concrete classes of the parsed
 * objects.
 */
@Test
public void sparse() throws IOException {
String filename = UNITTEST + "parsertest.sparse.arff";
Parser parser = new ELKIBuilder<>(ArffParser.class).build();
MultipleObjectsBundle bundle;
// Load the test file; try-with-resources closes the stream and connection.
try (InputStream is = open(filename);
InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, null, parser)) {
bundle = dbc.loadData();
}
// Ensure that the parser has correctly formed the bundle:
// the first column must be a number vector field,
// the second column must be a class label.
assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(bundle.meta(0)));
assertTrue("Test file not as expected", TypeUtil.CLASSLABEL.isAssignableFromType(bundle.meta(1)));
assertEquals("Length", 2, bundle.dataLength());
assertEquals("Length", 4, ((NumberVector) bundle.data(0, 0)).getDimensionality());
// Sparse missing values are supposed to be 0.
NumberVector nv = (NumberVector) bundle.data(1, 0);
assertEquals("Not 0 for missing data", 0., nv.doubleValue(0), 0.);
assertEquals("Not 0 for missing data", 0., nv.doubleValue(2), 0.);
// Check the concrete classes produced for the vector and the class label.
assertEquals("Unexpected data type", SparseDoubleVector.class, bundle.data(0, 0).getClass());
assertEquals("Unexpected data type", SimpleClassLabel.class, bundle.data(0, 1).getClass());
}
Usage example of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the elki project (by elki-project):
the class VectorDimensionalityFilterTest, method parameters().
/**
 * Test the filter with parameter dim_keep set to the dimensionality that
 * vectors must have in order to be retained.
 */
@Test
public void parameters() {
final int dim_keep = 10;
String filename = UNITTEST + "dimensionality-test-2.csv";
VectorDimensionalityFilter<DoubleVector> filter = //
new ELKIBuilder<VectorDimensionalityFilter<DoubleVector>>(VectorDimensionalityFilter.class).with(VectorDimensionalityFilter.Parameterizer.DIM_P, dim_keep).build();
MultipleObjectsBundle filteredBundle = readBundle(filename, filter);
// Reload the same data without any filtering, for comparison.
MultipleObjectsBundle unfilteredBundle = readBundle(filename);
// The unfiltered data must contain at least one vector whose dimensionality
// differs from dim_keep; otherwise this test would be vacuous.
boolean mismatchSeen = false;
final int rows = unfilteredBundle.dataLength();
for (int i = 0; i < rows && !mismatchSeen; i++) {
Object obj = unfilteredBundle.data(i, 0);
assertEquals("Unexpected data type", DoubleVector.class, obj.getClass());
mismatchSeen = ((DoubleVector) obj).getDimensionality() != dim_keep;
}
assertTrue("Expected a vector with filterable dimensionality", mismatchSeen);
// The filter must have removed at least that vector.
assertTrue("Expected smaller data length", filteredBundle.dataLength() < unfilteredBundle.dataLength());
}
Usage example of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the elki project (by elki-project):
the class KNNBenchmarkAlgorithm, method run().
/**
 * Run the kNN benchmark: issue k nearest neighbor queries either for a random
 * sample of the database itself, or for an external query data set, and log a
 * result checksum plus mean/stddev statistics.
 *
 * @param database Database
 * @param relation Relation
 * @return Null result
 */
public Result run(Database database, Relation<O> relation) {
  // Get a distance and kNN query instance.
  DistanceQuery<O> distQuery = database.getDistanceQuery(relation, getDistanceFunction());
  KNNQuery<O> knnQuery = database.getKNNQuery(distQuery, k);
  if(queries == null) {
    // No query set - use a random sample of the original database.
    final DBIDs sample = DBIDUtil.randomSample(relation.getDBIDs(), sampling, random);
    FiniteProgress prog = LOG.isVeryVerbose() ? new FiniteProgress("kNN queries", sample.size(), LOG) : null;
    int hash = 0;
    MeanVariance mv = new MeanVariance(), mvdist = new MeanVariance();
    for(DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
      KNNList knns = knnQuery.getKNNForDBID(iditer, k);
      hash = Util.mixHashCodes(hash, checksum(knns));
      mv.put(knns.size());
      mvdist.put(knns.getKNNDistance());
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    logStatistics(hash, mv, mvdist);
  }
  else {
    // Separate query set: find a column compatible with the distance function.
    TypeInformation res = getDistanceFunction().getInputTypeRestriction();
    MultipleObjectsBundle bundle = queries.loadData();
    int col = -1;
    for(int i = 0; i < bundle.metaLength(); i++) {
      if(res.isAssignableFromType(bundle.meta(i))) {
        col = i;
        break;
      }
    }
    if(col < 0) {
      throw new IncompatibleDataException("No compatible data type in query input was found. Expected: " + res.toString());
    }
    // Random sampling is a bit of hack, sorry.
    // But currently, we don't (yet) have an "integer random sample" function.
    DBIDRange sids = DBIDUtil.generateStaticDBIDRange(bundle.dataLength());
    final DBIDs sample = DBIDUtil.randomSample(sids, sampling, random);
    FiniteProgress prog = LOG.isVeryVerbose() ? new FiniteProgress("kNN queries", sample.size(), LOG) : null;
    int hash = 0;
    MeanVariance mv = new MeanVariance(), mvdist = new MeanVariance();
    for(DBIDIter iditer = sample.iter(); iditer.valid(); iditer.advance()) {
      // Map the sampled DBID back to its row offset in the query bundle.
      int off = sids.binarySearch(iditer);
      assert (off >= 0);
      @SuppressWarnings("unchecked")
      O o = (O) bundle.data(off, col);
      KNNList knns = knnQuery.getKNNForObject(o, k);
      hash = Util.mixHashCodes(hash, checksum(knns));
      mv.put(knns.size());
      mvdist.put(knns.getKNNDistance());
      LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    logStatistics(hash, mv, mvdist);
  }
  return null;
}

/**
 * Sum the integer representations of all DBIDs in a result list, as an
 * order-independent per-query checksum.
 *
 * @param knns kNN result list
 * @return sum of the DBID integer values
 */
private int checksum(KNNList knns) {
  int ichecksum = 0;
  for(DBIDIter it = knns.iter(); it.valid(); it.advance()) {
    ichecksum += DBIDUtil.asInteger(it);
  }
  return ichecksum;
}

/**
 * Log the benchmark statistics: result hash code, mean result size, and (when
 * any distances were recorded) the mean k-distance.
 *
 * @param hash accumulated result hash code
 * @param mv statistics on result sizes
 * @param mvdist statistics on k-distances
 */
private void logStatistics(int hash, MeanVariance mv, MeanVariance mvdist) {
  if(LOG.isStatistics()) {
    LOG.statistics("Result hashcode: " + hash);
    LOG.statistics("Mean number of results: " + mv.getMean() + " +- " + mv.getNaiveStddev());
    if(mvdist.getCount() > 0) {
      LOG.statistics("Mean k-distance: " + mvdist.getMean() + " +- " + mvdist.getNaiveStddev());
    }
  }
}
Usage example of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the elki project (by elki-project):
the class ClassifierHoldoutEvaluationTask, method run().
/**
 * Run the holdout evaluation: load the data, then for each holdout partition
 * train the classifier on the training part, classify the test part, and
 * accumulate a confusion matrix over all folds.
 */
@Override
public void run() {
// Phase 1: load the input data and hand it to the holdout strategy.
Duration ptime = LOG.newDuration("evaluation.time.load").begin();
MultipleObjectsBundle allData = databaseConnection.loadData();
holdout.initialize(allData);
LOG.statistics(ptime.end());
Duration time = LOG.newDuration("evaluation.time.total").begin();
// Confusion matrix indexed as [predicted label][true label], using each
// label's position in the sorted label list.
ArrayList<ClassLabel> labels = holdout.getLabels();
int[][] confusion = new int[labels.size()][labels.size()];
for (int p = 0; p < holdout.numberOfPartitions(); p++) {
TrainingAndTestSet partition = holdout.nextPartitioning();
// Load the data set into a database structure (for indexing)
Duration dur = LOG.newDuration(this.getClass().getName() + ".fold-" + (p + 1) + ".init.time").begin();
Database db = new StaticArrayDatabase(new MultipleObjectsBundleDatabaseConnection(partition.getTraining()), indexFactories);
db.initialize();
LOG.statistics(dur.end());
// Train the classifier
dur = LOG.newDuration(this.getClass().getName() + ".fold-" + (p + 1) + ".train.time").begin();
Relation<ClassLabel> lrel = db.getRelation(TypeUtil.CLASSLABEL);
algorithm.buildClassifier(db, lrel);
LOG.statistics(dur.end());
// Evaluate the test set
dur = LOG.newDuration(this.getClass().getName() + ".fold-" + (p + 1) + ".evaluation.time").begin();
// FIXME: this part is still a big hack, unfortunately!
MultipleObjectsBundle test = partition.getTest();
// Locate the class label column; the object column is taken to be the
// other of the first two columns (tcol can only be 0 or 1 here).
int lcol = AbstractHoldout.findClassLabelColumn(test);
int tcol = (lcol == 0) ? 1 : 0;
for (int i = 0, l = test.dataLength(); i < l; ++i) {
@SuppressWarnings("unchecked") O obj = (O) test.data(i, tcol);
ClassLabel truelbl = (ClassLabel) test.data(i, lcol);
ClassLabel predlbl = algorithm.classify(obj);
// NOTE(review): Collections.binarySearch returns a negative value when
// the label is not present in `labels`, which would make the increment
// below throw ArrayIndexOutOfBoundsException. Presumably the holdout
// guarantees all predicted/true labels are known -- confirm.
int pred = Collections.binarySearch(labels, predlbl);
int real = Collections.binarySearch(labels, truelbl);
confusion[pred][real]++;
}
LOG.statistics(dur.end());
}
LOG.statistics(time.end());
// Report the aggregated confusion matrix over all folds.
ConfusionMatrix m = new ConfusionMatrix(labels, confusion);
LOG.statistics(m.toString());
}
Usage example of de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle in the elki project (by elki-project):
the class LeaveOneOut, method nextPartitioning().
/**
 * Produce the next leave-one-out partitioning: the object at the current
 * position becomes the single-element test set, all other objects form the
 * training set. Returns null once every object has been the test instance.
 */
@Override
public TrainingAndTestSet nextPartitioning() {
  // Every object has served as the test instance once: iteration finished.
  if (pos >= len) {
    return null;
  }
  MultipleObjectsBundle training = new MultipleObjectsBundle();
  MultipleObjectsBundle test = new MultipleObjectsBundle();
  // Build both bundles column by column.
  final int cols = bundle.metaLength();
  final int rows = bundle.dataLength();
  for (int c = 0; c < cols; ++c) {
    ArrayList<Object> trainCol = new ArrayList<>(len - 1);
    ArrayList<Object> testCol = new ArrayList<>(1);
    for (int r = 0; r < rows; ++r) {
      // The held-out row goes to the test set, everything else to training.
      if (r == pos) {
        testCol.add(bundle.data(r, c));
      }
      else {
        trainCol.add(bundle.data(r, c));
      }
    }
    training.appendColumn(bundle.meta(c), trainCol);
    test.appendColumn(bundle.meta(c), testCol);
  }
  ++pos;
  return new TrainingAndTestSet(training, test, labels);
}
Aggregations