Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.
The class ArffParser, method parseAttributeStatements.
/**
 * Parse the "@attribute" section of the ARFF file.
 *
 * @param br Input
 * @param names List (to fill) of attribute names
 * @param types List (to fill) of attribute types
 * @throws IOException
 */
private void parseAttributeStatements(BufferedReader br, ArrayList<String> names, ArrayList<String> types) throws IOException {
  String line;
  // Load attribute metadata
  while (true) {
    line = br.readLine();
    if (line == null) {
      throw new AbortException(ARFF_HEADER_DATA + " not found in file.");
    }
    // Skip comments and empty lines
    if (ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) {
      continue;
    }
    // Break on data statement to continue
    if (ARFF_HEADER_DATA.reset(line).matches()) {
      break;
    }
    // Expect an attribute specification
    Matcher matcher = ARFF_HEADER_ATTRIBUTE.reset(line);
    if (matcher.matches()) {
      String name = matcher.group(1);
      if (name.charAt(0) == '\'' && name.charAt(name.length() - 1) == '\'') {
        name = name.substring(1, name.length() - 1);
      } else if (name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
        name = name.substring(1, name.length() - 1);
      }
      String type = matcher.group(2);
      names.add(name);
      types.add(type);
      // logger.warning("Attribute name: " + name + " type: " + type);
      continue;
    }
    throw new AbortException("Unrecognized line: " + line);
  }
  assert (names.size() == types.size());
}
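The quote handling in this method removes one pair of matching single or double quotes around the attribute name, so a line such as @attribute 'sepal length' numeric yields the name sepal length and the type numeric. The same logic as a standalone sketch (a hypothetical helper, not part of the class):

// Hypothetical helper mirroring the quote stripping above.
static String stripQuotes(String name) {
  if (name.length() >= 2) {
    char first = name.charAt(0), last = name.charAt(name.length() - 1);
    if ((first == '\'' && last == '\'') || (first == '"' && last == '"')) {
      return name.substring(1, name.length() - 1);
    }
  }
  return name;
}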
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.
The class ArffParser, method parse.
@Override
public MultipleObjectsBundle parse(InputStream instream) {
  try (InputStreamReader ir = new InputStreamReader(instream);
      BufferedReader br = new BufferedReader(ir)) {
    ArrayList<String> names = new ArrayList<>(), types = new ArrayList<>();
    readHeader(br);
    parseAttributeStatements(br, names, types);

    // Convert into column mapping. Prepare arrays to fill
    int[] targ = new int[names.size()];
    TypeInformation[] elkitypes = new TypeInformation[names.size()];
    int[] dimsize = new int[names.size()];
    processColumnTypes(names, types, targ, elkitypes, dimsize);

    // Prepare bundle:
    // This is a bit complicated to produce vector fields.
    MultipleObjectsBundle bundle = new MultipleObjectsBundle();
    StreamTokenizer tokenizer = makeArffTokenizer(br);

    int state = 0;
    nextToken(tokenizer);
    while (tokenizer.ttype != StreamTokenizer.TT_EOF) {
      // Parse instance
      if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
        // ignore empty lines
      } else if (tokenizer.ttype != '{') {
        if (state == 0) {
          setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, false);
          state = 1; // dense
        } else if (state != 1) {
          throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
        }
        // Load a dense instance
        bundle.appendSimple(loadDenseInstance(tokenizer, dimsize, elkitypes, bundle.metaLength()));
      } else {
        if (state == 0) {
          setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, true);
          state = 2; // sparse
        } else if (state != 2) {
          throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
        }
        bundle.appendSimple(loadSparseInstance(tokenizer, targ, dimsize, elkitypes, bundle.metaLength()));
      }
      nextToken(tokenizer);
    }
    return bundle;
  } catch (IOException e) {
    throw new AbortException("IO error in parser", e);
  }
}
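A minimal usage sketch, not taken from the project: parse a small ARFF document held in memory and inspect the resulting bundle. How the parser instance is obtained and configured (ELKI normally does this through its parameterization API) is left open here, as is the import boilerplate (java.io.ByteArrayInputStream, java.nio.charset.StandardCharsets).

// Hypothetical example input: two numeric attributes and a nominal class.
String arff = "@relation example\n"
    + "@attribute x numeric\n"
    + "@attribute y numeric\n"
    + "@attribute class {a,b}\n"
    + "@data\n"
    + "1.0,2.0,a\n"
    + "3.5,0.5,b\n";
InputStream in = new ByteArrayInputStream(arff.getBytes(StandardCharsets.UTF_8));
// `parser` is an already configured ArffParser instance (assumption).
MultipleObjectsBundle bundle = parser.parse(in);
// One row per instance; columns follow the mapping computed by processColumnTypes.
System.out.println(bundle.dataLength() + " instances, " + bundle.metaLength() + " columns");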
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.
The class ArffParser, method loadSparseInstance.
private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
  Int2ObjectOpenHashMap<Object> map = new Int2ObjectOpenHashMap<>();
  while (true) {
    nextToken(tokenizer);
    assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
    if (tokenizer.ttype == '}') {
      nextToken(tokenizer);
      assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL);
      break;
    } else {
      // sparse token
      if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
        throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
      }
      int dim = ParseUtil.parseIntBase10(tokenizer.sval);
      if (map.containsKey(dim)) {
        throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
      }
      nextToken(tokenizer);
      if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
        map.put(dim, //
            TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]]) ? (Double) ParseUtil.parseDouble(tokenizer.sval) : tokenizer.sval);
      } else {
        throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
      }
    }
  }
  Object[] data = new Object[metaLength];
  for (int out = 0; out < metaLength; out++) {
    // Find the first index
    int s = -1;
    for (int i = 0; i < targ.length; i++) {
      if (targ[i] == out && s < 0) {
        s = i;
        break;
      }
    }
    assert (s >= 0);
    if (TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
      Int2DoubleOpenHashMap f = new Int2DoubleOpenHashMap(dimsize[out]);
      for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext();) {
        Int2ObjectMap.Entry<Object> entry = iter.next();
        int i = entry.getIntKey();
        if (i < s || i >= s + dimsize[out]) {
          continue;
        }
        double v = ((Double) entry.getValue()).doubleValue();
        f.put(i - s, v);
      }
      data[out] = new SparseDoubleVector(f, dimsize[out]);
    } else if (TypeUtil.LABELLIST.equals(elkitypes[out])) {
      // Build a label list out of successive labels
      labels.clear();
      for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext();) {
        Int2ObjectMap.Entry<Object> entry = iter.next();
        int i = entry.getIntKey();
        if (i < s) {
          continue;
        }
        if (i >= s + dimsize[out]) {
          break;
        }
        if (labels.size() < i - s) {
          LOG.warning("Sparse consecutive labels are currently not correctly supported.");
        }
        labels.add((String) entry.getValue());
      }
      data[out] = LabelList.make(labels);
    } else if (TypeUtil.EXTERNALID.equals(elkitypes[out])) {
      String val = (String) map.get(s);
      if (val == null) {
        throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
      }
      data[out] = new ExternalID(val);
    } else if (TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
      Object val = map.get(s);
      if (val == null) {
        throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString());
      }
      // TODO: support other class label types.
      ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
      data[out] = lbl;
    } else {
      throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null"));
    }
  }
  return data;
}
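To illustrate the second half of the method: targ[dim] maps an ARFF column to an output bundle column, and s is the first ARFF column feeding that output column, so a sparse index dim becomes offset dim - s within its column group. A small sketch with made-up arrays (illustration only, not from the project):

// Hypothetical layout: five ARFF columns, where columns 0-2 form one numeric
// vector field (output column 0), column 3 is a class label (output column 1)
// and column 4 is an external id (output column 2).
int[] targ = { 0, 0, 0, 1, 2 };
int[] dimsize = { 3, 1, 1 };
// A sparse entry "2 4.5" has dim = 2; it belongs to output column targ[2] = 0.
int dim = 2;
int out = targ[dim];
// s is the first ARFF column of that output column, as in the loop above.
int s = 0;
while (targ[s] != out) {
  s++;
}
System.out.println("output column " + out + ", offset " + (dim - s)); // -> column 0, offset 2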
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.
The class NaiveAgglomerativeHierarchicalClustering2, method run.
/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public Result run(Database db, Relation<O> relation) {
  DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();
  if (size > 0x10000) {
    throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
  }
  LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");

  // Compute the initial (lower triangular) distance matrix.
  double[] scratch = new double[triangleSize(size)];
  DBIDArrayIter ix = ids.iter(), iy = ids.iter();
  // Position counter - must agree with computeOffset!
  int pos = 0;
  for (int x = 0; ix.valid(); x++, ix.advance()) {
    iy.seek(0);
    for (int y = 0; y < x; y++, iy.advance()) {
      scratch[pos] = dq.distance(ix, iy);
      pos++;
    }
  }

  // Initialize space for result:
  double[] height = new double[size];
  Arrays.fill(height, Double.POSITIVE_INFINITY);
  // Parent node, to track merges
  // have every object point to itself initially
  ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
  // Active clusters, when not trivial.
  Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();

  // Repeat until everything merged, except the desired number of clusters:
  final int stop = size - numclusters;
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
  for (int i = 0; i < stop; i++) {
    double min = Double.POSITIVE_INFINITY;
    int minx = -1, miny = -1;
    for (int x = 0; x < size; x++) {
      if (height[x] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int xbase = triangleSize(x);
      for (int y = 0; y < x; y++) {
        if (height[y] < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int idx = xbase + y;
        if (scratch[idx] < min) {
          min = scratch[idx];
          minx = x;
          miny = y;
        }
      }
    }
    assert (minx >= 0 && miny >= 0);
    // Avoid allocating memory, by reusing existing iterators:
    ix.seek(minx);
    iy.seek(miny);
    // Perform merge in data structure: x -> y
    // Since y < x, prefer keeping y, dropping x.
    height[minx] = min;
    parent.set(minx, iy);
    // Merge into cluster
    ModifiableDBIDs cx = clusters.get(minx);
    ModifiableDBIDs cy = clusters.get(miny);
    if (cy == null) {
      cy = DBIDUtil.newHashSet();
      cy.add(iy);
    }
    if (cx == null) {
      cy.add(ix);
    } else {
      cy.addDBIDs(cx);
      clusters.remove(minx);
    }
    clusters.put(miny, cy);
    // Update distance matrix. Note: miny < minx
    final int xbase = triangleSize(minx), ybase = triangleSize(miny);
    // Write to (y, j), with j < y
    for (int j = 0; j < miny; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      scratch[ybase + j] = Math.min(scratch[xbase + j], scratch[ybase + j]);
    }
    // Write to (j, y), with y < j < x
    for (int j = miny + 1; j < minx; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(j);
      scratch[jbase + miny] = Math.min(scratch[xbase + j], scratch[jbase + miny]);
    }
    // Write to (j, y), with y < x < j
    for (int j = minx + 1; j < size; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(j);
      scratch[jbase + miny] = Math.min(scratch[jbase + minx], scratch[jbase + miny]);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);

  // Build the clustering result
  final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
  for (int x = 0; x < size; x++) {
    // Skip objects that were merged away; only the remaining representatives
    // (height still at infinity) correspond to the resulting clusters.
    if (height[x] < Double.POSITIVE_INFINITY) {
      continue;
    }
    DBIDs cids = clusters.get(x);
    if (cids == null) {
      ix.seek(x);
      cids = DBIDUtil.deref(ix);
    }
    Cluster<Model> cluster = new Cluster<>("Cluster", cids);
    dendrogram.addToplevelCluster(cluster);
  }
  return dendrogram;
}
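The lower-triangle indexing relies on a triangleSize helper that is not shown in this excerpt. Consistent with the position counter in the matrix-filling loop above (row x contributes x entries), it must return the x-th triangular number; a sketch of what it presumably looks like (the actual helper in the class may differ in form):

/**
 * Offset of row x in the lower triangular matrix: 0 + 1 + ... + (x - 1).
 */
protected static int triangleSize(int x) {
  return (x * (x - 1)) >>> 1;
}

With more than 0x10000 objects, x * (x - 1) no longer fits into an int, which is exactly the integer overflow the size check at the top of the method guards against.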
Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.
The class SimpleTextLoader, method run.
@Override
public void run() {
  try {
    final Directory dir = FSDirectory.open(index);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
    IndexWriter writer = new IndexWriter(dir, config);
    final URI suri = source.toURI();
    for (File inf : source.listFiles()) {
      Document doc = new Document();
      String id = suri.relativize(inf.toURI()).getPath();
      String text = FileUtil.slurp(new FileInputStream(inf));
      doc.add(new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
      doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
      writer.addDocument(doc);
    }
    writer.close();
  } catch (IOException e) {
    throw new AbortException("I/O error in lucene.", e);
  }
}
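For completeness, a sketch of how the index written here could be read back, assuming the same Lucene 3.6-era API as the writer code above (this is not part of SimpleTextLoader, and exception handling is omitted):

// Assumption: Lucene 3.x API, matching the IndexWriter usage above.
Directory dir = FSDirectory.open(index);
IndexReader reader = IndexReader.open(dir);
for (int i = 0; i < reader.maxDoc(); i++) {
  Document doc = reader.document(i);
  // Both fields were stored above, so they can be retrieved directly.
  System.out.println(doc.get("id") + ": " + doc.get("contents").length() + " characters");
}
reader.close();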