Example 6 with AbortException

Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

The class ArffParser, method parseAttributeStatements.

/**
 * Parse the "@attribute" section of the ARFF file.
 *
 * @param br Input
 * @param names List (to fill) of attribute names
 * @param types List (to fill) of attribute types
 * @throws IOException on read errors
 */
private void parseAttributeStatements(BufferedReader br, ArrayList<String> names, ArrayList<String> types) throws IOException {
    String line;
    // Load attribute metadata
    while (true) {
        line = br.readLine();
        if (line == null) {
            throw new AbortException(ARFF_HEADER_DATA + " not found in file.");
        }
        // Skip comments and empty lines
        if (ARFF_COMMENT.reset(line).matches() || EMPTY.reset(line).matches()) {
            continue;
        }
        // Break on data statement to continue
        if (ARFF_HEADER_DATA.reset(line).matches()) {
            break;
        }
        // Expect an attribute specification
        Matcher matcher = ARFF_HEADER_ATTRIBUTE.reset(line);
        if (matcher.matches()) {
            String name = matcher.group(1);
            if (name.charAt(0) == '\'' && name.charAt(name.length() - 1) == '\'') {
                name = name.substring(1, name.length() - 1);
            } else if (name.charAt(0) == '"' && name.charAt(name.length() - 1) == '"') {
                name = name.substring(1, name.length() - 1);
            }
            String type = matcher.group(2);
            names.add(name);
            types.add(type);
            // logger.warning("Attribute name: " + name + " type: " + type);
            continue;
        }
        throw new AbortException("Unrecognized line: " + line);
    }
    assert (names.size() == types.size());
}
Also used: Matcher (java.util.regex.Matcher), AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)
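
For reference, a minimal runnable sketch of the kind of matching this method relies on. The patterns below are simplified stand-ins for ELKI's ARFF_COMMENT, EMPTY, ARFF_HEADER_DATA and ARFF_HEADER_ATTRIBUTE matchers (their real definitions live elsewhere in ArffParser); the class and pattern names here are hypothetical.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ArffHeaderSketch {
    // Simplified stand-ins; not ELKI's actual pattern definitions.
    static final Pattern COMMENT = Pattern.compile("^\\s*%.*$");
    static final Pattern EMPTY = Pattern.compile("^\\s*$");
    static final Pattern DATA = Pattern.compile("^@data\\s*$", Pattern.CASE_INSENSITIVE);
    static final Pattern ATTRIBUTE = Pattern.compile("^@attribute\\s+('[^']+'|\"[^\"]+\"|\\S+)\\s+(.+)$", Pattern.CASE_INSENSITIVE);

    public static void main(String[] args) {
        String[] lines = { "% a comment, skipped", "", "@attribute 'sepal length' numeric", "@data" };
        for (String line : lines) {
            if (COMMENT.matcher(line).matches() || EMPTY.matcher(line).matches()) {
                continue; // skipped, as in the loop above
            }
            if (DATA.matcher(line).matches()) {
                break; // header ends here
            }
            Matcher m = ATTRIBUTE.matcher(line);
            if (m.matches()) {
                // Prints: name='sepal length' type=numeric
                // (parseAttributeStatements then strips the quotes from the name)
                System.out.println("name=" + m.group(1) + " type=" + m.group(2));
            }
        }
    }
}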

Example 7 with AbortException

Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

The class ArffParser, method parse.

@Override
public MultipleObjectsBundle parse(InputStream instream) {
    try (InputStreamReader ir = new InputStreamReader(instream);
        BufferedReader br = new BufferedReader(ir)) {
        ArrayList<String> names = new ArrayList<>(), types = new ArrayList<>();
        readHeader(br);
        parseAttributeStatements(br, names, types);
        // Convert into column mapping. Prepare arrays to fill
        int[] targ = new int[names.size()];
        TypeInformation[] elkitypes = new TypeInformation[names.size()];
        int[] dimsize = new int[names.size()];
        processColumnTypes(names, types, targ, elkitypes, dimsize);
        // Prepare bundle:
        // This is a bit complicated to produce vector fields.
        MultipleObjectsBundle bundle = new MultipleObjectsBundle();
        StreamTokenizer tokenizer = makeArffTokenizer(br);
        int state = 0;
        nextToken(tokenizer);
        while (tokenizer.ttype != StreamTokenizer.TT_EOF) {
            // Parse instance
            if (tokenizer.ttype == StreamTokenizer.TT_EOL) {
                // ignore empty lines
            } else if (tokenizer.ttype != '{') {
                if (state == 0) {
                    setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, false);
                    state = 1; // dense
                } else if (state != 1) {
                    throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
                }
                // Load a dense instance
                bundle.appendSimple(loadDenseInstance(tokenizer, dimsize, elkitypes, bundle.metaLength()));
            } else {
                if (state == 0) {
                    setupBundleHeaders(names, targ, elkitypes, dimsize, bundle, true);
                    state = 2; // sparse
                } else if (state != 2) {
                    throw new AbortException("Mixing dense and sparse vectors is currently not allowed.");
                }
                bundle.appendSimple(loadSparseInstance(tokenizer, targ, dimsize, elkitypes, bundle.metaLength()));
            }
            nextToken(tokenizer);
        }
        return bundle;
    } catch (IOException e) {
        throw new AbortException("IO error in parser", e);
    }
}
Also used: InputStreamReader (java.io.InputStreamReader), ArrayList (java.util.ArrayList), MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle), IOException (java.io.IOException), VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation), TypeInformation (de.lmu.ifi.dbs.elki.data.type.TypeInformation), SimpleTypeInformation (de.lmu.ifi.dbs.elki.data.type.SimpleTypeInformation), BufferedReader (java.io.BufferedReader), StreamTokenizer (java.io.StreamTokenizer), AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)
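
A hedged usage sketch for parse(): feeding an in-memory ARFF document and inspecting the resulting bundle. The two-Pattern constructor and the DEFAULT_ARFF_MAGIC_* constants are assumed to match the ELKI sources this example comes from; if your version differs, construct the parser via its Parameterizer instead.

import java.io.ByteArrayInputStream;
import java.util.regex.Pattern;

import de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle;
import de.lmu.ifi.dbs.elki.datasource.parser.ArffParser;

public class ArffParseSketch {
    public static void main(String[] args) {
        String arff = "@relation demo\n" //
            + "@attribute x numeric\n" //
            + "@attribute y numeric\n" //
            + "@data\n" //
            + "1.0,2.0\n" //
            + "3.5,4.5\n";
        ArffParser parser = new ArffParser( //
            Pattern.compile(ArffParser.DEFAULT_ARFF_MAGIC_EID, Pattern.CASE_INSENSITIVE),
            Pattern.compile(ArffParser.DEFAULT_ARFF_MAGIC_CLASS, Pattern.CASE_INSENSITIVE));
        MultipleObjectsBundle bundle = parser.parse(new ByteArrayInputStream(arff.getBytes()));
        // Two dense instances; both numeric attributes merge into one vector-field column:
        System.out.println(bundle.dataLength() + " instances, " + bundle.metaLength() + " columns");
    }
}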

Example 8 with AbortException

Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

The class ArffParser, method loadSparseInstance.

private Object[] loadSparseInstance(StreamTokenizer tokenizer, int[] targ, int[] dimsize, TypeInformation[] elkitypes, int metaLength) throws IOException {
    Int2ObjectOpenHashMap<Object> map = new Int2ObjectOpenHashMap<>();
    while (true) {
        nextToken(tokenizer);
        assert (tokenizer.ttype != StreamTokenizer.TT_EOF && tokenizer.ttype != StreamTokenizer.TT_EOL);
        if (tokenizer.ttype == '}') {
            nextToken(tokenizer);
            assert (tokenizer.ttype == StreamTokenizer.TT_EOF || tokenizer.ttype == StreamTokenizer.TT_EOL);
            break;
        } else {
            // sparse token
            if (tokenizer.ttype != StreamTokenizer.TT_WORD) {
                throw new AbortException("Unexpected token type encountered: " + tokenizer.toString() + " type: " + tokenizer.ttype);
            }
            int dim = ParseUtil.parseIntBase10(tokenizer.sval);
            if (map.containsKey(dim)) {
                throw new AbortException("Duplicate key in sparse vector: " + tokenizer.toString());
            }
            nextToken(tokenizer);
            if (tokenizer.ttype == StreamTokenizer.TT_WORD) {
                map.put(dim, TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[targ[dim]]) //
                    ? (Double) ParseUtil.parseDouble(tokenizer.sval) : tokenizer.sval);
            } else {
                throw new AbortException("Unexpected token type encountered: " + tokenizer.toString());
            }
        }
    }
    Object[] data = new Object[metaLength];
    for (int out = 0; out < metaLength; out++) {
        // Find the first index
        int s = -1;
        for (int i = 0; i < targ.length; i++) {
            if (targ[i] == out && s < 0) {
                s = i;
                break;
            }
        }
        assert (s >= 0);
        if (TypeUtil.NUMBER_VECTOR_FIELD.equals(elkitypes[out])) {
            Int2DoubleOpenHashMap f = new Int2DoubleOpenHashMap(dimsize[out]);
            for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
                Int2ObjectMap.Entry<Object> entry = iter.next();
                int i = entry.getIntKey();
                if (i < s || i >= s + dimsize[out]) {
                    continue;
                }
                double v = ((Double) entry.getValue()).doubleValue();
                f.put(i - s, v);
            }
            data[out] = new SparseDoubleVector(f, dimsize[out]);
        } else if (TypeUtil.LABELLIST.equals(elkitypes[out])) {
            // Build a label list out of successive labels
            labels.clear();
            for (ObjectIterator<Int2ObjectMap.Entry<Object>> iter = map.int2ObjectEntrySet().fastIterator(); iter.hasNext(); ) {
                Int2ObjectMap.Entry<Object> entry = iter.next();
                int i = entry.getIntKey();
                if (i < s) {
                    continue;
                }
                if (i >= s + dimsize[out]) {
                    break;
                }
                if (labels.size() < i - s) {
                    LOG.warning("Sparse consecutive labels are currently not correctly supported.");
                }
                labels.add((String) entry.getValue());
            }
            data[out] = LabelList.make(labels);
        } else if (TypeUtil.EXTERNALID.equals(elkitypes[out])) {
            String val = (String) map.get(s);
            if (val == null) {
                throw new AbortException("External ID column not set in sparse instance." + tokenizer.toString());
            }
            data[out] = new ExternalID(val);
        } else if (TypeUtil.CLASSLABEL.equals(elkitypes[out])) {
            Object val = map.get(s);
            if (val == null) {
                throw new AbortException("Class label column not set in sparse instance." + tokenizer.toString());
            }
            // TODO: support other class label types.
            ClassLabel lbl = new SimpleClassLabel(String.valueOf(val));
            data[out] = lbl;
        } else {
            throw new AbortException("Unsupported type for column " + "->" + out + ": " + ((elkitypes[out] != null) ? elkitypes[out].toString() : "null"));
        }
    }
    return data;
}
Also used: Int2ObjectOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap), ExternalID (de.lmu.ifi.dbs.elki.data.ExternalID), Int2ObjectMap (it.unimi.dsi.fastutil.ints.Int2ObjectMap), SimpleClassLabel (de.lmu.ifi.dbs.elki.data.SimpleClassLabel), SparseDoubleVector (de.lmu.ifi.dbs.elki.data.SparseDoubleVector), ObjectIterator (it.unimi.dsi.fastutil.objects.ObjectIterator), ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel), Int2DoubleOpenHashMap (it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap), AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)
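
To make the target representation concrete: a minimal sketch of what a sparse ARFF line such as "{0 1.5, 3 2.25}" becomes for a number-vector column, using the same SparseDoubleVector constructor as in the method above. The dimensionality of 4 is made up for the example.

import it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap;

import de.lmu.ifi.dbs.elki.data.SparseDoubleVector;

public class SparseInstanceSketch {
    public static void main(String[] args) {
        // "{0 1.5, 3 2.25}": index-value pairs; every other position is implicitly 0.
        Int2DoubleOpenHashMap values = new Int2DoubleOpenHashMap();
        values.put(0, 1.5);
        values.put(3, 2.25);
        SparseDoubleVector vec = new SparseDoubleVector(values, 4); // hypothetical dimensionality
        System.out.println(vec.doubleValue(0) + " " + vec.doubleValue(1)); // 1.5 0.0
    }
}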

Example 9 with AbortException

Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

The class NaiveAgglomerativeHierarchicalClustering2, method run.

/**
 * Run the algorithm
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public Result run(Database db, Relation<O> relation) {
    DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
    ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
    final int size = ids.size();
    if (size > 0x10000) {
        throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
    }
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
    // Compute the initial (lower triangular) distance matrix.
    double[] scratch = new double[triangleSize(size)];
    DBIDArrayIter ix = ids.iter(), iy = ids.iter();
    // Position counter - must agree with triangleSize()!
    int pos = 0;
    for (int x = 0; ix.valid(); x++, ix.advance()) {
        iy.seek(0);
        for (int y = 0; y < x; y++, iy.advance()) {
            scratch[pos] = dq.distance(ix, iy);
            pos++;
        }
    }
    // Initialize space for result:
    double[] height = new double[size];
    Arrays.fill(height, Double.POSITIVE_INFINITY);
    // Parent node, to track merges
    // have every object point to itself initially
    ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
    // Active clusters, when not trivial.
    Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
    // Repeat until everything merged, except the desired number of clusters:
    final int stop = size - numclusters;
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
    for (int i = 0; i < stop; i++) {
        double min = Double.POSITIVE_INFINITY;
        int minx = -1, miny = -1;
        for (int x = 0; x < size; x++) {
            if (height[x] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int xbase = triangleSize(x);
            for (int y = 0; y < x; y++) {
                if (height[y] < Double.POSITIVE_INFINITY) {
                    continue;
                }
                final int idx = xbase + y;
                if (scratch[idx] < min) {
                    min = scratch[idx];
                    minx = x;
                    miny = y;
                }
            }
        }
        assert (minx >= 0 && miny >= 0);
        // Avoid allocating memory, by reusing existing iterators:
        ix.seek(minx);
        iy.seek(miny);
        // Perform merge in data structure: x -> y
        // Since y < x, prefer keeping y, dropping x.
        height[minx] = min;
        parent.set(minx, iy);
        // Merge into cluster
        ModifiableDBIDs cx = clusters.get(minx);
        ModifiableDBIDs cy = clusters.get(miny);
        if (cy == null) {
            cy = DBIDUtil.newHashSet();
            cy.add(iy);
        }
        if (cx == null) {
            cy.add(ix);
        } else {
            cy.addDBIDs(cx);
            clusters.remove(minx);
        }
        clusters.put(miny, cy);
        // Update distance matrix. Note: miny < minx
        final int xbase = triangleSize(minx), ybase = triangleSize(miny);
        // Write to (y, j), with j < y
        for (int j = 0; j < miny; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            scratch[ybase + j] = Math.min(scratch[xbase + j], scratch[ybase + j]);
        }
        // Write to (j, y), with y < j < x
        for (int j = miny + 1; j < minx; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[xbase + j], scratch[jbase + miny]);
        }
        // Write to (j, y), with y < x < j
        for (int j = minx + 1; j < size; j++) {
            if (height[j] < Double.POSITIVE_INFINITY) {
                continue;
            }
            final int jbase = triangleSize(j);
            scratch[jbase + miny] = Math.min(scratch[jbase + minx], scratch[jbase + miny]);
        }
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Build the clustering result
    final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
    for (int x = 0; x < size; x++) {
        if (height[x] < Double.POSITIVE_INFINITY) {
            DBIDs cids = clusters.get(x);
            if (cids == null) {
                ix.seek(x);
                cids = DBIDUtil.deref(ix);
            }
            Cluster<Model> cluster = new Cluster<>("Cluster", cids);
            dendrogram.addToplevelCluster(cluster);
        }
    }
    return dendrogram;
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), Cluster (de.lmu.ifi.dbs.elki.data.Cluster), Clustering (de.lmu.ifi.dbs.elki.data.Clustering), Model (de.lmu.ifi.dbs.elki.data.model.Model), AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException), Int2ReferenceOpenHashMap (it.unimi.dsi.fastutil.ints.Int2ReferenceOpenHashMap)
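
The scratch array above packs the lower triangle of the distance matrix into a single double[]. A short sketch of that layout, with triangleSize(x) assumed to be x*(x-1)/2 as in the ELKI sources; this is also why the method aborts for size > 0x10000, since triangleSize overflows int just beyond that.

public class TriangleIndexSketch {
    // Assumed to match ELKI's triangleSize(): number of entries below row x.
    static int triangleSize(int x) {
        return (x * (x - 1)) >>> 1;
    }

    public static void main(String[] args) {
        int size = 4;
        double[] scratch = new double[triangleSize(size)]; // 6 cells for 4 points
        int pos = 0;
        for (int x = 0; x < size; x++) {
            for (int y = 0; y < x; y++) {
                // d(x, y) with y < x lives at triangleSize(x) + y,
                // which is exactly what the position counter tracks:
                assert pos == triangleSize(x) + y;
                scratch[pos++] = 0.0; // dq.distance(ix, iy) in the real code
            }
        }
        System.out.println("packed cells: " + scratch.length);
    }
}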

Example 10 with AbortException

Use of de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException in project elki by elki-project.

The class SimpleTextLoader, method run.

@Override
public void run() {
    try {
        final Directory dir = FSDirectory.open(index);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
        IndexWriter writer = new IndexWriter(dir, config);
        final URI suri = source.toURI();
        for (File inf : source.listFiles()) {
            Document doc = new Document();
            String id = suri.relativize(inf.toURI()).getPath();
            String text;
            try (FileInputStream fis = new FileInputStream(inf)) {
                // Ensure the stream is closed; the original code left it open.
                text = FileUtil.slurp(fis);
            }
            doc.add(new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));
            doc.add(new Field("contents", text, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        }
        writer.close();
    } catch (IOException e) {
        throw new AbortException("I/O error in lucene.", e);
    }
}
Also used: Field (org.apache.lucene.document.Field), IndexWriter (org.apache.lucene.index.IndexWriter), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), IOException (java.io.IOException), Document (org.apache.lucene.document.Document), URI (java.net.URI), File (java.io.File), FileInputStream (java.io.FileInputStream), Directory (org.apache.lucene.store.Directory), FSDirectory (org.apache.lucene.store.FSDirectory), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig), AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)
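
A hedged read-back sketch against the index this loader writes, using the Lucene 3.6 API matching the Version constants above; the index path is hypothetical. Note that StandardAnalyzer lowercases tokens when indexing "contents", so query terms must be lowercase too.

import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class LuceneQuerySketch {
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/text-index"))); // hypothetical path
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs hits = searcher.search(new TermQuery(new Term("contents", "elki")), 10);
        for (ScoreDoc sd : hits.scoreDocs) {
            // "id" holds the relative file path stored by the loader above.
            System.out.println(searcher.doc(sd.doc).get("id"));
        }
        searcher.close();
        reader.close();
    }
}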

Aggregations

AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)99 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)25 IOException (java.io.IOException)24 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)22 ArrayList (java.util.ArrayList)16 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)13 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)13 NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)10 DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)9 DoubleRelation (de.lmu.ifi.dbs.elki.database.relation.DoubleRelation)9 Clustering (de.lmu.ifi.dbs.elki.data.Clustering)8 Model (de.lmu.ifi.dbs.elki.data.model.Model)8 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)8 Database (de.lmu.ifi.dbs.elki.database.Database)8 WritableDoubleDataStore (de.lmu.ifi.dbs.elki.database.datastore.WritableDoubleDataStore)8 DBIDRange (de.lmu.ifi.dbs.elki.database.ids.DBIDRange)8 OutlierResult (de.lmu.ifi.dbs.elki.result.outlier.OutlierResult)8 MaterializedDoubleRelation (de.lmu.ifi.dbs.elki.database.relation.MaterializedDoubleRelation)6 ClassLabel (de.lmu.ifi.dbs.elki.data.ClassLabel)5 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)5