Search in sources :

Example 1 with CorpusDataException

use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.

the class Test method getTermVector.

/**
 * Builds a {@link MultiTermTokenStream} from a compact textual description (test helper).
 *
 * <p>{@code stream} is split on single spaces into tokens, and every token is split on
 * {@code |} into fields. For each token one {@link MultiTermToken} is created:
 * <ul>
 *   <li>field 0 is added with prefix {@code 's'} and, lower-cased, with prefix {@code 'i'};</li>
 *   <li>the literal term {@code "T"} is added;</li>
 *   <li>field 1 is added with prefix {@code 'p'} and field 2 with prefix {@code 'l'};</li>
 *   <li>if the token has exactly four fields, field 3 is split on {@code ;} and every part
 *       is added with prefix {@code 'm'};</li>
 *   <li>if the token has exactly five fields, field 4 is added with prefix {@code 'e'}.</li>
 * </ul>
 * Every token must therefore provide at least three fields; fewer fields raise an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param stream space separated tokens, each consisting of {@code |} separated fields
 * @return the token stream containing one multi term token per input token
 */
public static MultiTermTokenStream getTermVector(String stream) {
    MultiTermTokenStream ts = new MultiTermTokenStream();
    for (String seg : stream.split(" ")) {
        String[] tokseg = seg.split("\\|");
        try {
            MultiTermToken mtt = new MultiTermToken('s', tokseg[0]);
            mtt.add("T");
            // Locale.ROOT keeps the case-folded term independent of the JVM's default locale
            mtt.add('i', tokseg[0].toLowerCase(java.util.Locale.ROOT));
            mtt.add('p', tokseg[1]);
            mtt.add('l', tokseg[2]);
            // NOTE(review): with five fields the fourth field is skipped entirely because
            // the check is "== 4" rather than ">= 4" - confirm this is intended.
            if (tokseg.length == 4) {
                for (String morph : tokseg[3].split(";")) {
                    mtt.add('m', morph);
                }
            }
            if (tokseg.length == 5) {
                mtt.add('e', tokseg[4]);
            }
            ts.addMultiTermToken(mtt);
        } catch (CorpusDataException cde) {
            // Malformed term data is reported as a test failure
            fail(cde.getErrorCode() + ": " + cde.getMessage());
        }
    }
    return ts;
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 2 with CorpusDataException

use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.

the class MultiTerm method _fromString.

/**
 * Deserialize MultiTerm from string representation.
 *
 * <p>The string is first split at the first {@code $} that is not preceded by a backslash;
 * the part after it, if any, is the payload. A payload that starts with a three character
 * type marker ({@code <b>}, {@code <s>}, {@code <i>} or {@code <l>}) is read as a sequence
 * of marker/value pairs that are written to the byte buffer {@code bb} as byte, short, int
 * or long respectively; any other payload (including an empty one) is stored as a plain
 * string.
 *
 * <p>The part before the payload is then split at the first {@code #} that is not preceded
 * by a backslash; the part after it, if any, must have the form {@code start-end} and sets
 * {@code start} and {@code end}. The remaining text is unescaped and stored as the term.
 *
 * <p>This method works on the members {@code bb}, {@code l}, {@code i} and
 * {@code stringOffset}, which are declared outside of this method.
 *
 * @param term the serialized term
 * @throws CorpusDataException with code 952 if the offset is not numeric, or with code 953
 *         if the offset has no {@code -} separator or an empty start position
 */
private void _fromString(String term) throws CorpusDataException {
    String[] termSurface = term.split("(?<!\\\\)\\$", 2);
    // Payload is given
    if (termSurface.length == 2) {
        String payloadStr = termSurface[1];
        // Payload has a type. The length is checked first so that an empty or very short
        // payload (e.g. "term$" or "term$<b") is handled as a string payload instead of
        // raising a StringIndexOutOfBoundsException.
        if (payloadStr.length() >= 3
                && payloadStr.charAt(0) == '<'
                && payloadStr.charAt(2) == '>') {
            // Rewind bytebuffer
            bb.rewind();
            // Split payload at type marker boundaries, so markers end up at even
            // and their values at odd indices
            String[] pls = payloadStr.split("((?=<)|(?<=>))(?!\\A)");
            // Bytearray length
            l = 0;
            try {
                for (i = 1; i < pls.length; i += 2) {
                    // Resize the bytebuffer, so the largest value (a long) always fits
                    if ((bb.capacity() - l) < 8) {
                        bb = ByteBuffer.allocate(bb.capacity() + 8).put(bb.array());
                        // put() moved the position to the old capacity - move it back
                        // to the end of the bytes written so far
                        bb.position(l);
                    }
                    switch (pls[i - 1]) {
                        case "<b>": // byte
                            bb.put(Byte.parseByte(pls[i]));
                            l++;
                            break;
                        case "<s>": // short
                            bb.putShort(Short.parseShort(pls[i]));
                            l += 2;
                            break;
                        case "<i>": // integer
                            bb.putInt(Integer.parseInt(pls[i]));
                            l += 4;
                            break;
                        case "<l>": // long
                            bb.putLong(Long.parseLong(pls[i]));
                            l += 8;
                            break;
                    }
                }
                byte[] bytes = new byte[l];
                System.arraycopy(bb.array(), 0, bytes, 0, l);
                this.payload = new BytesRef(bytes);
            } catch (Exception e) {
                // Best effort: if the payload can't be parsed, the payload is left
                // untouched and the problem is only logged in debug mode
                if (DEBUG)
                    log.warn(e.getMessage());
            }
        } else {
            // Payload is a string
            this.payload = new BytesRef(payloadStr);
        }
    }
    // Parse offset information
    stringOffset = termSurface[0].split("(?<!\\\\)\\#", 2);
    if (stringOffset.length == 2) {
        // Split start and end position of the offset
        String[] offset = stringOffset[1].split("\\-", 2);
        // Start and end is given
        if (offset.length == 2 && offset[0].length() > 0) {
            try {
                this.start = Integer.parseInt(offset[0]);
                this.end = Integer.parseInt(offset[1]);
            } catch (NumberFormatException e) {
                throw new CorpusDataException(952, "Given offset information is not numeric in " + termSurface[0]);
            }
        } else {
            throw new CorpusDataException(953, "Given offset information is incomplete in " + termSurface[0]);
        }
    }
    this.term = _unescape(stringOffset[0]);
}
Also used: CorpusDataException (de.ids_mannheim.korap.util.CorpusDataException), BytesRef (org.apache.lucene.util.BytesRef)

Example 3 with CorpusDataException

use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.

the class FieldDocument method setFields.

/**
 * Deserialize token stream data (LEGACY).
 *
 * <p>The first entry of {@code fields} provides the primary data under the key
 * {@code "primaryData"}. Every following entry describes one field: {@code "name"} holds
 * the field name and {@code "data"} the list of tokens, where each token is a list of term
 * strings. For every token a {@link MultiTermToken} is created from the first term and all
 * further terms are added to it. Optional {@code "foundries"} and {@code "tokenization"}
 * entries are stored as well. Finally the token stream is added as a term vector under the
 * field name.
 *
 * <p>Terms that raise a {@link CorpusDataException} are reported via {@code addError} and
 * the affected token is skipped. The passed lists are only read, never modified.
 *
 * @param fields the primary data entry followed by the field entries; must not be empty
 */
public void setFields(ArrayList<Map<String, Object>> fields) {
    // Read the first entry without removing it, so the caller's list stays intact
    Map<String, Object> primary = fields.get(0);
    this.setPrimaryData((String) primary.get("primaryData"));
    for (Map<String, Object> field : fields.subList(1, fields.size())) {
        String fieldName = (String) field.get("name");
        MultiTermTokenStream mtts = this.newMultiTermTokenStream();
        @SuppressWarnings("unchecked")
        ArrayList<ArrayList<String>> tokens = (ArrayList<ArrayList<String>>) field.get("data");
        for (ArrayList<String> token : tokens) {
            try {
                // The first term initializes the token, the remaining terms are added
                MultiTermToken mtt = new MultiTermToken(token.get(0));
                for (String term : token.subList(1, token.size())) {
                    mtt.add(term);
                }
                mtts.addMultiTermToken(mtt);
            } catch (CorpusDataException cde) {
                this.addError(cde.getErrorCode(), cde.getMessage());
            }
        }
        // Store foundry and tokenization information given with the field
        if (field.containsKey("foundries")) {
            // TODO: Do not store positions!
            String foundries = (String) field.get("foundries");
            this.addKeyword("foundries", foundries);
            super.setFoundries(foundries);
        }
        if (field.containsKey("tokenization")) {
            String tokenization = (String) field.get("tokenization");
            this.addString("tokenization", tokenization);
            super.setTokenization(tokenization);
        }
        this.addTV(fieldName, this.getPrimaryData(), mtts);
    }
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 4 with CorpusDataException

use of de.ids_mannheim.korap.util.CorpusDataException in project Krill by KorAP.

the class FieldDocument method setData.

/**
 * Deserialize token stream data.
 *
 * <p>Reads the primary data from {@code "text"}, the field name from {@code "name"} and the
 * tokens from {@code "stream"}, where each token is a list of term strings. For every token
 * a {@link MultiTermToken} is created from the first term and all further terms are added
 * to it. The resulting token stream is added as a term vector under the field name.
 * The optional entries {@code "foundries"}, {@code "layerInfos"} and {@code "tokenSource"}
 * are passed to the corresponding setters.
 *
 * <p>Terms that raise a {@link CorpusDataException} are reported via {@code addError} and
 * the affected token is skipped. The passed token lists are only read, never modified.
 *
 * @param node the map representation of the token stream data
 */
public void setData(Map<String, Object> node) {
    this.setPrimaryData((String) node.get("text"));
    String fieldName = (String) node.get("name");
    MultiTermTokenStream mtts = this.newMultiTermTokenStream();
    @SuppressWarnings("unchecked")
    ArrayList<ArrayList<String>> stream = (ArrayList<ArrayList<String>>) node.get("stream");
    // Iterate over all tokens in stream
    for (ArrayList<String> token : stream) {
        try {
            // Initialize MultiTermToken with the first term; the term is only read,
            // so the caller's token list stays intact
            MultiTermToken mtt = new MultiTermToken(token.get(0));
            // Add rest of the list
            for (String term : token.subList(1, token.size())) {
                mtt.add(term);
            }
            // Add MultiTermToken to stream
            mtts.addMultiTermToken(mtt);
        } catch (CorpusDataException cde) {
            this.addError(cde.getErrorCode(), cde.getMessage());
        }
    }
    // Add tokenstream to fielddocument
    this.addTV(fieldName, this.getPrimaryData(), mtts);
    // Get foundry info
    if (node.containsKey("foundries")) {
        this.setFoundries((String) node.get("foundries"));
    }
    // Get layer info
    if (node.containsKey("layerInfos")) {
        this.setLayerInfos((String) node.get("layerInfos"));
    }
    // Get tokenSource info
    if (node.containsKey("tokenSource")) {
        this.setTokenSource((String) node.get("tokenSource"));
    }
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Aggregations

CorpusDataException (de.ids_mannheim.korap.util.CorpusDataException): 4, MultiTermToken (de.ids_mannheim.korap.index.MultiTermToken): 3, MultiTermTokenStream (de.ids_mannheim.korap.index.MultiTermTokenStream): 3, BytesRef (org.apache.lucene.util.BytesRef): 1