Search in sources:

Example 1 with MultiTermToken

use of de.ids_mannheim.korap.index.MultiTermToken in project Krill by KorAP.

In the class Test, method getTermVector.

/**
 * Builds a {@link MultiTermTokenStream} from a compact textual notation.
 *
 * <p>{@code stream} is split on single spaces into tokens and every token is
 * split on {@code |} into segments. For each token:
 * <ul>
 *   <li>segment 0 becomes the {@code 's'} term and, lower-cased, the {@code 'i'} term;</li>
 *   <li>the bare term {@code "T"} is added;</li>
 *   <li>segment 1 becomes the {@code 'p'} term and segment 2 the {@code 'l'} term;</li>
 *   <li>with exactly four segments, every {@code ;}-separated part of segment 3
 *       becomes an {@code 'm'} term;</li>
 *   <li>with exactly five segments, segment 4 becomes an {@code 'e'} term.</li>
 * </ul>
 *
 * <p>Segments 1 and 2 are read unconditionally, so each token needs at least
 * three segments. A {@link CorpusDataException} raised while building a token
 * is reported through {@code fail(...)}.
 *
 * @param stream space-separated tokens in the notation described above
 * @return the stream holding the tokens that were built
 */
public static MultiTermTokenStream getTermVector(String stream) {
    MultiTermTokenStream ts = new MultiTermTokenStream();
    for (String seg : stream.split(" ")) {
        String[] tokseg = seg.split("\\|");
        try {
            MultiTermToken mtt = new MultiTermToken('s', tokseg[0]);
            mtt.add("T");
            // Locale.ROOT keeps the lower-cased term independent of the
            // machine's default locale (e.g. Turkish dotless i).
            mtt.add('i', tokseg[0].toLowerCase(java.util.Locale.ROOT));
            mtt.add('p', tokseg[1]);
            mtt.add('l', tokseg[2]);
            // NOTE(review): segment 3 is only read when there are exactly four
            // segments; a five-segment token skips it. Confirm this is intended.
            if (tokseg.length == 4) {
                for (String morph : tokseg[3].split(";")) {
                    mtt.add('m', morph);
                }
            }
            if (tokseg.length == 5) {
                mtt.add('e', tokseg[4]);
            }
            ts.addMultiTermToken(mtt);
        } catch (CorpusDataException cde) {
            fail(cde.getErrorCode() + ": " + cde.getMessage());
        }
    }
    return ts;
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 2 with MultiTermToken

use of de.ids_mannheim.korap.index.MultiTermToken in project Krill by KorAP.

In the class TestIndex, method multiTermToken.

/**
 * Checks that the strings passed to the {@link MultiTermToken} constructor
 * are available, in the same order, through {@code terms.get(i).term}.
 */
@Test
public void multiTermToken() throws CorpusDataException {
    MultiTermToken test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
    // assertEquals takes (expected, actual); keeping that order makes the
    // failure message report the right value as "expected".
    assertEquals("hunde", test.terms.get(0).term);
    assertEquals("pos:n", test.terms.get(1).term);
    assertEquals("m:gen:pl", test.terms.get(2).term);

    // NOTE(review): this second instance is built with the same arguments as
    // the first, so it repeats the checks above; it may have been meant to
    // exercise a different constructor. Confirm.
    test = new MultiTermToken("hunde", "pos:n", "m:gen:pl");
    assertEquals("hunde", test.terms.get(0).term);
    assertEquals("pos:n", test.terms.get(1).term);
    assertEquals("m:gen:pl", test.terms.get(2).term);
}
Also used : MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) Test(org.junit.Test) Test(de.ids_mannheim.korap.Test)

Example 3 with MultiTermToken

use of de.ids_mannheim.korap.index.MultiTermToken in project Krill by KorAP.

In the class FieldDocument, method setFields.

/**
 * Deserialize token stream data (LEGACY).
 *
 * <p>The first entry of {@code fields} supplies the primary data under the
 * key {@code "primaryData"}. Every following entry describes one field:
 * {@code "name"} is the field name and {@code "data"} is a list of tokens,
 * each token being a list of term strings. The first term of a token creates
 * the {@link MultiTermToken}; the remaining terms are added to it. Optional
 * {@code "foundries"} and {@code "tokenization"} entries are stored on the
 * document as well.
 *
 * <p>A {@link CorpusDataException} raised for a token is recorded through
 * {@code addError} and that token is left out of the stream. The passed
 * lists are only read, never modified.
 *
 * @param fields the primary-data entry followed by the field entries;
 *               must contain at least one element
 */
public void setFields(ArrayList<Map<String, Object>> fields) {
    // Read the leading entry in place instead of removing it, so the
    // caller's list is left untouched.
    Map<String, Object> primary = fields.get(0);
    this.setPrimaryData((String) primary.get("primaryData"));

    for (Map<String, Object> field : fields.subList(1, fields.size())) {
        String fieldName = (String) field.get("name");
        MultiTermTokenStream mtts = this.newMultiTermTokenStream();

        // The map is untyped, so this cast cannot be checked by the compiler.
        @SuppressWarnings("unchecked")
        ArrayList<ArrayList<String>> tokens =
                (ArrayList<ArrayList<String>>) field.get("data");

        for (ArrayList<String> token : tokens) {
            try {
                // First term creates the token, the rest are appended.
                MultiTermToken mtt = new MultiTermToken(token.get(0));
                for (String term : token.subList(1, token.size())) {
                    mtt.add(term);
                }
                mtts.addMultiTermToken(mtt);
            } catch (CorpusDataException cde) {
                this.addError(cde.getErrorCode(), cde.getMessage());
            }
        }

        // Foundry and tokenization information is stored as separate
        // document fields in addition to the term vector.
        if (field.containsKey("foundries")) {
            // TODO: Do not store positions!
            String foundries = (String) field.get("foundries");
            this.addKeyword("foundries", foundries);
            super.setFoundries(foundries);
        }
        if (field.containsKey("tokenization")) {
            String tokenization = (String) field.get("tokenization");
            this.addString("tokenization", tokenization);
            super.setTokenization(tokenization);
        }

        this.addTV(fieldName, this.getPrimaryData(), mtts);
    }
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 4 with MultiTermToken

use of de.ids_mannheim.korap.index.MultiTermToken in project Krill by KorAP.

In the class FieldDocument, method setData.

/**
 * Deserialize token stream data.
 *
 * <p>Reads the primary data from {@code "text"}, the field name from
 * {@code "name"} and the tokens from {@code "stream"}, a list of tokens
 * where each token is a list of term strings. The first term of a token
 * creates the {@link MultiTermToken}; the remaining terms are added to it.
 * The optional entries {@code "foundries"}, {@code "layerInfos"} and
 * {@code "tokenSource"} are passed to the matching setters when present.
 *
 * <p>A {@link CorpusDataException} raised for a token is recorded through
 * {@code addError} and that token is left out of the stream. The token
 * lists inside {@code node} are only read, never modified.
 *
 * @param node the deserialized data node
 */
public void setData(Map<String, Object> node) {
    this.setPrimaryData((String) node.get("text"));
    String fieldName = (String) node.get("name");
    MultiTermTokenStream mtts = this.newMultiTermTokenStream();

    // The map is untyped, so this cast cannot be checked by the compiler.
    @SuppressWarnings("unchecked")
    ArrayList<ArrayList<String>> tokens =
            (ArrayList<ArrayList<String>>) node.get("stream");

    // Iterate over all tokens in stream
    for (ArrayList<String> token : tokens) {
        try {
            // Initialize MultiTermToken from the first term; read it in
            // place so the input list is not altered.
            MultiTermToken mtt = new MultiTermToken(token.get(0));
            // Add rest of the list
            for (String term : token.subList(1, token.size())) {
                mtt.add(term);
            }
            // Add MultiTermToken to stream
            mtts.addMultiTermToken(mtt);
        } catch (CorpusDataException cde) {
            this.addError(cde.getErrorCode(), cde.getMessage());
        }
    }

    // Add tokenstream to fielddocument
    this.addTV(fieldName, this.getPrimaryData(), mtts);

    // Get foundry info
    if (node.containsKey("foundries")) {
        this.setFoundries((String) node.get("foundries"));
    }
    // Get layer info
    if (node.containsKey("layerInfos")) {
        this.setLayerInfos((String) node.get("layerInfos"));
    }
    // Get tokenSource info
    if (node.containsKey("tokenSource")) {
        this.setTokenSource((String) node.get("tokenSource"));
    }
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Aggregations

MultiTermToken (de.ids_mannheim.korap.index.MultiTermToken): 4, MultiTermTokenStream (de.ids_mannheim.korap.index.MultiTermTokenStream): 3, CorpusDataException (de.ids_mannheim.korap.util.CorpusDataException): 3, Test (de.ids_mannheim.korap.Test): 1, Test (org.junit.Test): 1