Search in sources:

Example 1 with MultiTermTokenStream

use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.

the class Test method getTermVector.

/**
 * Builds a {@link MultiTermTokenStream} from a compact, space-separated test notation.
 *
 * <p>The input is split on single spaces into segments, and every segment is split on
 * {@code |} into parts. For each segment one {@link MultiTermToken} is created from
 * prefix {@code 's'} and the first part, and the following terms are added to it:
 * the literal {@code "T"}, the lower-cased first part under prefix {@code 'i'}, the
 * second part under {@code 'p'} and the third part under {@code 'l'}.
 *
 * <p>A segment with exactly four parts additionally contributes one {@code 'm'} term
 * per {@code ;}-separated entry of the fourth part. A segment with exactly five parts
 * contributes the fifth part as an {@code 'e'} term (its fourth part is not read).
 *
 * <p>A {@link CorpusDataException} raised while building a token is reported through
 * {@code fail(...)} (presumably the JUnit assertion helper - confirm against the
 * enclosing test class).
 *
 * @param stream the space-separated token notation
 * @return the token stream holding one multi-term token per segment
 */
public static MultiTermTokenStream getTermVector(String stream) {
    MultiTermTokenStream tokenStream = new MultiTermTokenStream();

    for (String segment : stream.split(" ")) {
        String[] parts = segment.split("\\|");

        try {
            String first = parts[0];

            MultiTermToken token = new MultiTermToken('s', first);
            token.add("T");
            token.add('i', first.toLowerCase());
            token.add('p', parts[1]);
            token.add('l', parts[2]);

            // Optional trailing parts are only honoured for these exact lengths.
            switch (parts.length) {
                case 4:
                    for (String entry : parts[3].split(";")) {
                        token.add('m', entry);
                    }
                    break;
                case 5:
                    token.add('e', parts[4]);
                    break;
                default:
                    break;
            }

            tokenStream.addMultiTermToken(token);
        } catch (CorpusDataException cde) {
            fail(cde.getErrorCode() + ": " + cde.getMessage());
        }
    }

    return tokenStream;
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 2 with MultiTermTokenStream

use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.

the class Test method addDoc.

/**
 * Builds a Lucene {@link Document} from the given key/value map and adds it to the writer.
 *
 * <p>The keys {@code title}, {@code subtitle} and {@code textClass} are added as stored
 * {@link TextField}s; {@code id}, {@code corpus} and {@code pubPlace} as stored
 * {@link StringField}s; {@code pubDate} is parsed with {@link Integer#parseInt(String)}
 * and added as a stored {@link IntField}. Finally a field named {@code text} is added
 * whose value is taken from key {@code textStr} and whose token stream is produced by
 * {@code getTermVector} from key {@code text}; term vectors with offsets, positions and
 * payloads are enabled for that field.
 *
 * @param w the index writer receiving the document
 * @param m the field values, keyed by the names listed above
 * @throws IOException declared for {@code w.addDocument(...)}
 */
public static void addDoc(IndexWriter w, Map<String, String> m) throws IOException {
    Document document = new Document();

    // Text fields
    for (String name : new String[] { "title", "subtitle", "textClass" }) {
        document.add(new TextField(name, m.get(name), Field.Store.YES));
    }

    // String fields
    for (String name : new String[] { "id", "corpus", "pubPlace" }) {
        document.add(new StringField(name, m.get(name), Field.Store.YES));
    }

    // Integer fields
    for (String name : new String[] { "pubDate" }) {
        int value = Integer.parseInt(m.get(name));
        document.add(new IntField(name, value, Field.Store.YES));
    }

    // Stored text field type with full term vector information switched on.
    FieldType termVectorType = new FieldType(TextField.TYPE_STORED);
    termVectorType.setStoreTermVectors(true);
    termVectorType.setStoreTermVectorOffsets(true);
    termVectorType.setStoreTermVectorPositions(true);
    termVectorType.setStoreTermVectorPayloads(true);

    // The field value comes from "textStr", its tokens from the "text" notation.
    Field annotatedText = new Field("text", m.get("textStr"), termVectorType);
    annotatedText.setTokenStream(getTermVector(m.get("text")));
    document.add(annotatedText);

    // Add document to writer
    w.addDocument(document);
}
Also used : StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IntField(org.apache.lucene.document.IntField) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 3 with MultiTermTokenStream

use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.

the class FieldDocument method setFields.

/**
 * Deserialize token stream data (LEGACY).
 *
 * <p>The first map in {@code fields} is removed from the list and its
 * {@code primaryData} entry becomes the primary data of this document. Every
 * remaining map describes one field: {@code name} is the field name and
 * {@code data} is a list of tokens, each token being a list of term strings.
 * The first term of a token is removed from its list and used to construct the
 * {@link MultiTermToken}; the remaining terms are added to it.
 *
 * <p>Note that this method mutates its input: it removes the first element of
 * {@code fields} and the first element of every token list.
 *
 * <p>A {@link CorpusDataException} raised for a token is recorded via
 * {@code addError} and that token is not added to the stream.
 *
 * @param fields the legacy field descriptions; the first entry carries the primary data
 */
public void setFields(ArrayList<Map<String, Object>> fields) {
    // The leading entry carries the primary data and is consumed here.
    Map<String, Object> head = fields.remove(0);
    this.setPrimaryData((String) head.get("primaryData"));

    for (Map<String, Object> entry : fields) {
        String name = (String) entry.get("name");
        MultiTermTokenStream tokenStream = this.newMultiTermTokenStream();

        for (ArrayList<String> terms : (ArrayList<ArrayList<String>>) entry.get("data")) {
            try {
                // The first term initializes the token, the rest are appended.
                MultiTermToken multiTermToken = new MultiTermToken(terms.remove(0));
                for (String term : terms) {
                    multiTermToken.add(term);
                }
                tokenStream.addMultiTermToken(multiTermToken);
            } catch (CorpusDataException cde) {
                this.addError(cde.getErrorCode(), cde.getMessage());
            }
        }

        // Foundry and tokenization information, when present on the field,
        // is registered on the document before the term vector is added.
        if (entry.containsKey("foundries")) {
            // TODO: Do not store positions!
            String foundryList = (String) entry.get("foundries");
            this.addKeyword("foundries", foundryList);
            super.setFoundries(foundryList);
        }

        if (entry.containsKey("tokenization")) {
            String tokenizationInfo = (String) entry.get("tokenization");
            this.addString("tokenization", tokenizationInfo);
            super.setTokenization(tokenizationInfo);
        }

        this.addTV(name, this.getPrimaryData(), tokenStream);
    }
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 4 with MultiTermTokenStream

use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.

the class FieldDocument method setData.

/**
 * Deserialize token stream data.
 *
 * <p>Reads {@code text} as the primary data, {@code name} as the field name and
 * {@code stream} as a list of tokens, each token being a list of term strings.
 * The first term of a token is removed from its list and used to construct the
 * {@link MultiTermToken}; the remaining terms are added to it. The resulting
 * token stream is attached to this document via {@code addTV}.
 *
 * <p>Afterwards the optional keys {@code foundries}, {@code layerInfos} and
 * {@code tokenSource} are copied to the corresponding setters when present.
 *
 * <p>Note that the first element of every token list in {@code stream} is removed.
 * A {@link CorpusDataException} raised for a token is recorded via {@code addError}
 * and that token is not added to the stream.
 *
 * @param node the deserialized data node
 */
public void setData(Map<String, Object> node) {
    this.setPrimaryData((String) node.get("text"));

    String name = (String) node.get("name");
    MultiTermTokenStream tokenStream = this.newMultiTermTokenStream();

    // Walk through every token of the serialized stream
    for (ArrayList<String> terms : (ArrayList<ArrayList<String>>) node.get("stream")) {
        try {
            // The leading term initializes the token ...
            MultiTermToken multiTermToken = new MultiTermToken(terms.remove(0));

            // ... and all remaining terms are appended to it
            for (String term : terms) {
                multiTermToken.add(term);
            }

            tokenStream.addMultiTermToken(multiTermToken);
        } catch (CorpusDataException cde) {
            this.addError(cde.getErrorCode(), cde.getMessage());
        }
    }

    // Attach the token stream to this field document
    this.addTV(name, this.getPrimaryData(), tokenStream);

    // Foundry information
    if (node.containsKey("foundries")) {
        this.setFoundries((String) node.get("foundries"));
    }

    // Layer information
    if (node.containsKey("layerInfos")) {
        this.setLayerInfos((String) node.get("layerInfos"));
    }

    // Token source information
    if (node.containsKey("tokenSource")) {
        this.setTokenSource((String) node.get("tokenSource"));
    }
}
Also used : CorpusDataException(de.ids_mannheim.korap.util.CorpusDataException) MultiTermToken(de.ids_mannheim.korap.index.MultiTermToken) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream)

Example 5 with MultiTermTokenStream

use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.

the class TestKrillIndex method indexExample.

/*
 * TODO: Currently fields can only be set if they are
 * part of the general field set.
 * This will change soon!
 *
 * Indexes three documents in two commits and checks the
 * document and sentence counts reported for the "base" field
 * after each commit.
 */
@Test
public void indexExample() throws IOException {
    KrillIndex index = new KrillIndex();

    // A fresh index reports zero for every counter.
    assertEquals(0, index.numberOf("base", "documents"));
    assertEquals(0, index.numberOf("base", "tokens"));
    assertEquals(0, index.numberOf("base", "sentences"));
    assertEquals(0, index.numberOf("base", "paragraphs"));

    // First document: term vector given in bracket notation.
    FieldDocument first = new FieldDocument();
    first.addString("name", "Peter");
    first.addInt("zahl1", 56);
    first.addInt("zahl2", "58");
    first.addInt("zahl3", "059");
    first.addInt("UID", 1);
    first.addText("teaser", "Das ist der Name der Rose");
    first.addTV("base", "ich bau",
            "[(0-3)s:ich|l:ich|p:PPER|-:sentences$<i>2]"
            + "[(4-7)s:bau|l:bauen|p:VVFIN]");
    index.addDoc(first);

    // Second document: term vector built through a MultiTermTokenStream.
    FieldDocument second = new FieldDocument();
    second.addString("name", "Hans");
    second.addInt("zahl1", 14);
    second.addText("teaser", "Das Sein");
    second.addInt("UID", 2);
    MultiTermTokenStream tokenStream = second.newMultiTermTokenStream();
    tokenStream.addMultiTermToken("s:wir#0-3", "l:wir", "p:PPER");
    tokenStream.addMultiTermToken("s:sind#4-8", "l:sein", "p:VVFIN");
    tokenStream.addMeta("sentences", 5);
    second.addTV("base", "wir sind", tokenStream);
    index.addDoc(second);

    /* Save documents */
    index.commit();
    assertEquals(2, index.numberOf("base", "documents"));
    assertEquals(7, index.numberOf("base", "sentences"));

    // Third document, added after the first commit.
    FieldDocument third = new FieldDocument();
    third.addString("name", "Frank");
    third.addInt("zahl1", 59);
    third.addInt("zahl2", 65);
    third.addInt("UID", 3);
    third.addText("teaser", "Noch ein Versuch");
    third.addTV("base", "ich bau",
            "[(0-3)s:der|l:der|p:DET|-:sentences$<i>3]"
            + "[(4-8)s:baum|l:baum|p:NN]");
    index.addDoc(third);

    /* Save documents */
    index.commit();
    assertEquals(3, index.numberOf("base", "documents"));
    assertEquals(10, index.numberOf("base", "sentences"));

    // KrillQuery kq = new KrillQuery("text");
    // ki.search();
    index.getDoc("1");
}
Also used : FieldDocument(de.ids_mannheim.korap.index.FieldDocument) KrillIndex(de.ids_mannheim.korap.KrillIndex) MultiTermTokenStream(de.ids_mannheim.korap.index.MultiTermTokenStream) Test(org.junit.Test)

Aggregations

MultiTermTokenStream (de.ids_mannheim.korap.index.MultiTermTokenStream): 5; MultiTermToken (de.ids_mannheim.korap.index.MultiTermToken): 3; CorpusDataException (de.ids_mannheim.korap.util.CorpusDataException): 3; KrillIndex (de.ids_mannheim.korap.KrillIndex): 1; FieldDocument (de.ids_mannheim.korap.index.FieldDocument): 1; Document (org.apache.lucene.document.Document): 1; Field (org.apache.lucene.document.Field): 1; FieldType (org.apache.lucene.document.FieldType): 1; IntField (org.apache.lucene.document.IntField): 1; StringField (org.apache.lucene.document.StringField): 1; TextField (org.apache.lucene.document.TextField): 1; Test (org.junit.Test): 1