Use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.
In the class Test, the method getTermVector:
public static MultiTermTokenStream getTermVector(String stream) {
    MultiTermTokenStream ts = new MultiTermTokenStream();
    for (String seg : stream.split(" ")) {
        String[] tokseg = seg.split("\\|");
        try {
            // Surface form ('s'), token marker ('T') and lowercased form ('i')
            MultiTermToken mtt = new MultiTermToken('s', tokseg[0]);
            mtt.add("T");
            mtt.add('i', tokseg[0].toLowerCase());

            // Part of speech ('p') and lemma ('l')
            mtt.add('p', tokseg[1]);
            mtt.add('l', tokseg[2]);

            // Optional ';'-separated morphological features ('m')
            if (tokseg.length == 4) {
                for (String morph : tokseg[3].split(";")) {
                    mtt.add('m', morph);
                }
            }

            // Optional fifth segment as 'e' annotation
            if (tokseg.length == 5) {
                mtt.add('e', tokseg[4]);
            }

            ts.addMultiTermToken(mtt);
        }
        catch (CorpusDataException cde) {
            fail(cde.getErrorCode() + ": " + cde.getMessage());
        }
    }
    return ts;
}
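A minimal usage sketch (not taken from the Krill test suite): the helper expects whitespace-separated segments of the form surface|POS|lemma, optionally followed by ';'-separated morphological features, as implied by the splitting logic above. The annotated sentence below is invented for illustration.

// Invented example input; segment order surface|POS|lemma[|morph;morph;...]
MultiTermTokenStream ts =
    getTermVector("Der|ART|der Baum|NN|baum steht|VVFIN|stehen|3;sg;pres");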
Use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.
In the class Test, the method addDoc:
public static void addDoc(IndexWriter w, Map<String, String> m) throws IOException {
    Document doc = new Document();

    String[] strInt = { "pubDate" };
    String[] strStr = { "id", "corpus", "pubPlace" };
    String[] strTxt = { "title", "subtitle", "textClass" };

    // Text fields
    for (String s : strTxt) {
        doc.add(new TextField(s, m.get(s), Field.Store.YES));
    }

    // String fields
    for (String s : strStr) {
        doc.add(new StringField(s, m.get(s), Field.Store.YES));
    }

    // Integer fields
    for (String s : strInt) {
        doc.add(new IntField(s, Integer.parseInt(m.get(s)), Field.Store.YES));
    }

    // Primary text field with full term vector information
    FieldType textFieldWithTermVectors = new FieldType(TextField.TYPE_STORED);
    textFieldWithTermVectors.setStoreTermVectors(true);
    textFieldWithTermVectors.setStoreTermVectorOffsets(true);
    textFieldWithTermVectors.setStoreTermVectorPositions(true);
    textFieldWithTermVectors.setStoreTermVectorPayloads(true);

    Field textFieldAnalyzed = new Field("text", m.get("textStr"), textFieldWithTermVectors);
    MultiTermTokenStream ts = getTermVector(m.get("text"));
    textFieldAnalyzed.setTokenStream(ts);
    doc.add(textFieldAnalyzed);

    // Add document to writer
    w.addDocument(doc);
}
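A hedged usage sketch, assuming an already opened Lucene IndexWriter w; the map keys mirror the fields read above, while all values are invented placeholders.

Map<String, String> m = new HashMap<>();
m.put("id", "doc-1");                        // string fields
m.put("corpus", "corpus-1");
m.put("pubPlace", "Mannheim");
m.put("pubDate", "20050328");                // parsed as an integer field
m.put("title", "Ein Titel");                 // text fields
m.put("subtitle", "Ein Untertitel");
m.put("textClass", "kultur");
m.put("textStr", "Der Baum");                // stored surface text
m.put("text", "Der|ART|der Baum|NN|baum");   // annotated stream for getTermVector
addDoc(w, m);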
Use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.
In the class FieldDocument, the method setFields:
/**
 * Deserialize token stream data (LEGACY).
 */
public void setFields(ArrayList<Map<String, Object>> fields) {
    Map<String, Object> primary = fields.remove(0);
    this.setPrimaryData((String) primary.get("primaryData"));

    for (Map<String, Object> field : fields) {
        String fieldName = (String) field.get("name");
        MultiTermTokenStream mtts = this.newMultiTermTokenStream();

        // Iterate over all tokens in the stream
        for (ArrayList<String> token : (ArrayList<ArrayList<String>>) field.get("data")) {
            try {
                // Initialize MultiTermToken with the first term
                MultiTermToken mtt = new MultiTermToken(token.remove(0));

                // Add the rest of the list
                for (String term : token) {
                    mtt.add(term);
                }

                // Add MultiTermToken to stream
                mtts.addMultiTermToken(mtt);
            }
            catch (CorpusDataException cde) {
                this.addError(cde.getErrorCode(), cde.getMessage());
            }
        }

        // TODO: Add these as meta fields to the tokenization term vector
        if (field.containsKey("foundries")) {
            // TODO: Do not store positions!
            String foundries = (String) field.get("foundries");
            this.addKeyword("foundries", foundries);
            super.setFoundries(foundries);
        }

        if (field.containsKey("tokenization")) {
            String tokenization = (String) field.get("tokenization");
            this.addString("tokenization", tokenization);
            super.setTokenization(tokenization);
        }

        this.addTV(fieldName, this.getPrimaryData(), mtts);
    }
}
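A hedged sketch of the legacy input this method deserializes, derived only from the keys accessed above (primaryData, name, data, foundries, tokenization); every concrete value is invented, and fd stands for any FieldDocument instance.

// First list entry carries the primary data, invented values throughout
ArrayList<Map<String, Object>> fields = new ArrayList<>();
Map<String, Object> primary = new HashMap<>();
primary.put("primaryData", "Der Baum");
fields.add(primary);

// One field entry with its token data (each token is a list of terms)
Map<String, Object> tokenField = new HashMap<>();
tokenField.put("name", "tokens");
ArrayList<ArrayList<String>> data = new ArrayList<>();
data.add(new ArrayList<String>(Arrays.asList("s:Der", "i:der", "p:ART", "l:der")));
data.add(new ArrayList<String>(Arrays.asList("s:Baum", "i:baum", "p:NN", "l:baum")));
tokenField.put("data", data);
tokenField.put("foundries", "base");           // invented value
tokenField.put("tokenization", "base#tokens"); // invented value
fields.add(tokenField);

fd.setFields(fields);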
Use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.
In the class FieldDocument, the method setData:
/**
 * Deserialize token stream data.
 */
public void setData(Map<String, Object> node) {
    this.setPrimaryData((String) node.get("text"));

    String fieldName = (String) node.get("name");
    MultiTermTokenStream mtts = this.newMultiTermTokenStream();

    // Iterate over all tokens in stream
    for (ArrayList<String> token : (ArrayList<ArrayList<String>>) node.get("stream")) {
        try {
            // Initialize MultiTermToken
            MultiTermToken mtt = new MultiTermToken(token.remove(0));

            // Add rest of the list
            for (String term : token) {
                mtt.add(term);
            }

            // Add MultiTermToken to stream
            mtts.addMultiTermToken(mtt);
        }
        catch (CorpusDataException cde) {
            this.addError(cde.getErrorCode(), cde.getMessage());
        }
    }

    // Add token stream to field document
    this.addTV(fieldName, this.getPrimaryData(), mtts);

    // Get foundry info
    if (node.containsKey("foundries"))
        this.setFoundries((String) node.get("foundries"));

    // Get layer info
    if (node.containsKey("layerInfos"))
        this.setLayerInfos((String) node.get("layerInfos"));

    // Get tokenSource info
    if (node.containsKey("tokenSource"))
        this.setTokenSource((String) node.get("tokenSource"));
}
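A hedged sketch of the token stream node this method expects, again derived only from the keys accessed above (text, name, stream, foundries, layerInfos, tokenSource); all concrete values are invented placeholders, and fd stands for any FieldDocument instance.

// Token stream node with invented example values
Map<String, Object> node = new HashMap<>();
node.put("text", "Der Baum");
node.put("name", "tokens");
ArrayList<ArrayList<String>> stream = new ArrayList<>();
stream.add(new ArrayList<String>(Arrays.asList("s:Der#0-3", "i:der", "p:ART", "l:der")));
stream.add(new ArrayList<String>(Arrays.asList("s:Baum#4-8", "i:baum", "p:NN", "l:baum")));
node.put("stream", stream);
node.put("foundries", "base");          // invented value
node.put("layerInfos", "base/s=spans"); // invented value
node.put("tokenSource", "base#tokens"); // invented value

fd.setData(node);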
Use of de.ids_mannheim.korap.index.MultiTermTokenStream in project Krill by KorAP.
In the class TestKrillIndex, the method indexExample:
/*
 * TODO: Currently fields can only be set if they are
 * part of the general field set.
 * This will change soon!
 */
@Test
public void indexExample() throws IOException {
    KrillIndex ki = new KrillIndex();
    assertEquals(0, ki.numberOf("base", "documents"));
    assertEquals(0, ki.numberOf("base", "tokens"));
    assertEquals(0, ki.numberOf("base", "sentences"));
    assertEquals(0, ki.numberOf("base", "paragraphs"));

    // First document: term vector given as an annotated string
    FieldDocument fd = new FieldDocument();
    fd.addString("name", "Peter");
    fd.addInt("zahl1", 56);
    fd.addInt("zahl2", "58");
    fd.addInt("zahl3", "059");
    fd.addInt("UID", 1);
    fd.addText("teaser", "Das ist der Name der Rose");
    fd.addTV("base", "ich bau",
             "[(0-3)s:ich|l:ich|p:PPER|-:sentences$<i>2]"
             + "[(4-7)s:bau|l:bauen|p:VVFIN]");
    ki.addDoc(fd);

    // Second document: term vector built with a MultiTermTokenStream
    fd = new FieldDocument();
    fd.addString("name", "Hans");
    fd.addInt("zahl1", 14);
    fd.addText("teaser", "Das Sein");
    fd.addInt("UID", 2);
    MultiTermTokenStream mtts = fd.newMultiTermTokenStream();
    mtts.addMultiTermToken("s:wir#0-3", "l:wir", "p:PPER");
    mtts.addMultiTermToken("s:sind#4-8", "l:sein", "p:VVFIN");
    mtts.addMeta("sentences", (int) 5);
    fd.addTV("base", "wir sind", mtts);
    ki.addDoc(fd);

    /* Save documents */
    ki.commit();

    assertEquals(2, ki.numberOf("base", "documents"));
    assertEquals(7, ki.numberOf("base", "sentences"));

    // Third document
    fd = new FieldDocument();
    fd.addString("name", "Frank");
    fd.addInt("zahl1", 59);
    fd.addInt("zahl2", 65);
    fd.addInt("UID", 3);
    fd.addText("teaser", "Noch ein Versuch");
    fd.addTV("base", "ich bau",
             "[(0-3)s:der|l:der|p:DET|-:sentences$<i>3]"
             + "[(4-8)s:baum|l:baum|p:NN]");
    ki.addDoc(fd);

    /* Save documents */
    ki.commit();

    assertEquals(3, ki.numberOf("base", "documents"));
    assertEquals(10, ki.numberOf("base", "sentences"));

    // KrillQuery kq = new KrillQuery("text");
    // ki.search();
    ki.getDoc("1");
}