Use of de.ids_mannheim.korap.index.FieldDocument in project Krill by KorAP.
The class TestKrill, method searchJSONmultitermRewriteBug.
/**
 * Regression test for the multi-term rewrite bug.
 *
 * Indexes a single BZK document, loads the serialized query
 * {@code [tt/p="A.*"]{0,3}[tt/p="N.*"]} from
 * {@code /queries/bugs/multiterm_rewrite.jsonld}, widens the query's
 * collection so that it also accepts {@code corpusSigle = BZK}, and then
 * checks the rewritten span query, the number of hits and a handful of
 * snippets.
 *
 * @throws IOException declared for the index and resource helpers used here
 */
@Test
public void searchJSONmultitermRewriteBug() throws IOException {
    // Construct index
    KrillIndex ki = new KrillIndex();
    assertEquals(0, ki.numberOf("documents"));

    // Indexing test files
    FieldDocument fd = ki.addDoc(1,
            getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
    ki.commit();
    assertEquals(1, ki.numberOf("documents"));
    assertEquals("BZK", fd.getCorpusSigle());

    // [tt/p="A.*"]{0,3}[tt/p="N.*"]
    String json = getJsonString(getClass()
            .getResource("/queries/bugs/multiterm_rewrite.jsonld").getFile());
    Krill ks = new Krill(json);
    KrillCollection kc = ks.getCollection();

    // No index was set
    assertEquals(-1, kc.numberOf("documents"));
    kc.setIndex(ki);

    // Index was set but vc restricted to WPD
    assertEquals(0, kc.numberOf("documents"));

    // OR the collection that came with the query with corpusSigle=BZK,
    // so the single indexed document becomes part of the collection.
    CollectionBuilder cb = new CollectionBuilder();
    kc.fromBuilder(cb.orGroup()
            .with(kc.getBuilder())
            .with(cb.term("corpusSigle", "BZK")));
    ks.setCollection(kc);
    assertEquals(1, kc.numberOf("documents"));

    Result kr = ks.apply(ki);

    // The {0,3} repetition is expected to be rewritten into an alternative
    // of the bare N.* term and a {1,3} repetition followed by N.*.
    assertEquals(
            "spanOr([SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/), "
                    + "spanNext(spanRepetition(SpanMultiTermQueryWrapper"
                    + "(tokens:/tt/p:A.*/){1,3}), "
                    + "SpanMultiTermQueryWrapper(tokens:/tt/p:N.*/))])",
            kr.getSerialQuery());
    assertEquals(58, kr.getTotalResults());
    assertEquals(0, kr.getStartIndex());

    assertEquals("[[Saragat-Partei]] zerfällt Rom (ADN) die von dem ...",
            kr.getMatch(0).getSnippetBrackets());
    assertEquals("[[Saragat-Partei]] zerfällt Rom (ADN) die von dem ...",
            kr.getMatch(1).getSnippetBrackets());
    assertEquals(
            "Saragat-Partei zerfällt [[Rom]] (ADN) "
                    + "die von dem Rechtssozialisten Saragat ...",
            kr.getMatch(2).getSnippetBrackets());
    assertEquals(
            "Saragat-Partei zerfällt Rom ([[ADN]]) "
                    + "die von dem Rechtssozialisten Saragat geführte ...",
            kr.getMatch(3).getSnippetBrackets());
    assertEquals(
            "... dem Namen \"Einheitsbewegung der sozialistischen "
                    + "Initiative\" [[eine neue politische Gruppierung]] "
                    + "ins Leben gerufen hatten. Pressemeldungen zufolge ...",
            kr.getMatch(23).getSnippetBrackets());
}
Use of de.ids_mannheim.korap.index.FieldDocument in project Krill by KorAP.
The class TestKrill, method searchJSONcosmasBoundaryBug.
/**
 * Regression test for the COSMAS boundary bug.
 *
 * Indexes a single BZK document and runs three queries against it:
 * a focus on class 1 inside a {@code base/s:s} element, the same query
 * with class 129, and finally the serialized query stored in
 * {@code /queries/bugs/cosmas_boundary.jsonld}. For each query the
 * serialized span query and the first snippet are checked.
 *
 * @throws IOException declared for the index and resource helpers used here
 */
@Test
public void searchJSONcosmasBoundaryBug() throws IOException {
    // Construct index
    KrillIndex ki = new KrillIndex();

    // Indexing test files (the returned document is not needed here)
    ki.addDoc(1, getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
    ki.commit();

    QueryBuilder kq = new QueryBuilder("tokens");

    // Class 1: the class shows up as {1:...} in the bracket snippet
    Krill ks = new Krill(kq.focus(1,
            kq.contains(kq.tag("base/s:s"), kq.nr(1, kq.seg("s:Leben")))));
    Result kr = ks.apply(ki);
    assertEquals(
            "focus(1: spanContain(<tokens:base/s:s />, {1: tokens:s:Leben}))",
            kr.getSerialQuery());
    assertEquals(40, kr.getMatch(0).getStartPos());
    assertEquals(41, kr.getMatch(0).getEndPos());
    assertEquals(
            "... Initiative\" eine neue politische Gruppierung ins "
                    + "[[{1:Leben}]] gerufen hatten. Pressemeldungen zufolge haben sich ...",
            kr.getMatch(0).getSnippetBrackets());

    // Try with high class - don't highlight
    ks = new Krill(kq.focus(129,
            kq.contains(kq.tag("base/s:s"), kq.nr(129, kq.seg("s:Leben")))));
    kr = ks.apply(ki);
    assertEquals(
            "focus(129: spanContain(<tokens:base/s:s />, {129: tokens:s:Leben}))",
            kr.getSerialQuery());
    assertEquals(
            "... Initiative\" eine neue politische Gruppierung ins "
                    + "[[Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ...",
            kr.getMatch(0).getSnippetBrackets());

    // Finally run the serialized query that originally exposed the bug
    String json = getJsonString(getClass()
            .getResource("/queries/bugs/cosmas_boundary.jsonld").getFile());
    ks = new Krill(json);
    kr = ks.apply(ki);
    assertEquals(
            "focus(129: spanElementDistance({129: tokens:s:Namen}, "
                    + "{129: tokens:s:Leben}, [(base/s:s[0:1], notOrdered, notExcluded)]))",
            kr.getSerialQuery());
    assertEquals(
            "... ihren Austritt erklärt und unter dem [[Namen \"Einheitsbewegung "
                    + "der sozialistischen Initiative\" eine neue politische Gruppierung "
                    + "ins Leben]] gerufen hatten. Pressemeldungen zufolge haben sich ...",
            kr.getMatch(0).getSnippetBrackets());
    assertEquals(1, kr.getTotalResults());
    assertEquals(0, kr.getStartIndex());
}
Use of de.ids_mannheim.korap.index.FieldDocument in project Krill by KorAP.
The class TestKrill, method searchJSONnewJSON2.
/**
 * Indexes a single BZK document, verifies the metadata exposed by the
 * returned {@link FieldDocument} getter by getter, and then runs a
 * segment query combining {@code mate/m:case:nom} with
 * {@code mate/m:number:sg} to check the result paging values.
 *
 * @throws IOException declared for the index helpers used here
 */
@Test
public void searchJSONnewJSON2() throws IOException {
    // Construct index
    KrillIndex ki = new KrillIndex();

    // Indexing test files
    FieldDocument fd = ki.addDoc(1,
            getClass().getResourceAsStream("/bzk/D59-00089.json.gz"), true);
    ki.commit();

    // Identifiers
    assertEquals(1, fd.getUID());
    assertEquals("BZK_D59.00089", fd.getTextSigle());
    assertEquals("BZK_D59", fd.getDocSigle());
    assertEquals("BZK", fd.getCorpusSigle());

    // Text level metadata
    assertEquals("Saragat-Partei zerfällt", fd.getTitle());
    assertEquals("19590219", fd.getPubDate().toString());
    assertNull(fd.getSubTitle());
    assertNull(fd.getAuthor());
    assertNull(fd.getEditor());
    assertEquals("Berlin", fd.getPubPlace());
    assertNull(fd.getPublisher());
    assertEquals("Zeitung: Tageszeitung", fd.getTextType());
    assertNull(fd.getTextTypeArt());
    assertEquals("Tageszeitung", fd.getTextTypeRef());
    assertEquals("Politik", fd.getTextDomain());
    assertEquals("19590219", fd.getCreationDate().toString());
    assertEquals("ACA-NC-LC", fd.getLicense());
    assertEquals("POLITIK", fd.getTextColumn());
    // NOTE(review): the pages check was already disabled; reason not visible here.
    // assertNull(fd.getPages());
    assertEquals("politik ausland", fd.getTextClass());
    assertNull(fd.getFileEditionStatement());
    assertNull(fd.getBiblEditionStatement());
    assertEquals("de", fd.getLanguage());
    assertEquals(
            "Neues Deutschland, [Tageszeitung], 19.02.1959, Jg. 14,"
                    + " Berliner Ausgabe, S. 7. - Sachgebiet: Politik, "
                    + "Originalressort: POLITIK; Saragat-Partei zerfällt",
            fd.getReference());
    assertNull(fd.getKeywords());

    // Annotation metadata
    assertEquals("opennlp#tokens", fd.getTokenSource());
    assertEquals(
            "base base/paragraphs base/sentences corenlp "
                    + "corenlp/constituency corenlp/morpho corenlp/namedentities"
                    + " corenlp/sentences glemm glemm/morpho mate mate/morpho"
                    + " opennlp opennlp/morpho opennlp/sentences treetagger"
                    + " treetagger/morpho treetagger/sentences",
            fd.getFoundries());
    assertEquals(
            "base/s=spans corenlp/c=spans corenlp/ne=tokens"
                    + " corenlp/p=tokens corenlp/s=spans glemm/l=tokens"
                    + " mate/l=tokens mate/m=tokens mate/p=tokens"
                    + " opennlp/p=tokens opennlp/s=spans tt/l=tokens"
                    + " tt/p=tokens tt/s=spans",
            fd.getLayerInfos());

    // Corpus level metadata
    assertEquals("Bonner Zeitungskorpus", fd.getCorpusTitle());
    assertNull(fd.getCorpusSubTitle());
    assertNull(fd.getCorpusAuthor());
    assertNull(fd.getCorpusEditor());

    // Document level metadata
    assertEquals("Neues Deutschland", fd.getDocTitle());
    assertEquals(
            "Organ des Zentralkomitees der Sozialistischen "
                    + "Einheitspartei Deutschlands",
            fd.getDocSubTitle());
    assertNull(fd.getDocEditor());
    assertNull(fd.getDocAuthor());

    // Search on the freshly indexed document
    Krill ks = new Krill(new QueryBuilder("tokens")
            .seg("mate/m:case:nom").with("mate/m:number:sg"));
    Result kr = ks.apply(ki);
    assertEquals(6, kr.getTotalResults());
    assertEquals(0, kr.getStartIndex());
    assertEquals(25, kr.getItemsPerPage());
}
Use of de.ids_mannheim.korap.index.FieldDocument in project Krill by KorAP.
The class TestResult, method checkJSONResult.
/**
 * Checks the JSON serialization of a {@link Result}.
 *
 * Two small documents ("abab" and "aba") are indexed in the field
 * {@code base}. The query is {@code s:a} (class 1) OR {@code s:b}
 * (class 2), and the response produced by {@code toJsonString()} is
 * parsed again and inspected: meta information, first match and last
 * match.
 *
 * @throws Exception if indexing, searching or JSON parsing fails
 */
@Test
public void checkJSONResult() throws Exception {
    KrillIndex ki = new KrillIndex();

    FieldDocument fd = new FieldDocument();
    fd.addString("ID", "doc-1");
    fd.addString("UID", "1");
    fd.addTV("base", "abab",
            "[(0-1)s:a|i:a|_0#0-1|-:t$<i>4]"
                    + "[(1-2)s:b|i:b|_1#1-2]"
                    + "[(2-3)s:a|i:c|_2#2-3]"
                    + "[(3-4)s:b|i:a|_3#3-4]");
    ki.addDoc(fd);

    fd = new FieldDocument();
    fd.addString("ID", "doc-2");
    fd.addString("UID", "2");
    fd.addTV("base", "aba",
            "[(0-1)s:a|i:a|_0#0-1|-:t$<i>3]"
                    + "[(1-2)s:b|i:b|_1#1-2]"
                    + "[(2-3)s:a|i:c|_2#2-3]");
    ki.addDoc(fd);

    // Commit!
    ki.commit();

    QueryBuilder kq = new QueryBuilder("base");
    SpanQuery q = (SpanQuery) kq.or(kq.nr(1, kq.seg("s:a")))
            .or(kq.nr(2, kq.seg("s:b"))).toQuery();
    Result kr = ki.search(q);
    assertEquals(7L, kr.getTotalResults());

    ObjectMapper mapper = new ObjectMapper();
    JsonNode res = mapper.readTree(kr.toJsonString());

    // Meta
    assertEquals(7, res.at("/meta/totalResults").asInt());
    assertEquals("spanOr([{1: base:s:a}, {2: base:s:b}])",
            res.at("/meta/serialQuery").asText());
    // Read from /meta like the neighbouring fields. The original used
    // "/startIndex"; if that path is absent, asInt() yields 0 and the
    // check could never fail.
    // NOTE(review): confirm that startIndex is serialized below /meta.
    assertEquals(0, res.at("/meta/startIndex").asInt());
    assertEquals(25, res.at("/meta/itemsPerPage").asInt());
    assertEquals("token", res.at("/meta/context/left/0").asText());
    assertEquals(6, res.at("/meta/context/left/1").asInt());
    assertEquals("token", res.at("/meta/context/right/0").asText());
    assertEquals(6, res.at("/meta/context/right/1").asInt());

    // First match
    assertEquals("base", res.at("/matches/0/field").asText());
    /*
      Probably a Jackson bug
      assertTrue(res.at("/matches/0/startMore").asBoolean());
      assertTrue(res.at("/matches/0/endMore").asBoolean());
    */
    assertEquals(1, res.at("/matches/0/UID").asInt());
    assertEquals("doc-1", res.at("/matches/0/docID").asText());
    assertEquals("match-doc-1-p0-1(1)0-0",
            res.at("/matches/0/matchID").asText());
    assertEquals(
            "<span class=\"context-left\"></span><span class=\"match\"><mark><mark class=\"class-1 level-0\">a</mark></mark></span><span class=\"context-right\">bab</span>",
            res.at("/matches/0/snippet").asText());

    // Last match
    assertEquals("base", res.at("/matches/6/field").asText());
    /*
      Probably a Jackson bug
      assertEquals(true, res.at("/matches/6/startMore").asBoolean());
      assertEquals(true, res.at("/matches/6/endMore").asBoolean());
    */
    assertEquals(2, res.at("/matches/6/UID").asInt());
    assertEquals("doc-2", res.at("/matches/6/docID").asText());
    assertEquals("match-doc-2-p2-3(1)2-2",
            res.at("/matches/6/matchID").asText());
    assertEquals(
            "<span class=\"context-left\">ab</span><span class=\"match\"><mark><mark class=\"class-1 level-0\">a</mark></mark></span><span class=\"context-right\"></span>",
            res.at("/matches/6/snippet").asText());
}
Use of de.ids_mannheim.korap.index.FieldDocument in project Krill by KorAP.
The class TestResult, method checkJSONResultForJSONInput.
/**
 * Checks the JSON serialization of a {@link Result} when the request
 * itself was given as JSON.
 *
 * Two small documents ("abab" and "aba") are indexed in the field
 * {@code tokens}. The query is read from
 * {@code /queries/bsp-result-check.jsonld}. The parsed response is
 * checked for the meta section, the echoed query and the first match,
 * including that no {@code primaryData} is serialized for a match.
 *
 * @throws Exception if indexing, searching or JSON parsing fails
 */
@Test
public void checkJSONResultForJSONInput() throws Exception {
    KrillIndex ki = new KrillIndex();

    FieldDocument fd = new FieldDocument();
    fd.addString("ID", "doc-1");
    fd.addString("UID", "1");
    fd.addTV("tokens", "abab",
            "[(0-1)s:a|i:a|_0#0-1|-:t$<i>4]"
                    + "[(1-2)s:b|i:b|_1#1-2]"
                    + "[(2-3)s:a|i:c|_2#2-3]"
                    + "[(3-4)s:b|i:a|_3#3-4]");
    ki.addDoc(fd);

    fd = new FieldDocument();
    fd.addString("ID", "doc-2");
    fd.addString("UID", "2");
    fd.addTV("tokens", "aba",
            "[(0-1)s:a|i:a|_0#0-1|-:t$<i>3]"
                    + "[(1-2)s:b|i:b|_1#1-2]"
                    + "[(2-3)s:a|i:c|_2#2-3]");
    ki.addDoc(fd);

    // Commit!
    ki.commit();

    String json = getString(getClass()
            .getResource("/queries/bsp-result-check.jsonld").getFile());
    Krill ks = new Krill(json);
    Result kr = ks.apply(ki);
    assertEquals(7L, kr.getTotalResults());

    ObjectMapper mapper = new ObjectMapper();
    JsonNode res = mapper.readTree(kr.toJsonString());

    // Meta
    assertEquals(7, res.at("/meta/totalResults").asInt());
    assertEquals("spanOr([tokens:s:a, tokens:s:b])",
            res.at("/meta/serialQuery").asText());
    assertEquals(5, res.at("/meta/itemsPerPage").asInt());
    assertEquals(0, res.at("/meta/startIndex").asInt());

    // Request meta
    // assertEquals(1, res.at("/meta/startPage").asInt());
    assertEquals(5, res.at("/meta/count").asInt());
    // Context: 3 tokens to the left, 6 characters to the right
    assertEquals("token", res.at("/meta/context/left/0").asText());
    assertEquals(3, res.at("/meta/context/left/1").asInt());
    assertEquals("char", res.at("/meta/context/right/0").asText());
    assertEquals(6, res.at("/meta/context/right/1").asInt());

    // Query
    assertEquals("koral:group", res.at("/query/@type").asText());
    assertEquals("operation:or", res.at("/query/operation").asText());

    assertEquals("koral:token", res.at("/query/operands/0/@type").asText());
    assertEquals("koral:term", res.at("/query/operands/0/wrap/@type").asText());
    assertEquals("orth", res.at("/query/operands/0/wrap/layer").asText());
    assertEquals("a", res.at("/query/operands/0/wrap/key").asText());
    assertEquals("match:eq", res.at("/query/operands/0/wrap/match").asText());

    assertEquals("koral:token", res.at("/query/operands/1/@type").asText());
    assertEquals("koral:term", res.at("/query/operands/1/wrap/@type").asText());
    assertEquals("orth", res.at("/query/operands/1/wrap/layer").asText());
    assertEquals("b", res.at("/query/operands/1/wrap/key").asText());
    assertEquals("match:eq", res.at("/query/operands/1/wrap/match").asText());

    // Matches
    assertEquals(1, res.at("/matches/0/UID").asInt());
    assertEquals("doc-1", res.at("/matches/0/docID").asText());
    assertEquals("match-doc-1-p0-1", res.at("/matches/0/matchID").asText());
    assertEquals(
            "<span class=\"context-left\"></span><span class=\"match\"><mark>a</mark></span><span class=\"context-right\">bab</span>",
            res.at("/matches/0/snippet").asText());

    // No primaryData serialization
    assertTrue(res.at("/matches/0/primaryData").isMissingNode());
}
Aggregations