use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestKrillIndex method indexUnicode.
/*
 * Indexes a single document whose "base" field contains the same word in
 * two Unicode spellings, then searches for each spelling on its own.
 * Each search is expected to yield exactly one hit.
 */
@Test
public void indexUnicode() throws IOException, QueryException {
    // The two spellings are canonically equivalent; per the original
    // author's note they are nevertheless indexed as byte sequences.
    // First spelling: "n" followed by the combining tilde U+0303.
    final String decomposed = "ju\u006E\u0303o";
    // Second spelling: the single precomposed code point U+00F1.
    final String precomposed = "ju\u00F1o";

    KrillIndex index = new KrillIndex();
    FieldDocument doc = new FieldDocument();
    doc.addString("name", "Peter");
    doc.addTV("base",
            decomposed + " " + precomposed,
            "[(0-5)s:" + decomposed + "|_0$<i>0<i>5|-:t$<i>2]"
                    + "[(6-10)s:" + precomposed + "|_1$<i>6<i>10]");
    index.addDoc(doc);
    index.commit();
    assertEquals(1, index.numberOf("base", "documents"));

    QueryBuilder builder = new QueryBuilder("base");

    // Searching for the precomposed spelling finds one token ...
    Result hits = index.search(builder.seg("s:" + precomposed).toQuery());
    assertEquals(1, hits.getTotalResults());

    // ... and so does searching for the decomposed spelling.
    hits = index.search(builder.seg("s:" + decomposed).toQuery());
    assertEquals(1, hits.getTotalResults());
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestRegexWildcardIndex method indexRegexCaseInsensitive.
/*
 * Runs regular-expression queries against a mixed-case document whose
 * tokens carry both a surface term (s:) and a lower-cased term (i:).
 * Queries built with the boolean flag set to true are expected to match
 * regardless of case; queries without it (or with false) match the
 * surface form only.
 */
@Test
public void indexRegexCaseInsensitive() throws Exception {
    final String text =
            "AfFe aFfFE Baum Baumgarten SteinGarten franZ HaNs Haus Efeu effe";
    final String annotation =
            "[(0-4)s:AfFe|i:affe|_0$<i>0<i>4|-:t$<i>10]"
                    + "[(5-10)s:aFfFE|i:afffe|_1$<i>5<i>10]"
                    + "[(11-15)s:Baum|i:baum|_2$<i>11<i>15]"
                    + "[(16-26)s:Baumgarten|i:baumgarten|_3$<i>16<i>26]"
                    + "[(27-38)s:SteinGarten|i:steingarten|_4$<i>27<i>38]"
                    + "[(39-44)s:franZ|i:franz|_5$<i>39<i>44]"
                    + "[(45-49)s:HaNs|i:hans|_6$<i>45<i>49]"
                    + "[(50-54)s:Haus|i:haus|_7$<i>50<i>54]"
                    + "[(55-59)s:Efeu|i:efeu|_8$<i>55<i>59]"
                    + "[(60-64)s:effe|i:effe|_9$<i>60<i>64]";

    KrillIndex index = new KrillIndex();
    FieldDocument doc = new FieldDocument();
    doc.addTV("base", text, annotation);
    index.addDoc(doc);
    index.commit();

    // Case-insensitive "s:Af*e": the serialized query shows the pattern
    // rewritten to the lower-cased i: term.
    QueryBuilder builder = new QueryBuilder("base");
    SpanQueryWrapper wrapper = builder.re("s:Af*e", true);
    assertEquals("SpanMultiTermQueryWrapper(base:/i:af*e/)",
            wrapper.toQuery().toString());
    Krill request = _newKrill(wrapper);
    Result hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("[[AfFe]] aFfFE ...", hits.getMatch(0).getSnippetBrackets());
    assertEquals("AfFe [[aFfFE]] Baum ...", hits.getMatch(1).getSnippetBrackets());

    // Without the flag only the exact-case surface form matches.
    request = _newKrill(new QueryBuilder("base").re("s:Af.*e"));
    hits = index.search(request);
    assertEquals(1L, hits.getTotalResults());
    assertEquals("[[AfFe]] aFfFE ...", hits.getMatch(0).getSnippetBrackets());

    // Case-insensitive prefix pattern.
    request = _newKrill(new QueryBuilder("base").re("s:baum.*", true));
    hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("... aFfFE [[Baum]] Baumgarten ...",
            hits.getMatch(0).getSnippetBrackets());
    assertEquals("... Baum [[Baumgarten]] SteinGarten ...",
            hits.getMatch(1).getSnippetBrackets());

    // Case-insensitive suffix pattern finds both "garten" compounds.
    request = _newKrill(new QueryBuilder("base").re("s:.*garten", true));
    hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("... Baum [[Baumgarten]] SteinGarten ...",
            hits.getMatch(0).getSnippetBrackets());
    assertEquals("... Baumgarten [[SteinGarten]] franZ ...",
            hits.getMatch(1).getSnippetBrackets());

    // The same suffix pattern with the flag explicitly false finds only
    // the compound written with a lower-case "garten".
    request = _newKrill(new QueryBuilder("base").re("s:.*garten", false));
    hits = index.search(request);
    assertEquals(1L, hits.getTotalResults());
    assertEquals("... Baum [[Baumgarten]] SteinGarten ...",
            hits.getMatch(0).getSnippetBrackets());

    // Single-character wildcard in the middle of the pattern.
    request = _newKrill(new QueryBuilder("base").re("s:ha.s", true));
    hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("... franZ [[HaNs]] Haus ...", hits.getMatch(0).getSnippetBrackets());
    assertEquals("... HaNs [[Haus]] Efeu ...", hits.getMatch(1).getSnippetBrackets());

    // Leading wildcard combined with a repeated "f".
    request = _newKrill(new QueryBuilder("base").re("s:.*f*e", true));
    hits = index.search(request);
    assertEquals(3L, hits.getTotalResults());
    assertEquals("[[AfFe]] aFfFE ...", hits.getMatch(0).getSnippetBrackets());
    assertEquals("AfFe [[aFfFE]] Baum ...", hits.getMatch(1).getSnippetBrackets());
    assertEquals("... Efeu [[effe]]", hits.getMatch(2).getSnippetBrackets());
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestRegexWildcardIndex method indexRegex.
/*
 * Runs a series of regular-expression queries against a small
 * lower-case document and checks, for each pattern, the number of hits
 * and the bracketed snippet of every match.
 */
@Test
public void indexRegex() throws Exception {
    final String text =
            "affe afffe baum baumgarten steingarten franz hans haus efeu effe";
    final String annotation =
            "[(0-4)s:affe|_0$<i>0<i>4|-:t$<i>10]"
                    + "[(5-10)s:afffe|_1$<i>5<i>10]"
                    + "[(11-15)s:baum|_2$<i>11<i>15]"
                    + "[(16-26)s:baumgarten|_3$<i>16<i>26]"
                    + "[(27-38)s:steingarten|_4$<i>27<i>38]"
                    + "[(39-44)s:franz|_5$<i>39<i>44]"
                    + "[(45-49)s:hans|_6$<i>45<i>49]"
                    + "[(50-54)s:haus|_7$<i>50<i>54]"
                    + "[(55-59)s:efeu|_8$<i>55<i>59]"
                    + "[(60-64)s:effe|_9$<i>60<i>64]";

    KrillIndex index = new KrillIndex();
    FieldDocument doc = new FieldDocument();
    doc.addTV("base", text, annotation);
    index.addDoc(doc);
    index.commit();

    QueryBuilder builder = new QueryBuilder("base");

    // Repetition operator; the serialized query keeps the s: term as is.
    SpanQueryWrapper wrapper = builder.re("s:af*e");
    assertEquals("SpanMultiTermQueryWrapper(base:/s:af*e/)",
            wrapper.toQuery().toString());
    Krill request = _newKrill(wrapper);
    Result hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("[[affe]] afffe ...", hits.getMatch(0).getSnippetBrackets());
    assertEquals("affe [[afffe]] baum ...", hits.getMatch(1).getSnippetBrackets());

    // Prefix pattern.
    request = _newKrill(builder.re("s:baum.*"));
    hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("... afffe [[baum]] baumgarten ...",
            hits.getMatch(0).getSnippetBrackets());
    assertEquals("... baum [[baumgarten]] steingarten ...",
            hits.getMatch(1).getSnippetBrackets());

    // Four or five arbitrary characters followed by "garten".
    request = _newKrill(builder.re("s:.....?garten"));
    hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("... baum [[baumgarten]] steingarten ...",
            hits.getMatch(0).getSnippetBrackets());
    assertEquals("... baumgarten [[steingarten]] franz ...",
            hits.getMatch(1).getSnippetBrackets());

    // Single-character wildcard in the middle of the pattern.
    request = _newKrill(builder.re("s:ha.s"));
    hits = index.search(request);
    assertEquals(2L, hits.getTotalResults());
    assertEquals("... franz [[hans]] haus ...", hits.getMatch(0).getSnippetBrackets());
    assertEquals("... hans [[haus]] efeu ...", hits.getMatch(1).getSnippetBrackets());

    // Infix pattern: any token containing "ff".
    request = _newKrill(builder.re("s:.*ff.*"));
    hits = index.search(request);
    assertEquals(3L, hits.getTotalResults());
    assertEquals("[[affe]] afffe ...", hits.getMatch(0).getSnippetBrackets());
    assertEquals("affe [[afffe]] baum ...", hits.getMatch(1).getSnippetBrackets());
    assertEquals("... efeu [[effe]]", hits.getMatch(2).getSnippetBrackets());
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestKrill method searchSchreibgebrauchData.
/**
 * Regression test for a Schreibgebrauch resource that did not work for
 * element queries.
 *
 * Indexes the document, runs a tag query for {@code base/s:s} on the
 * {@code tokens} field and checks the serialized span query, the hit
 * count, the snippet and the text sigle of the single match.
 *
 * @throws IOException
 *             declared for the indexing and search calls
 */
@Test
public void searchSchreibgebrauchData() throws IOException {
    // Construct index
    KrillIndex ki = new KrillIndex();

    // Indexing test files
    ki.addDoc(getClass().getResourceAsStream("/sgbr/BSP-2013-01-32.json.gz"), true);
    ki.commit();

    Krill k = new Krill(new QueryBuilder("tokens").tag("base/s:s"));

    // JUnit's assertEquals takes (expected, actual); keeping that order
    // makes a failure message report the right side as "expected".
    assertEquals("<tokens:base/s:s />", k.getSpanQuery().toString());

    Result kr = k.apply(ki);
    assertEquals(1, kr.getTotalResults());
    assertEquals("[[Selbst ist der Jeck]]", kr.getMatch(0).getSnippetBrackets());
    assertEquals("PRO-DUD_BSP-2013-01.32", kr.getMatch(0).getTextSigle());
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestKrill method searchJSONexpansionBug.
/**
 * This is a breaking test for #179.
 *
 * Runs the two expansion-bug queries first against an index holding
 * only document 00002, then against an index in which document 00001
 * precedes it. Every run must yield exactly one match, located in
 * WPD_AAA.00002, with the same snippet.
 *
 * @throws IOException
 *             declared for the indexing and search calls
 */
@Test
public void searchJSONexpansionBug() throws IOException {
    // Snippet that every expansion query below is expected to produce.
    final String expectedSnippet = "... Buchstabe des Alphabetes. In Dänemark ist "
            + "[[der alte Digraph Aa durch Å]] ersetzt worden, "
            + "in Eigennamen und Ortsnamen ...";

    // Construct index
    KrillIndex ki = new KrillIndex();
    // Indexing test files
    ki.addDoc(getClass().getResourceAsStream("/wiki/00002.json.gz"), true);
    ki.commit();

    // Expansion bug
    // der alte Digraph Aa durch Å
    String json = getJsonString(getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile());
    Result kr = new Krill(json).apply(ki);
    // The hit count is asserted before match 0 is read, so that an empty
    // result fails on the count rather than while fetching the match.
    assertEquals(1, kr.getTotalResults());
    assertEquals(expectedSnippet, kr.getMatch(0).getSnippetBrackets());
    assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());

    // TODO: base/s:t needs to be defined!!!
    QueryBuilder qb = new QueryBuilder("tokens");
    kr = new Krill(qb.tag("base/s:t")).apply(ki);
    assertEquals(1, kr.getTotalResults());

    // der alte Digraph Aa durch []
    // Works with one document
    json = getJsonString(getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile());
    kr = new Krill(json).apply(ki);
    // focus(254: spanContain(<tokens:base/s:t />, {254: spanNext(spanNext(spanNext(spanNext(tokens:s:der, tokens:s:alte), tokens:s:Digraph), tokens:s:Aa), spanExpansion(tokens:s:durch, []{1, 1}, right))}))
    assertEquals(1, kr.getTotalResults());
    assertEquals(expectedSnippet, kr.getMatch(0).getSnippetBrackets());
    assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());

    // Now try with one file ahead
    ki = new KrillIndex();
    for (String i : new String[] { "00001", "00002" }) {
        ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), true);
    }
    ki.commit();

    // Expansion bug
    // der alte Digraph Aa durch Å
    json = getJsonString(getClass().getResource("/queries/bugs/expansion_bug_2.jsonld").getFile());
    kr = new Krill(json).apply(ki);
    assertEquals(1, kr.getTotalResults());
    assertEquals(expectedSnippet, kr.getMatch(0).getSnippetBrackets());
    assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());

    // der alte Digraph Aa durch []
    json = getJsonString(getClass().getResource("/queries/bugs/expansion_bug.jsonld").getFile());
    kr = new Krill(json).apply(ki);
    assertEquals(1, kr.getTotalResults());
    assertEquals(expectedSnippet, kr.getMatch(0).getSnippetBrackets());
    assertEquals("WPD_AAA.00002", kr.getMatch(0).getDocID());
}
Aggregations