use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestFieldDocument method indexExample3.
/**
 * Indexes the bundled wiki sample documents and checks the bracket snippet
 * of the first hit for an adjective-noun sequence inside a sentence span.
 */
@Test
public void indexExample3() throws IOException {
    // Construct the index from the test resources
    KrillIndex index = new KrillIndex();
    final String[] docIds = { "00001", "00002", "00003", "00004", "00005", "00006", "02439" };
    for (String docId : docIds) {
        String resource = "/wiki/" + docId + ".json.gz";
        index.addDoc(getClass().getResourceAsStream(resource), true);
    }
    index.commit();

    // Query as built below:
    // within(<base/s:s>, {1: [mate/p:ADJA][opennlp/p:NN]})
    QueryBuilder qb = new QueryBuilder("tokens");
    Krill search = new Krill(
        qb.within(
            qb.tag("base/s:s"),
            qb.nr(1, qb.seq(qb.seg("mate/p:ADJA")).append(qb.seg("opennlp/p:NN")))));

    // Only one match is requested; the left context is measured in
    // characters, the right context in tokens (six units each).
    KrillMeta searchMeta = search.getMeta();
    searchMeta.setCount(1);
    searchMeta.setCutOff(true);
    searchMeta.getContext().left.setCharacter(true).setLength(6);
    searchMeta.getContext().right.setToken(true).setLength(6);

    String expectedSnippet = "... okal. [[Der Buchstabe A hat in {1:deutschen Texten} eine durchschnittliche Häufigkeit von 6,51 %.]] Er ist damit der sechsthäufigste Buchstabe ...";
    String actualSnippet = search.apply(index).getMatch(0).getSnippetBrackets();
    assertEquals(expectedSnippet, actualSnippet);
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestKrillCollectionIndex method filterExampleFromLegacy.
/**
 * Builds a virtual collection over the bundled wiki sample documents,
 * narrows it with filters, widens it again with extensions, and checks the
 * document, token, sentence and paragraph counts after each step. A search
 * restricted to the narrowed collection is checked as well.
 */
@Test
public void filterExampleFromLegacy() throws Exception {
    // Construct index
    KrillIndex ki = new KrillIndex();

    // Indexing test files
    for (String i : new String[] { "00001", "00002", "00003", "00004", "00005", "00006", "02439" }) {
        ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), true);
    }
    ki.commit();

    // Create the virtual collection; unrestricted it covers all seven documents
    KrillCollection kc = new KrillCollection(ki);
    assertEquals("Documents", 7, kc.numberOf("documents"));

    // Restrict to documents that carry both the textClass "reisen" and the
    // textClass "freizeit-unterhaltung".
    // (legacy API: kc.filter(kf.and("textClass", "reisen").and("textClass", "freizeit-unterhaltung")))
    kc.fromBuilder(kc.build().andGroup().with(kc.build().term("textClass", "reisen")).with(kc.build().term("textClass", "freizeit-unterhaltung")));
    assertEquals("Documents", 5, kc.numberOf("documents"));
    assertEquals("Tokens", 1678, kc.numberOf("tokens"));
    assertEquals("Sentences", 194, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));

    // Subset this to documents that also have the textClass "kultur".
    // (legacy API: kc.filter(kf.and("textClass", "kultur")); the same could be
    // expressed with fromBuilder(andGroup().with(kc.getBuilder()).with(term(...))))
    kc.filter(kc.build().term("textClass", "kultur"));
    assertEquals("Documents", 1, kc.numberOf("documents"));
    assertEquals("Tokens", 405, kc.numberOf("tokens"));
    assertEquals("Sentences", 75, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));

    // Filtering on corpusID "WPD" leaves the counts unchanged.
    // (legacy API: kc.filter(kf.and("corpusID", "WPD")))
    kc.filter(kc.build().term("corpusID", "WPD"));
    assertEquals("Documents", 1, kc.numberOf("documents"));
    assertEquals("Tokens", 405, kc.numberOf("tokens"));
    assertEquals("Sentences", 75, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));

    // Create a query and run it against the current collection.
    // (legacy API: ki.search(kc, query, 0, (short) 20, true, (short) 5, true, (short) 5))
    Krill ks = new Krill(new QueryBuilder("tokens").seg("opennlp/p:NN").with("tt/p:NN"));
    ks.setCollection(kc).getMeta().setStartIndex(0).setCount((short) 20).setContext(new SearchContext(true, (short) 5, true, (short) 5));
    Result kr = ks.apply(ki);

    // Expected value goes first so that a failure message reports the
    // numbers the right way round.
    assertEquals(70, kr.getTotalResults());

    // Extending with a textClass that no document has changes nothing
    kc.extend(kc.build().term("textClass", "uninteresting"));
    assertEquals("Documents", 1, kc.numberOf("documents"));

    // Extending with "wissenschaft" widens the collection again
    kc.extend(kc.build().term("textClass", "wissenschaft"));
    assertEquals("Documents", 3, kc.numberOf("documents"));
    assertEquals("Tokens", 1669, kc.numberOf("tokens"));
    assertEquals("Sentences", 188, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class KrillQuery method _termFromJson.
/**
 * Deserialize koral:term (and, with <code>isSpan</code> set, koral:span).
 *
 * The term string is assembled as
 * <code>[direction][foundry/][layer:][key][:value]</code>. Nodes that are
 * treated as terms become segment, regex or wildcard queries (optionally
 * negated via "match:ne"); everything else becomes an element (tag) query,
 * optionally combined with the node's "attr" definition.
 *
 * TODO: Not optimal as it does not respect non-term
 *
 * @param json      the serialized term or span node
 * @param isSpan    if true, the node is never treated as a term
 * @param direction optional relation direction whose value is prepended to
 *                  the term string; may be null
 * @return the query wrapper built for the node
 * @throws QueryException 701 if an "@type" is missing, 740 if neither a key
 *                        nor an attr is given, 741 if the match relation is
 *                        unknown
 */
private SpanQueryWrapper _termFromJson(JsonNode json, boolean isSpan, RelationDirection direction) throws QueryException {
    if (!json.has("@type")) {
        throw new QueryException(701, "JSON-LD group has no @type attribute");
    }
    final String nodeType = json.get("@type").asText();

    // A node without a usable key is only acceptable if it has an attr
    // (original note: why must it have an attr?)
    final boolean keyMissing = !json.has("key") || json.get("key").asText().length() < 1;
    if (keyMissing && !json.has("attr")) {
        throw new QueryException(740, "Key definition is missing in term or span");
    }

    // Empty koral:span hack: spans are never handled as terms
    final boolean isTerm = !isSpan && nodeType.equals("koral:term");

    // Case insensitivity: legacy boolean attribute first, flags array otherwise
    boolean ignoreCase = false;
    if (json.has("caseInsensitive") && json.get("caseInsensitive").asBoolean()) {
        ignoreCase = true;
    }
    else if (json.has("flags") && json.get("flags").isArray()) {
        for (Iterator<JsonNode> it = json.get("flags").elements(); it.hasNext();) {
            String flagName = it.next().asText();
            if (flagName.equals("flags:caseInsensitive")) {
                ignoreCase = true;
            }
            else {
                this.addWarning(748, "Flag is unknown", flagName);
            }
        }
    }

    StringBuilder term = new StringBuilder();
    if (direction != null) {
        term.append(direction.value());
    }

    String foundry = json.has("foundry") ? json.get("foundry").asText() : "";
    if (!foundry.isEmpty()) {
        term.append(foundry).append('/');
    }

    // No default foundry defined
    String layer = json.has("layer") ? json.get("layer").asText() : "";
    if (!layer.isEmpty()) {
        // Translate long layer names to their short prefixes
        if (layer.equals("lemma")) {
            layer = "l";
        }
        else if (layer.equals("pos")) {
            layer = "p";
        }
        else if (layer.equals("orth")) {
            // TODO: THIS IS AN UGLY HACK! AND SHOULD BE NAMED "SURFACE" or . OR *
            layer = ".";
        }
        else if (layer.equals("struct")) {
            layer = "s";
        }
        else if (layer.equals("const")) {
            layer = "c";
        }

        if (ignoreCase && isTerm) {
            if (layer.equals(".")) {
                layer = "i";
            }
            else {
                this.addWarning(767, "Case insensitivity is currently not supported for this layer");
            }
        }

        // Ignore foundry for orth layer: both surface forms discard
        // everything collected so far
        if (layer.equals(".")) {
            layer = "s";
            term.setLength(0);
        }
        else if (layer.equals("i")) {
            term.setLength(0);
        }

        term.append(layer).append(':');
    }

    if (!keyMissing) {
        String key = json.get("key").asText();
        term.append(ignoreCase ? key.toLowerCase() : key);
    }

    String termValue = json.has("value") ? json.get("value").asText() : "";
    if (!termValue.isEmpty()) {
        term.append(':').append(termValue);
    }

    // Everything that is not a term is an element, possibly with attributes
    if (!isTerm) {
        if (!json.has("attr")) {
            return this.builder().tag(term.toString());
        }

        JsonNode attrNode = json.get("attr");
        if (!attrNode.has("@type")) {
            throw new QueryException(701, "JSON-LD group has no @type attribute");
        }

        // Attribute without an element name
        if (term.length() == 0) {
            return _createElementAttrFromJson(null, json, attrNode);
        }

        SpanQueryWrapper elementWithIdWrapper = this.builder().tag(term.toString());
        if (elementWithIdWrapper == null) {
            return null;
        }
        return _createElementAttrFromJson(elementWithIdWrapper, json, attrNode);
    }

    String match = json.has("match") ? json.get("match").asText() : "match:eq";

    // Regular expression or wildcard
    if (json.has("type")) {
        QueryBuilder qb = this.builder();
        String termKind = json.get("type").asText();

        if (termKind.equals("type:regex")) {
            // The regex can be rewritten to an any token
            if (term.toString().matches("^[si]:\\.[\\+\\*]\\??$")) {
                return new SpanRepetitionQueryWrapper();
            }

            SpanRegexQueryWrapper srqw = qb.re(term.toString(), ignoreCase);
            if (match.equals("match:ne")) {
                if (DEBUG) {
                    log.trace("Term is negated");
                }
                return this.builder().seg().without(srqw);
            }
            if (match.equals("match:eq")) {
                return srqw;
            }
            throw new QueryException(741, "Match relation unknown");
        }
        else if (termKind.equals("type:wildcard")) {
            SpanWildcardQueryWrapper swcqw = qb.wc(term.toString(), ignoreCase);
            if (match.equals("match:ne")) {
                if (DEBUG) {
                    log.trace("Term is negated");
                }
                return this.builder().seg().without(swcqw);
            }
            if (match.equals("match:eq")) {
                return swcqw;
            }
            throw new QueryException(741, "Match relation unknown");
        }
        else if (!termKind.equals("type:string")) {
            this.addWarning(746, "Term type is not supported - treated as a string");
        }
    }

    // Plain string term
    SpanSegmentQueryWrapper ssqw = this.builder().seg(term.toString());
    if (match.equals("match:ne")) {
        if (DEBUG) {
            log.trace("Term is negated");
        }
        ssqw.makeNegative();
        return this.builder().seg().without(ssqw);
    }
    if (match.equals("match:eq")) {
        return ssqw;
    }
    throw new QueryException(741, "Match relation unknown");
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestMatchIdentifier method indexExample1.
/**
 * Indexes a single simple document and checks position, bracket snippet
 * and match identifier of the only hit for a query with two nested
 * numbered classes.
 */
@Test
public void indexExample1() throws IOException {
    KrillIndex ki = new KrillIndex();
    ki.addDoc(createSimpleFieldDoc());
    ki.commit();

    // {2: [s:b]{1: [s:a]}} - nr(...) without a number is shown as class 1
    // in the expected snippet and ID below
    QueryBuilder kq = new QueryBuilder("tokens");
    Krill ks = new Krill(kq.nr(2, kq.seq(kq.seg("s:b")).append(kq.nr(kq.seg("s:a")))));
    Result kr = ki.search(ks);

    // Expected values are passed before actual values so that failure
    // messages report them the right way round.
    assertEquals("totalResults", 1, kr.getTotalResults());

    Match km = kr.getMatch(0);
    assertEquals("StartPos (0)", 7, km.startPos);
    assertEquals("EndPos (0)", 9, km.endPos);
    assertEquals("SnippetBrackets (0)", "... bcabca[[{2:b{1:a}}]]c", km.getSnippetBrackets());
    assertEquals("ID (0)", "match-c1!d1-p7-9(2)7-8(1)8-8", km.getID());
}
use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.
the class TestMatchIdentifier method indexMultipleSpanStarts.
/**
 * Checks match info snippets for matches in which several spans start at
 * the same position, using one simple document and one compressed document
 * loaded from the test resources.
 */
@Test
public void indexMultipleSpanStarts() throws IOException, QueryException {
    KrillIndex ki = new KrillIndex();
    ki.addDoc(createSimpleFieldDoc5());
    FieldDocument fd = ki.addDoc(2, getClass().getResourceAsStream("/goe/AGA-03828-new.json.gz"), true);
    ki.commit();

    // Match info for the simple document, including span annotations
    Match km = ki.getMatchInfo("match-c1!d5-p0-4", "tokens", null, null, true, false);
    assertEquals("SnippetBrackets (with Spans)", "[[{x/tag:a:{x/tag:b:{x/tag:c:{x/tag:v:x}}y}}z]]", km.getSnippetBrackets());

    // Metadata of the loaded document; expected values come first so that
    // failure messages report them the right way round.
    assertEquals("GOE/AGA/03828", fd.getTextSigle());
    assertEquals("Autobiographische Einzelheiten", fd.getTitle());

    Krill ks = new Krill(new QueryBuilder("tokens").seg("marmot/m:case:nom").with("marmot/m:degree:pos"));
    Result kr = ks.apply(ki);
    assertEquals(83, kr.getTotalResults());
    assertEquals("match-GOE/AGA/03828-p0-1", kr.getMatch(0).getID());

    // Match info restricted to the "malt" foundry
    km = ki.getMatchInfo("match-GOE/AGA/03828-p0-10", "tokens", "malt", null, true, false);
    assertEquals("SnippetBrackets (with Spans)",
        "[[{malt/d:ATTR>2:Autobiographische} "
            + "{malt/d:ATTR>2:einzelheiten} "
            + "{#2:{malt/d:ROOT>0-21:Selbstschilderung}} "
            + "({malt/d:APP>2:1}) "
            + "{malt/d:ADV>5:immer} "
            + "{#5:{malt/d:ATTR>2:tätiger}}, "
            + "{#6:{malt/d:PP>13:nach}} "
            + "{#7:{malt/d:PN>6:innen}} "
            + "{malt/d:KON>7:und} "
            + "{malt/d:ADV>11:außen}]] "
            + "...",
        km.getSnippetBrackets());
}
Aggregations