Search in sources:

Example 6 with QueryBuilder

use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.

From the class TestFieldDocument, method indexExample3.

@Test
public void indexExample3() throws IOException {
    // Construct index
    KrillIndex ki = new KrillIndex();
    // Indexing test files
    for (String i : new String[] { "00001", "00002", "00003", "00004", "00005", "00006", "02439" }) {
        FieldDocument fd = ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), true);
    }
    ;
    ki.commit();
    QueryBuilder kq = new QueryBuilder("tokens");
    Krill ks;
    Result kr;
    // Start creating query
    // within(<s>, {1: {2: [mate/p=ADJA & mate/m=number:sg]}[opennlp/p=NN & tt/p=NN]})
    ks = new Krill(kq.within(kq.tag("base/s:s"), kq.nr(1, kq.seq(kq.seg("mate/p:ADJA")).append(kq.seg("opennlp/p:NN")))));
    KrillMeta meta = ks.getMeta();
    meta.setCount(1);
    meta.setCutOff(true);
    meta.getContext().left.setCharacter(true).setLength(6);
    meta.getContext().right.setToken(true).setLength(6);
    assertEquals("... okal. [[Der Buchstabe A hat in {1:deutschen Texten} eine durchschnittliche Häufigkeit von 6,51 %.]] Er ist damit der sechsthäufigste Buchstabe ...", ks.apply(ki).getMatch(0).getSnippetBrackets());
}
Also used : Krill(de.ids_mannheim.korap.Krill) KrillMeta(de.ids_mannheim.korap.KrillMeta) QueryBuilder(de.ids_mannheim.korap.query.QueryBuilder) KrillIndex(de.ids_mannheim.korap.KrillIndex) Result(de.ids_mannheim.korap.response.Result) Test(org.junit.Test)

Example 7 with QueryBuilder

use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.

From the class TestKrillCollectionIndex, method filterExampleFromLegacy.

@Test
public void filterExampleFromLegacy() throws Exception {
    // Construct index
    KrillIndex ki = new KrillIndex();
    // Indexing test files
    for (String i : new String[] { "00001", "00002", "00003", "00004", "00005", "00006", "02439" }) {
        ki.addDoc(getClass().getResourceAsStream("/wiki/" + i + ".json.gz"), true);
    }
    ;
    ki.commit();
    // Create Virtual collections:
    KrillCollection kc = new KrillCollection(ki);
    assertEquals("Documents", 7, kc.numberOf("documents"));
    // The virtual collection consists of all documents that have
    // the textClass "reisen" and "freizeit"
    /*        kc.filter(kf.and("textClass", "reisen").and("textClass",
                "freizeit-unterhaltung"));
        */
    kc.fromBuilder(kc.build().andGroup().with(kc.build().term("textClass", "reisen")).with(kc.build().term("textClass", "freizeit-unterhaltung")));
    assertEquals("Documents", 5, kc.numberOf("documents"));
    assertEquals("Tokens", 1678, kc.numberOf("tokens"));
    assertEquals("Sentences", 194, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 139, kc.numberOf("paragraphs"));
    // Subset this to all documents that have also the text
    // kc.filter(kf.and("textClass", "kultur"));
    /*
        kc.fromBuilder(
          kc.build().andGroup().with(
            kc.getBuilder()
          ).with(
            kc.build().term("textClass", "kultur")
          )
        );
        */
    kc.filter(kc.build().term("textClass", "kultur"));
    assertEquals("Documents", 1, kc.numberOf("documents"));
    assertEquals("Tokens", 405, kc.numberOf("tokens"));
    assertEquals("Sentences", 75, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
    // kc.filter(kf.and("corpusID", "WPD"));
    kc.filter(kc.build().term("corpusID", "WPD"));
    assertEquals("Documents", 1, kc.numberOf("documents"));
    assertEquals("Tokens", 405, kc.numberOf("tokens"));
    assertEquals("Sentences", 75, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 48, kc.numberOf("paragraphs"));
    // Create a query
    Krill ks = new Krill(new QueryBuilder("tokens").seg("opennlp/p:NN").with("tt/p:NN"));
    ks.setCollection(kc).getMeta().setStartIndex(0).setCount((short) 20).setContext(new SearchContext(true, (short) 5, true, (short) 5));
    Result kr = ks.apply(ki);
    /*
        Result kr = ki.search(kc, query, 0, (short) 20, true, (short) 5, true,
                (short) 5);
        */
    assertEquals(kr.getTotalResults(), 70);
    kc.extend(kc.build().term("textClass", "uninteresting"));
    assertEquals("Documents", 1, kc.numberOf("documents"));
    kc.extend(kc.build().term("textClass", "wissenschaft"));
    assertEquals("Documents", 3, kc.numberOf("documents"));
    assertEquals("Tokens", 1669, kc.numberOf("tokens"));
    assertEquals("Sentences", 188, kc.numberOf("sentences"));
    assertEquals("Paragraphs", 130, kc.numberOf("paragraphs"));
}
Also used : Krill(de.ids_mannheim.korap.Krill) SearchContext(de.ids_mannheim.korap.response.SearchContext) QueryBuilder(de.ids_mannheim.korap.query.QueryBuilder) KrillIndex(de.ids_mannheim.korap.KrillIndex) KrillCollection(de.ids_mannheim.korap.KrillCollection) Result(de.ids_mannheim.korap.response.Result) Test(org.junit.Test)

Example 8 with QueryBuilder

use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.

From the class KrillQuery, method _termFromJson.

// Deserialize koral:term
// TODO: Not optimal as it does not respect non-term
private SpanQueryWrapper _termFromJson(JsonNode json, boolean isSpan, RelationDirection direction) throws QueryException {
    if (!json.has("@type")) {
        throw new QueryException(701, "JSON-LD group has no @type attribute");
    }
    ;
    String termType = json.get("@type").asText();
    Boolean isTerm = termType.equals("koral:term") ? true : false;
    Boolean isCaseInsensitive = false;
    if (!json.has("key") || json.get("key").asText().length() < 1) {
        // why must it have an attr?
        if (!json.has("attr")) {
            // return new SpanRepetitionQueryWrapper();
            throw new QueryException(740, "Key definition is missing in term or span");
        }
    }
    ;
    // Empty koral:span hack
    if (isSpan) {
        isTerm = false;
    }
    ;
    // <legacy>
    if (json.has("caseInsensitive") && json.get("caseInsensitive").asBoolean()) {
        isCaseInsensitive = true;
    } else // Flags
    if (json.has("flags") && json.get("flags").isArray()) {
        Iterator<JsonNode> flags = json.get("flags").elements();
        while (flags.hasNext()) {
            String flag = flags.next().asText();
            if (flag.equals("flags:caseInsensitive")) {
                isCaseInsensitive = true;
            } else {
                this.addWarning(748, "Flag is unknown", flag);
            }
            ;
        }
        ;
    }
    ;
    StringBuilder value = new StringBuilder();
    if (direction != null)
        value.append(direction.value());
    if (json.has("foundry") && json.get("foundry").asText().length() > 0) {
        value.append(json.get("foundry").asText()).append('/');
    }
    ;
    // No default foundry defined
    if (json.has("layer") && json.get("layer").asText().length() > 0) {
        String layer = json.get("layer").asText();
        switch(layer) {
            case "lemma":
                layer = "l";
                break;
            case "pos":
                layer = "p";
                break;
            case "orth":
                // TODO: THIS IS AN UGLY HACK! AND SHOULD BE NAMED "SURFACE" or . OR *
                layer = ".";
                break;
            case "struct":
                layer = "s";
                break;
            case "const":
                layer = "c";
                break;
        }
        ;
        if (isCaseInsensitive && isTerm) {
            if (layer.equals("."))
                layer = "i";
            else {
                this.addWarning(767, "Case insensitivity is currently not supported for this layer");
            }
            ;
        }
        ;
        // Ignore foundry for orth layer
        if (layer.equals(".")) {
            layer = "s";
            value.setLength(0);
        } else if (layer.equals("i")) {
            value.setLength(0);
        }
        ;
        value.append(layer).append(':');
    }
    ;
    if (json.has("key") && json.get("key").asText().length() > 0) {
        String key = json.get("key").asText();
        value.append(isCaseInsensitive ? key.toLowerCase() : key);
    }
    ;
    if (json.has("value") && json.get("value").asText().length() > 0)
        value.append(':').append(json.get("value").asText());
    // Regular expression or wildcard
    if (isTerm) {
        String match = "match:eq";
        if (json.has("match")) {
            match = json.get("match").asText();
        }
        ;
        if (json.has("type")) {
            QueryBuilder qb = this.builder();
            // Branch on type
            switch(json.get("type").asText()) {
                case "type:regex":
                    {
                        // The regex can be rewritten to an any token
                        if (value.toString().matches("^[si]:\\.[\\+\\*]\\??$")) {
                            return new SpanRepetitionQueryWrapper();
                        }
                        ;
                        SpanRegexQueryWrapper srqw = qb.re(value.toString(), isCaseInsensitive);
                        if (match.equals("match:ne")) {
                            if (DEBUG)
                                log.trace("Term is negated");
                            // ssqw.makeNegative();
                            return this.builder().seg().without(srqw);
                        } else if (match.equals("match:eq")) {
                            return srqw;
                        }
                        throw new QueryException(741, "Match relation unknown");
                    }
                case "type:wildcard":
                    {
                        SpanWildcardQueryWrapper swcqw = qb.wc(value.toString(), isCaseInsensitive);
                        if (match.equals("match:ne")) {
                            if (DEBUG)
                                log.trace("Term is negated");
                            // ssqw.makeNegative();
                            return this.builder().seg().without(swcqw);
                        } else if (match.equals("match:eq")) {
                            return swcqw;
                        }
                        ;
                        throw new QueryException(741, "Match relation unknown");
                    }
                case "type:string":
                    break;
                default:
                    this.addWarning(746, "Term type is not supported - treated as a string");
            }
            ;
        }
        ;
        SpanSegmentQueryWrapper ssqw = this.builder().seg(value.toString());
        if (match.equals("match:ne")) {
            if (DEBUG)
                log.trace("Term is negated");
            ssqw.makeNegative();
            return this.builder().seg().without(ssqw);
        } else if (match.equals("match:eq")) {
            return ssqw;
        } else {
            throw new QueryException(741, "Match relation unknown");
        }
    }
    ;
    if (json.has("attr")) {
        JsonNode attrNode = json.get("attr");
        if (!attrNode.has("@type")) {
            throw new QueryException(701, "JSON-LD group has no @type attribute");
        }
        if (value.toString().isEmpty()) {
            return _createElementAttrFromJson(null, json, attrNode);
        // this.addWarning(771,
        // "Arbitraty elements with attributes are currently not supported.");
        } else {
            SpanQueryWrapper elementWithIdWrapper = this.builder().tag(value.toString());
            if (elementWithIdWrapper == null) {
                return null;
            }
            return _createElementAttrFromJson(elementWithIdWrapper, json, attrNode);
        }
    }
    ;
    return this.builder().tag(value.toString());
}
Also used : SpanRepetitionQueryWrapper(de.ids_mannheim.korap.query.wrap.SpanRepetitionQueryWrapper) SpanWildcardQueryWrapper(de.ids_mannheim.korap.query.wrap.SpanWildcardQueryWrapper) QueryException(de.ids_mannheim.korap.util.QueryException) SpanRegexQueryWrapper(de.ids_mannheim.korap.query.wrap.SpanRegexQueryWrapper) SpanSegmentQueryWrapper(de.ids_mannheim.korap.query.wrap.SpanSegmentQueryWrapper) Iterator(java.util.Iterator) JsonNode(com.fasterxml.jackson.databind.JsonNode) QueryBuilder(de.ids_mannheim.korap.query.QueryBuilder) SpanQueryWrapper(de.ids_mannheim.korap.query.wrap.SpanQueryWrapper)

Example 9 with QueryBuilder

use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.

From the class TestMatchIdentifier, method indexExample1.

@Test
public void indexExample1() throws IOException {
    KrillIndex ki = new KrillIndex();
    ki.addDoc(createSimpleFieldDoc());
    ki.commit();
    QueryBuilder kq = new QueryBuilder("tokens");
    Krill ks = new Krill(kq.nr(2, kq.seq(kq.seg("s:b")).append(kq.nr(kq.seg("s:a")))));
    Result kr = ki.search(ks);
    assertEquals("totalResults", kr.getTotalResults(), 1);
    assertEquals("StartPos (0)", kr.getMatch(0).startPos, 7);
    assertEquals("EndPos (0)", kr.getMatch(0).endPos, 9);
    Match km = kr.getMatch(0);
    assertEquals("SnippetBrackets (0)", "... bcabca[[{2:b{1:a}}]]c", km.getSnippetBrackets());
    assertEquals("ID (0)", "match-c1!d1-p7-9(2)7-8(1)8-8", km.getID());
}
Also used : Krill(de.ids_mannheim.korap.Krill) QueryBuilder(de.ids_mannheim.korap.query.QueryBuilder) KrillIndex(de.ids_mannheim.korap.KrillIndex) Result(de.ids_mannheim.korap.response.Result) Match(de.ids_mannheim.korap.response.Match) Test(org.junit.Test)

Example 10 with QueryBuilder

use of de.ids_mannheim.korap.query.QueryBuilder in project Krill by KorAP.

From the class TestMatchIdentifier, method indexMultipleSpanStarts.

@Test
public void indexMultipleSpanStarts() throws IOException, QueryException {
    KrillIndex ki = new KrillIndex();
    ki.addDoc(createSimpleFieldDoc5());
    FieldDocument fd = ki.addDoc(2, getClass().getResourceAsStream("/goe/AGA-03828-new.json.gz"), true);
    ki.commit();
    Match km;
    km = ki.getMatchInfo("match-c1!d5-p0-4", "tokens", null, null, true, false);
    assertEquals("SnippetBrackets (with Spans)", "[[{x/tag:a:{x/tag:b:{x/tag:c:{x/tag:v:x}}y}}z]]", km.getSnippetBrackets());
    assertEquals(fd.getTextSigle(), "GOE/AGA/03828");
    assertEquals(fd.getTitle(), "Autobiographische Einzelheiten");
    Krill ks = new Krill(new QueryBuilder("tokens").seg("marmot/m:case:nom").with("marmot/m:degree:pos"));
    Result kr = ks.apply(ki);
    assertEquals(83, kr.getTotalResults());
    assertEquals("match-GOE/AGA/03828-p0-1", kr.getMatch(0).getID());
    km = ki.getMatchInfo("match-GOE/AGA/03828-p0-10", "tokens", "malt", null, true, false);
    assertEquals("SnippetBrackets (with Spans)", "[[{malt/d:ATTR>2:Autobiographische} " + "{malt/d:ATTR>2:einzelheiten} " + "{#2:{malt/d:ROOT>0-21:Selbstschilderung}} " + "({malt/d:APP>2:1}) " + "{malt/d:ADV>5:immer} " + "{#5:{malt/d:ATTR>2:tätiger}}, " + "{#6:{malt/d:PP>13:nach}} " + "{#7:{malt/d:PN>6:innen}} " + "{malt/d:KON>7:und} " + "{malt/d:ADV>11:außen}]] " + "...", km.getSnippetBrackets());
}
Also used : Krill(de.ids_mannheim.korap.Krill) QueryBuilder(de.ids_mannheim.korap.query.QueryBuilder) FieldDocument(de.ids_mannheim.korap.index.FieldDocument) KrillIndex(de.ids_mannheim.korap.KrillIndex) Match(de.ids_mannheim.korap.response.Match) Result(de.ids_mannheim.korap.response.Result) Test(org.junit.Test)

Aggregations

QueryBuilder (de.ids_mannheim.korap.query.QueryBuilder): 72
Test (org.junit.Test): 67
SpanQuery (org.apache.lucene.search.spans.SpanQuery): 39
KrillIndex (de.ids_mannheim.korap.KrillIndex): 33
Result (de.ids_mannheim.korap.response.Result): 32
Krill (de.ids_mannheim.korap.Krill): 27
FieldDocument (de.ids_mannheim.korap.index.FieldDocument): 14
SpanQueryWrapper (de.ids_mannheim.korap.query.wrap.SpanQueryWrapper): 11
Match (de.ids_mannheim.korap.response.Match): 8
JsonNode (com.fasterxml.jackson.databind.JsonNode): 5
KrillQuery (de.ids_mannheim.korap.KrillQuery): 5
QueryException (de.ids_mannheim.korap.util.QueryException): 5
KrillMeta (de.ids_mannheim.korap.KrillMeta): 4
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 3
KrillCollection (de.ids_mannheim.korap.KrillCollection): 3
TestSimple.getJsonString (de.ids_mannheim.korap.TestSimple.getJsonString): 3
CollectionBuilder (de.ids_mannheim.korap.collection.CollectionBuilder): 3
SearchContext (de.ids_mannheim.korap.response.SearchContext): 3
Test (de.ids_mannheim.korap.Test): 2
DistanceConstraint (de.ids_mannheim.korap.query.DistanceConstraint): 1