Search in sources :

Example 1 with AnnotationToken

use of org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken in project OpenSearch by opensearch-project.

the class AnnotatedPassageFormatter method mergeAnnotations.

/**
 * Combines the search hits of a passage and the original text's annotations into a single
 * set of markups. Search hits are registered first, so any annotation that would overlap a
 * hit is dropped by {@code addUnlessOverlapping}.
 *
 * @param annotations annotation tokens to consider; only tokens lying entirely within the
 *                    passage's start/end offsets are added
 * @param passage     the passage supplying match offsets, match terms and passage bounds
 * @return the merged markups for the passage
 * @throws IllegalStateException if the UTF-8 encoding is reported as unsupported
 */
static MarkupPassage mergeAnnotations(AnnotationToken[] annotations, Passage passage) {
    final String utf8 = StandardCharsets.UTF_8.name();
    try {
        MarkupPassage merged = new MarkupPassage();

        // Search hits go in first: they win over any other markup covering the same text.
        final int hitCount = passage.getNumMatches();
        for (int hit = 0; hit < hitCount; hit++) {
            int hitStart = passage.getMatchStarts()[hit];
            int hitEnd = passage.getMatchEnds()[hit];
            String term = passage.getMatchTerms()[hit].utf8ToString();
            String metadata = SEARCH_HIT_TYPE + "=" + URLEncoder.encode(term, utf8);
            merged.addUnlessOverlapping(new Markup(hitStart, hitEnd, metadata));
        }

        // Then the original annotations, skipping any that fall outside the passage
        // or that would collide with a search hit added above.
        final int passageStart = passage.getStartOffset();
        final int passageEnd = passage.getEndOffset();
        for (AnnotationToken annotation : annotations) {
            int annotationStart = annotation.offset;
            int annotationEnd = annotation.endOffset;
            if (annotationStart < passageStart || annotationEnd > passageEnd) {
                continue;
            }
            String metadata = URLEncoder.encode(annotation.value, utf8);
            merged.addUnlessOverlapping(new Markup(annotationStart, annotationEnd, metadata));
        }
        return merged;
    } catch (UnsupportedEncodingException e) {
        // UTF-8 support is expected to always be available
        throw new IllegalStateException(e);
    }
}
Also used : UnsupportedEncodingException(java.io.UnsupportedEncodingException) AnnotationToken(org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken)

Example 2 with AnnotationToken

use of org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken in project OpenSearch by opensearch-project.

the class AnnotatedPassageFormatter method getIntersectingAnnotations.

/**
 * Returns the annotations, across all field values, that intersect the given range.
 * <p>
 * The range is expressed against the field values laid end to end with one separator
 * character between them, so each value's annotations are tested against the range shifted
 * back by the combined length of the preceding values (plus their separators). Matching
 * tokens are returned as copies whose offsets are shifted forward by the same amount.
 *
 * @param start start of the range to test
 * @param end   end of the range to test
 * @return the intersecting annotations with offsets adjusted for preceding field values
 */
public AnnotationToken[] getIntersectingAnnotations(int start, int end) {
    List<AnnotationToken> matches = new ArrayList<>();
    // Combined length of all previous values AND one separator character per value
    int valueBase = 0;
    for (AnnotatedText value : this.annotations) {
        // Translate the requested range into this value's own coordinates
        final int localStart = start - valueBase;
        final int localEnd = end - valueBase;
        final int count = value.numAnnotations();
        for (int idx = 0; idx < count; idx++) {
            AnnotationToken candidate = value.getAnnotation(idx);
            if (candidate.intersects(localStart, localEnd)) {
                int shiftedStart = candidate.offset + valueBase;
                int shiftedEnd = candidate.endOffset + valueBase;
                matches.add(new AnnotationToken(shiftedStart, shiftedEnd, candidate.value));
            }
        }
        // the extra 1 accounts for the field value separator character
        valueBase += value.textMinusMarkup.length() + 1;
    }
    return matches.toArray(new AnnotationToken[0]);
}
Also used : AnnotatedText(org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText) ArrayList(java.util.ArrayList) AnnotationToken(org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken)

Example 3 with AnnotationToken

use of org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken in project OpenSearch by opensearch-project.

the class AnnotatedPassageFormatter method format.

/**
 * Renders each passage as a snippet in which every merged markup (search hit or original
 * annotation) is written as {@code [text](metadata)} and the remaining passage text is
 * copied through via {@code append}.
 * <p>
 * A single trailing paragraph or null separator is removed from each snippet and the
 * result is trimmed. A passage that produces no text yields an empty snippet rather than
 * failing on the trailing-separator check.
 *
 * @param passages the passages to format
 * @param content  the text the passage and markup offsets refer to
 * @return one snippet per passage, in the same order as {@code passages}
 */
@Override
public Snippet[] format(Passage[] passages, String content) {
    Snippet[] snippets = new Snippet[passages.length];
    int j = 0;
    for (Passage passage : passages) {
        AnnotationToken[] annotations = getIntersectingAnnotations(passage.getStartOffset(), passage.getEndOffset());
        MarkupPassage mergedMarkup = mergeAnnotations(annotations, passage);
        StringBuilder sb = new StringBuilder();
        // pos tracks how far into content this passage has been emitted so far
        int pos = passage.getStartOffset();
        for (Markup markup : mergedMarkup.markups) {
            int start = markup.start;
            int end = markup.end;
            // it's possible to have overlapping terms
            if (start > pos) {
                append(sb, content, pos, start);
            }
            if (end > pos) {
                sb.append("[");
                // never re-emit text that an earlier, overlapping markup already covered
                append(sb, content, Math.max(pos, start), end);
                sb.append("](");
                sb.append(markup.metadata);
                sb.append(")");
                pos = end;
            }
        }
        // it's possible a "term" from the analyzer could span a sentence boundary.
        append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
        // we remove the paragraph separator if present at the end of the snippet (we used it as separator between values)
        // The length guard avoids a StringIndexOutOfBoundsException when nothing was appended.
        int lastIndex = sb.length() - 1;
        if (lastIndex >= 0) {
            char last = sb.charAt(lastIndex);
            if (last == HighlightUtils.PARAGRAPH_SEPARATOR || last == HighlightUtils.NULL_SEPARATOR) {
                sb.deleteCharAt(lastIndex);
            }
        }
        // and we trim the snippets too
        snippets[j++] = new Snippet(sb.toString().trim(), passage.getScore(), passage.getNumMatches() > 0);
    }
    return snippets;
}
Also used : Snippet(org.apache.lucene.search.uhighlight.Snippet) Passage(org.apache.lucene.search.uhighlight.Passage) AnnotationToken(org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken)

Example 4 with AnnotationToken

use of org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken in project OpenSearch by opensearch-project.

the class AnnotatedTextParsingTests method checkParsing.

/**
 * Parses {@code markup} and asserts that both the markup-free text and the extracted
 * annotation tokens match the expectations.
 *
 * @param markup            the annotated text to parse
 * @param expectedPlainText the text expected once the markup has been removed
 * @param expectedTokens    the annotation tokens expected, in order
 */
private void checkParsing(String markup, String expectedPlainText, AnnotationToken... expectedTokens) {
    AnnotatedText parsed = AnnotatedText.parse(markup);
    assertEquals(expectedPlainText, parsed.textMinusMarkup);

    List<AnnotationToken> parsedTokens = parsed.annotations;
    assertEquals(expectedTokens.length, parsedTokens.size());

    // Compare token by token so a failure points at the first mismatching annotation
    int position = 0;
    for (AnnotationToken expected : expectedTokens) {
        assertEquals(expected, parsedTokens.get(position));
        position++;
    }
}
Also used : AnnotatedText(org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText) AnnotationToken(org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken)

Example 5 with AnnotationToken

use of org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken in project OpenSearch by opensearch-project.

the class AnnotatedTextParsingTests method testAnnotationWithType.

/**
 * Checks that parsing an annotation written as a key=value pair is rejected with an
 * {@code OpenSearchParseException} carrying the expected message.
 */
public void testAnnotationWithType() {
    final String markup = "foo [bar](type=foo) baz";
    final String plainText = "foo bar baz";
    Exception thrown = expectThrows(
        OpenSearchParseException.class,
        () -> checkParsing(markup, plainText, new AnnotationToken(4, 7, "noType"))
    );
    assertThat(thrown.getMessage(), equalTo("key=value pairs are not supported in annotations"));
}
Also used : OpenSearchParseException(org.opensearch.OpenSearchParseException) AnnotationToken(org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken)

Aggregations

AnnotationToken (org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText.AnnotationToken) 5, AnnotatedText (org.opensearch.index.mapper.annotatedtext.AnnotatedTextFieldMapper.AnnotatedText) 2, UnsupportedEncodingException (java.io.UnsupportedEncodingException) 1, ArrayList (java.util.ArrayList) 1, Passage (org.apache.lucene.search.uhighlight.Passage) 1, Snippet (org.apache.lucene.search.uhighlight.Snippet) 1, OpenSearchParseException (org.opensearch.OpenSearchParseException) 1