Search in sources:

Example 1 with CustomSeparatorBreakIterator

use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.

the class AnnotatedTextHighlighterTests method testAnnotatedTextStructuredMatch.

/**
 * Checks that a structured token (here a URL) attached as an annotation can be highlighted
 * by a query against marked-up content in an "annotated_text" type field.
 */
public void testAnnotatedTextStructuredMatch() throws Exception {
    final String targetUrl = "https://en.wikipedia.org/wiki/Key_Word_in_Context";
    final String urlEncoded = URLEncoder.encode(targetUrl, "UTF-8");

    // The annotation as written in the input, and the form expected once it has been marked as a hit.
    final String plainAnnotation = "[highlighting](" + urlEncoded + ")";
    final String hitAnnotation = "[highlighting](" + AnnotatedPassageFormatter.SEARCH_HIT_TYPE + "=" + urlEncoded + "&" + urlEncoded + ")";

    // Text surrounding the annotation; it is identical in the inputs and in the expected passages.
    final String firstLead = "This is a test. Just a test1 ";
    final String firstTail = " from [annotated](bar) highlighter.";
    final String secondLead = "This is the second ";
    final String secondTail = " value to perform highlighting on a longer text that gets scored lower.";

    final String[] markedUpInputs = new String[] {
        firstLead + plainAnnotation + firstTail,
        secondLead + plainAnnotation + secondTail
    };
    final String[] expectedPassages = new String[] {
        firstLead + hitAnnotation + firstTail,
        secondLead + hitAnnotation + secondTail
    };

    // The query term is the raw (un-encoded) URL.
    final Term urlTerm = new Term("text", targetUrl);
    final Query query = new TermQuery(urlTerm);
    final BreakIterator valueSeparatorIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
    assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, valueSeparatorIterator, 0, expectedPassages);
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) Term(org.apache.lucene.index.Term) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator) SplittingBreakIterator(org.apache.lucene.search.uhighlight.SplittingBreakIterator) BreakIterator(java.text.BreakIterator) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator)

Example 2 with CustomSeparatorBreakIterator

use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.

the class AnnotatedTextHighlighterTests method testAnnotatedTextSingleFieldWithBreakIterator.

/**
 * Highlights the term "donald" in a single annotated field value using a break iterator
 * that is additionally wrapped with '.' as a split character; the expected output is
 * two separate passages.
 */
public void testAnnotatedTextSingleFieldWithBreakIterator() throws Exception {
    final String[] markedUpInputs = new String[] {
        "[Donald Trump](Donald+Trump) visited Singapore. Kim shook hands with Donald"
    };
    final String[] expectedPassages = new String[] {
        "[Donald](_hit_term=donald) Trump visited Singapore",
        "Kim shook hands with [Donald](_hit_term=donald)"
    };

    final Term donaldTerm = new Term("text", "donald");
    final Query query = new TermQuery(donaldTerm);

    // Start from the value-separator iterator, then wrap it with the '.' splitter.
    final BreakIterator valueSeparatorIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
    final BreakIterator sentenceSplittingIterator = new SplittingBreakIterator(valueSeparatorIterator, '.');

    assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, sentenceSplittingIterator, 0, expectedPassages);
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) SplittingBreakIterator(org.apache.lucene.search.uhighlight.SplittingBreakIterator) Term(org.apache.lucene.index.Term) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator) SplittingBreakIterator(org.apache.lucene.search.uhighlight.SplittingBreakIterator) BreakIterator(java.text.BreakIterator) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator)

Example 3 with CustomSeparatorBreakIterator

use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.

the class UnifiedHighlighter method buildHighlighter.

/**
 * Assembles the {@link CustomUnifiedHighlighter} for the field described by {@code fieldContext}.
 *
 * @param fieldContext highlight context of the field; the field options, field type, field name,
 *                     query, hit context and search context are all read from it
 * @return a highlighter configured from those values
 * @throws IOException declared by the signature; presumably raised by one of the helpers
 *                     called here (TODO confirm which)
 */
CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
    // Use the HTML encoder only when the field options ask for "html"; otherwise the default one.
    final Encoder encoder;
    if (fieldContext.field.fieldOptions().encoder().equals("html")) {
        encoder = HighlightUtils.Encoders.HTML;
    } else {
        encoder = HighlightUtils.Encoders.DEFAULT;
    }

    final int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();

    // Keyword fields take the limit from their mapper's ignoreAbove(); every other
    // field type keeps Integer.MAX_VALUE.
    int keywordIgnoreAbove = Integer.MAX_VALUE;
    if (fieldContext.fieldType instanceof KeywordFieldMapper.KeywordFieldType) {
        final KeywordFieldMapper keywordMapper = (KeywordFieldMapper) fieldContext.context.mapperService()
            .documentMapper()
            .mappers()
            .getMapper(fieldContext.fieldName);
        keywordIgnoreAbove = keywordMapper.ignoreAbove();
    }

    final int requestedFragments = fieldContext.field.fieldOptions().numberOfFragments();
    final Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
    final PassageFormatter passageFormatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
    final IndexSearcher searcher = fieldContext.context.searcher();
    final OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);

    // Non-tokenized fields should not use any break iterator (the boundary scanner type is
    // ignored for them); a request for zero fragments is handled the same way.
    final boolean breakOnValueSeparatorOnly = requestedFragments == 0
        || fieldContext.fieldType.getTextSearchInfo().isTokenized() == false;

    final BreakIterator breakIterator;
    final int highlighterNumberOfFragments;
    if (breakOnValueSeparatorOnly) {
        /*
         * A control char separates the values of a field, and it is the only char the
         * custom break iterator breaks the text on. That keeps the different values of
         * a field distinct, so we get back one snippet per value.
         */
        breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
        if (requestedFragments == 0) {
            highlighterNumberOfFragments = Integer.MAX_VALUE - 1;
        } else {
            highlighterNumberOfFragments = requestedFragments;
        }
    } else {
        // using paragraph separator we make sure that each field value holds a discrete passage for highlighting
        breakIterator = getBreakIterator(fieldContext.field);
        highlighterNumberOfFragments = requestedFragments;
    }

    return new CustomUnifiedHighlighter(
        searcher,
        analyzer,
        offsetSource,
        passageFormatter,
        fieldContext.field.fieldOptions().boundaryScannerLocale(),
        breakIterator,
        fieldContext.context.getIndexName(),
        fieldContext.fieldName,
        fieldContext.query,
        fieldContext.field.fieldOptions().noMatchSize(),
        highlighterNumberOfFragments,
        fieldMatcher(fieldContext),
        keywordIgnoreAbove,
        maxAnalyzedOffset
    );
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) KeywordFieldMapper(org.opensearch.index.mapper.KeywordFieldMapper) Encoder(org.apache.lucene.search.highlight.Encoder) CustomUnifiedHighlighter(org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter) Analyzer(org.apache.lucene.analysis.Analyzer) CustomPassageFormatter(org.apache.lucene.search.uhighlight.CustomPassageFormatter) PassageFormatter(org.apache.lucene.search.uhighlight.PassageFormatter) OffsetSource(org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator) BreakIterator(java.text.BreakIterator) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator)

Example 4 with CustomSeparatorBreakIterator

use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.

the class AnnotatedTextHighlighterTests method testBadAnnotation.

/**
 * Highlights "singapore" in a value whose annotation markup is missing its closing bracket;
 * the expected passage keeps the malformed markup as-is and only marks the hit term.
 */
public void testBadAnnotation() throws Exception {
    final String[] markedUpInputs = new String[] {
        "Missing bracket for [Donald Trump](Donald+Trump visited Singapore"
    };
    final String[] expectedPassages = new String[] {
        "Missing bracket for [Donald Trump](Donald+Trump visited [Singapore](_hit_term=singapore)"
    };

    final Term singaporeTerm = new Term("text", "singapore");
    final Query query = new TermQuery(singaporeTerm);
    final BreakIterator valueSeparatorIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);

    assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, valueSeparatorIterator, 0, expectedPassages);
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) Term(org.apache.lucene.index.Term) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator) SplittingBreakIterator(org.apache.lucene.search.uhighlight.SplittingBreakIterator) BreakIterator(java.text.BreakIterator) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator)

Example 5 with CustomSeparatorBreakIterator

use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.

the class AnnotatedTextHighlighterTests method testAnnotatedTextMultiFieldWithBreakIterator.

/**
 * Highlights the term "donald" across two annotated field values using a break iterator
 * that is additionally wrapped with '.' as a split character; three passages are expected,
 * two from the first value and one from the second.
 */
public void testAnnotatedTextMultiFieldWithBreakIterator() throws Exception {
    final String[] markedUpInputs = new String[] {
        "[Donald Trump](Donald+Trump) visited Singapore. Kim shook hands with Donald",
        "Donald duck is a [Disney](Disney+Inc) invention"
    };
    final String[] expectedPassages = new String[] {
        "[Donald](_hit_term=donald) Trump visited Singapore",
        "Kim shook hands with [Donald](_hit_term=donald)",
        "[Donald](_hit_term=donald) duck is a [Disney](Disney+Inc) invention"
    };

    final Term donaldTerm = new Term("text", "donald");
    final Query query = new TermQuery(donaldTerm);

    // Start from the value-separator iterator, then wrap it with the '.' splitter.
    final BreakIterator valueSeparatorIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
    final BreakIterator sentenceSplittingIterator = new SplittingBreakIterator(valueSeparatorIterator, '.');

    assertHighlightOneDoc("text", markedUpInputs, query, Locale.ROOT, sentenceSplittingIterator, 0, expectedPassages);
}
Also used : TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) SplittingBreakIterator(org.apache.lucene.search.uhighlight.SplittingBreakIterator) Term(org.apache.lucene.index.Term) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator) SplittingBreakIterator(org.apache.lucene.search.uhighlight.SplittingBreakIterator) BreakIterator(java.text.BreakIterator) CustomSeparatorBreakIterator(org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator)

Aggregations

BreakIterator (java.text.BreakIterator): 7, CustomSeparatorBreakIterator (org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator): 7, MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery): 6, PhraseQuery (org.apache.lucene.search.PhraseQuery): 6, Query (org.apache.lucene.search.Query): 6, TermQuery (org.apache.lucene.search.TermQuery): 6, SplittingBreakIterator (org.apache.lucene.search.uhighlight.SplittingBreakIterator): 6, Term (org.apache.lucene.index.Term): 5, Analyzer (org.apache.lucene.analysis.Analyzer): 1, IndexSearcher (org.apache.lucene.search.IndexSearcher): 1, Encoder (org.apache.lucene.search.highlight.Encoder): 1, CustomPassageFormatter (org.apache.lucene.search.uhighlight.CustomPassageFormatter): 1, CustomUnifiedHighlighter (org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter): 1, PassageFormatter (org.apache.lucene.search.uhighlight.PassageFormatter): 1, OffsetSource (org.apache.lucene.search.uhighlight.UnifiedHighlighter.OffsetSource): 1, KeywordFieldMapper (org.opensearch.index.mapper.KeywordFieldMapper): 1