Search in sources:

Example 1 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

From the class PostingsHighlighter, the method highlight.

/**
 * Highlights one field of the current hit using the postings highlighter.
 *
 * <p>The field values are loaded and merged into a single string, passed through a
 * {@code CustomPostingsHighlighter}, and every returned snippet that has text is kept. The kept
 * snippets are then run through {@code filterSnippets}, optionally sorted by descending score, and
 * converted into the fragments of the returned {@code HighlightField}.
 *
 * @param highlighterContext supplies the field mapper, the highlight field settings, the search
 *                           context, the hit context, the field name and the query used below
 * @return the highlighted field, or {@code null} when no fragment is left after filtering
 * @throws IllegalArgumentException     if {@code canHighlight} rejects the field mapper
 * @throws FetchPhaseExecutionException if highlighting fails with an {@code IOException}
 */
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    FieldMapper fieldMapper = highlighterContext.mapper;
    SearchContextHighlight.Field field = highlighterContext.field;
    if (canHighlight(fieldMapper) == false) {
        throw new IllegalArgumentException("the field [" + highlighterContext.fieldName + "] should be indexed with positions and offsets in the postings list to be used with postings highlighter");
    }
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    //make sure the hit cache holds a highlighter entry before it is read back
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
    }
    HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
    MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
    if (mapperHighlighterEntry == null) {
        //NOTE(review): nothing in this method stores the new entry back into highlighterEntry.mappers,
        //so this lookup looks like it misses on every call - confirm whether a put is intended
        Encoder encoder = field.fieldOptions().encoder().equals("html") ? Encoders.HTML : Encoders.DEFAULT;
        CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
        mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
    }
    List<Snippet> snippets = new ArrayList<>();
    int numberOfFragments;
    try {
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
        List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
        CustomPostingsHighlighter highlighter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            //we use a control char to separate values, which is the only char that the custom break iterator breaks the text on,
            //so we don't lose the distinction between the different values of a field and we get back a snippet per value
            String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
            CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
            highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
            //we are highlighting the whole content, one snippet per value
            numberOfFragments = fieldValues.size();
        } else {
            //using paragraph separator we make sure that each field value holds a discrete passage for highlighting
            String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
            highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, fieldValue, field.fieldOptions().noMatchSize() > 0);
            numberOfFragments = field.fieldOptions().numberOfFragments();
        }
        IndexSearcher searcher = new IndexSearcher(hitContext.reader());
        Snippet[] fieldSnippets = highlighter.highlightField(fieldMapper.fieldType().name(), highlighterContext.query, searcher, hitContext.docId(), numberOfFragments);
        //only snippets that actually carry text are kept
        for (Snippet fieldSnippet : fieldSnippets) {
            if (Strings.hasText(fieldSnippet.getText())) {
                snippets.add(fieldSnippet);
            }
        }
    } catch (IOException e) {
        throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }
    //filtering uses the requested number of fragments, not the per-value count computed above
    snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
    if (field.fieldOptions().scoreOrdered()) {
        //let's sort the snippets by score if needed
        CollectionUtil.introSort(snippets, new Comparator<Snippet>() {

            @Override
            public int compare(Snippet o1, Snippet o2) {
                //descending by score; Double.compare gives a consistent total order even when the
                //difference of the two scores would be NaN, unlike casting Math.signum of it to int
                return Double.compare(o2.getScore(), o1.getScore());
            }
        });
    }
    String[] fragments = new String[snippets.size()];
    for (int i = 0; i < fragments.length; i++) {
        fragments[i] = snippets.get(i).getText();
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    return null;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ArrayList(java.util.ArrayList) SearchContext(org.elasticsearch.search.internal.SearchContext) Analyzer(org.apache.lucene.analysis.Analyzer) CustomSeparatorBreakIterator(org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator) CustomPostingsHighlighter(org.apache.lucene.search.postingshighlight.CustomPostingsHighlighter) Encoder(org.apache.lucene.search.highlight.Encoder) FetchSubPhase(org.elasticsearch.search.fetch.FetchSubPhase) CustomPassageFormatter(org.apache.lucene.search.postingshighlight.CustomPassageFormatter) Snippet(org.apache.lucene.search.highlight.Snippet) IOException(java.io.IOException) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) FieldMapper(org.elasticsearch.index.mapper.FieldMapper)

Example 2 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

From the class PostingsHighlighter, the method filterSnippets.

/**
 * Keeps highlighted and non highlighted snippets from being mixed in one result.
 *
 * <p>All highlighted snippets are returned when there is at least one. Otherwise the first snippet
 * of the input is returned on its own; when {@code numberOfFragments} is {@code 0} that snippet is
 * first shortened to the text before the first sentence boundary, trimmed.
 *
 * @param snippets          the candidate snippets, in their original order
 * @param numberOfFragments the requested number of fragments
 * @return a new list holding the selected snippets; empty when {@code snippets} is empty
 */
static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
    //due to no_match_size the input may hold highlighted as well as non highlighted snippets
    List<Snippet> selected = new ArrayList<>(snippets.size());
    for (Snippet candidate : snippets) {
        if (candidate.isHighlighted()) {
            selected.add(candidate);
        }
    }
    //highlighted snippets win; with no input at all there is nothing to fall back to either
    if (!selected.isEmpty() || snippets.isEmpty()) {
        return selected;
    }
    //nothing was highlighted: fall back to the first snippet
    Snippet fallback = snippets.get(0);
    if (numberOfFragments == 0) {
        //return the first sentence of the content rather than the whole content
        BreakIterator sentences = BreakIterator.getSentenceInstance(Locale.ROOT);
        String content = fallback.getText();
        sentences.setText(content);
        int firstBoundary = sentences.next();
        if (firstBoundary != BreakIterator.DONE) {
            String firstSentence = content.substring(0, firstBoundary).trim();
            fallback = new Snippet(firstSentence, fallback.getScore(), fallback.isHighlighted());
        }
    }
    selected.add(fallback);
    return selected;
}
Also used : ArrayList(java.util.ArrayList) Snippet(org.apache.lucene.search.highlight.Snippet) CustomSeparatorBreakIterator(org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator) BreakIterator(java.text.BreakIterator)

Example 3 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

From the class UnifiedHighlighter, the method highlight.

/**
 * Highlights one field of the current hit using the unified highlighter.
 *
 * <p>The loaded field values are converted for display where they are {@code BytesRef}s, merged
 * into a single string and passed through a {@code CustomUnifiedHighlighter}. Snippets that have
 * text are kept, filtered by {@code filterSnippets}, optionally sorted by descending score, and
 * returned as the fragments of a {@code HighlightField}.
 *
 * @param highlighterContext supplies the field mapper, the highlight field settings, the search
 *                           context, the hit context, the field name and the query used below
 * @return the highlighted field, or {@code null} when no fragment is left after filtering
 * @throws FetchPhaseExecutionException if highlighting fails with an {@code IOException}
 */
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    final FieldMapper fieldMapper = highlighterContext.mapper;
    final SearchContextHighlight.Field field = highlighterContext.field;
    final SearchContext context = highlighterContext.context;
    final FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;

    // make sure the hit cache holds a highlighter entry before it is read back
    if (hitContext.cache().containsKey(CACHE_KEY) == false) {
        hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
    }
    HighlighterEntry cachedEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
    MapperHighlighterEntry mapperEntry = cachedEntry.mappers.get(fieldMapper);
    if (mapperEntry == null) {
        final Encoder encoder;
        if (field.fieldOptions().encoder().equals("html")) {
            encoder = HighlightUtils.Encoders.HTML;
        } else {
            encoder = HighlightUtils.Encoders.DEFAULT;
        }
        mapperEntry = new MapperHighlighterEntry(new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder));
    }

    final int requestedFragments = field.fieldOptions().numberOfFragments();
    List<Snippet> candidates = new ArrayList<>();
    try {
        Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
        // BytesRef values are replaced by the string form of their display value, others are kept as they are
        List<Object> displayValues = new ArrayList<>();
        for (Object rawValue : HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext)) {
            if (rawValue instanceof BytesRef) {
                displayValues.add(fieldMapper.fieldType().valueForDisplay(rawValue).toString());
            } else {
                displayValues.add(rawValue);
            }
        }
        IndexSearcher searcher = new IndexSearcher(hitContext.reader());
        // both modes merge the values with the same multi-value separator char
        String mergedValue = mergeFieldValues(displayValues, MULTIVAL_SEP_CHAR);
        CustomUnifiedHighlighter highlighter;
        int numberOfFragments;
        if (requestedFragments == 0) {
            // the separator char is the only char that the custom break iterator breaks the text on, so the
            // distinction between the different values of a field is kept and we get back a snippet per value
            org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator separatorIterator = new org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), separatorIterator, mergedValue, field.fieldOptions().noMatchSize());
            // the whole content is highlighted, one snippet per value
            numberOfFragments = displayValues.size();
        } else {
            // the break iterator is the one selected for the field by getBreakIterator
            BreakIterator fieldIterator = getBreakIterator(field);
            highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), fieldIterator, mergedValue, field.fieldOptions().noMatchSize());
            numberOfFragments = requestedFragments;
        }
        if (field.fieldOptions().requireFieldMatch()) {
            // only accept the field that is being highlighted
            final String fieldName = highlighterContext.fieldName;
            highlighter.setFieldMatcher(candidateName -> fieldName.equals(candidateName));
        } else {
            // accept every field
            highlighter.setFieldMatcher(candidateName -> true);
        }
        // only snippets that actually carry text are kept
        for (Snippet produced : highlighter.highlightField(highlighterContext.fieldName, highlighterContext.query, hitContext.docId(), numberOfFragments)) {
            if (Strings.hasText(produced.getText())) {
                candidates.add(produced);
            }
        }
    } catch (IOException e) {
        throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
    }

    // filtering uses the requested number of fragments, not the per-value count computed above
    List<Snippet> kept = filterSnippets(candidates, requestedFragments);
    if (field.fieldOptions().scoreOrdered()) {
        // highest score first
        CollectionUtil.introSort(kept, (left, right) -> Double.compare(right.getScore(), left.getScore()));
    }
    if (kept.isEmpty()) {
        return null;
    }
    String[] fragments = new String[kept.size()];
    int slot = 0;
    for (Snippet snippet : kept) {
        fragments[slot++] = snippet.getText();
    }
    return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ArrayList(java.util.ArrayList) SearchContext(org.elasticsearch.search.internal.SearchContext) Analyzer(org.apache.lucene.analysis.Analyzer) BreakIterator(java.text.BreakIterator) Encoder(org.apache.lucene.search.highlight.Encoder) FetchSubPhase(org.elasticsearch.search.fetch.FetchSubPhase) CustomPassageFormatter(org.apache.lucene.search.uhighlight.CustomPassageFormatter) BytesRef(org.apache.lucene.util.BytesRef) CustomUnifiedHighlighter(org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter) Snippet(org.apache.lucene.search.highlight.Snippet) IOException(java.io.IOException) FetchPhaseExecutionException(org.elasticsearch.search.fetch.FetchPhaseExecutionException) FieldMapper(org.elasticsearch.index.mapper.FieldMapper)

Example 4 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

From the class CustomPassageFormatterTests, the method testHtmlEncodeFormat.

/**
 * Formats two passages, each holding one match, with a {@code SimpleHTMLEncoder} and checks that
 * the markup already present in the content comes back encoded while the highlight tags do not.
 */
public void testHtmlEncodeFormat() {
    final String content = "<b>This is a really cool highlighter.</b> Unified highlighter gives nice snippets back.";
    final String match = "highlighter";
    final BytesRef matchBytesRef = new BytesRef(match);
    CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());

    //first passage: from the start of the content to a few chars past the first match
    final int firstMatchStart = content.indexOf(match);
    final int firstMatchEnd = firstMatchStart + match.length();
    Passage firstPassage = new Passage();
    firstPassage.setStartOffset(0);
    //the end offset includes the whitespace at the end to make sure it gets trimmed
    firstPassage.setEndOffset(firstMatchEnd + 6);
    firstPassage.addMatch(firstMatchStart, firstMatchEnd, matchBytesRef);

    //second passage: from where the first one ends up to the end of the content
    final int lastMatchStart = content.lastIndexOf(match);
    final int lastMatchEnd = lastMatchStart + match.length();
    Passage secondPassage = new Passage();
    secondPassage.setStartOffset(firstPassage.getEndOffset());
    secondPassage.setEndOffset(content.length());
    secondPassage.addMatch(lastMatchStart, lastMatchEnd, matchBytesRef);

    Snippet[] fragments = passageFormatter.format(new Passage[] { firstPassage, secondPassage }, content);
    assertThat(fragments, notNullValue());
    assertThat(fragments.length, equalTo(2));
    assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
    assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
}
Also used : SimpleHTMLEncoder(org.apache.lucene.search.highlight.SimpleHTMLEncoder) Snippet(org.apache.lucene.search.highlight.Snippet) BytesRef(org.apache.lucene.util.BytesRef)

Example 5 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

From the class CustomPassageFormatterTests, the method testSimpleFormat.

/**
 * Formats three consecutive passages with a {@code DefaultEncoder}: two that hold a match and a
 * last one without any, and checks both the text and the highlighted flag of every snippet.
 */
public void testSimpleFormat() {
    final String content = "This is a really cool highlighter. Unified highlighter gives nice snippets back. No matches here.";
    final String match = "highlighter";
    final BytesRef matchBytesRef = new BytesRef(match);
    CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());

    //first passage: from the start of the content to just past the first match
    final int firstMatchStart = content.indexOf(match);
    final int firstMatchEnd = firstMatchStart + match.length();
    Passage firstPassage = new Passage();
    firstPassage.setStartOffset(0);
    //the end offset includes the whitespace at the end to make sure it gets trimmed
    firstPassage.setEndOffset(firstMatchEnd + 2);
    firstPassage.addMatch(firstMatchStart, firstMatchEnd, matchBytesRef);

    //second passage: starts where the first one ends and holds the last match
    final int lastMatchStart = content.lastIndexOf(match);
    final int lastMatchEnd = lastMatchStart + match.length();
    Passage secondPassage = new Passage();
    secondPassage.setStartOffset(firstPassage.getEndOffset());
    secondPassage.setEndOffset(lastMatchEnd + 26);
    secondPassage.addMatch(lastMatchStart, lastMatchEnd, matchBytesRef);

    //third passage: the rest of the content, with no match added
    Passage thirdPassage = new Passage();
    thirdPassage.setStartOffset(secondPassage.getEndOffset());
    thirdPassage.setEndOffset(content.length());

    Snippet[] fragments = passageFormatter.format(new Passage[] { firstPassage, secondPassage, thirdPassage }, content);
    assertThat(fragments, notNullValue());
    assertThat(fragments.length, equalTo(3));
    assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
    assertThat(fragments[0].isHighlighted(), equalTo(true));
    assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
    assertThat(fragments[1].isHighlighted(), equalTo(true));
    assertThat(fragments[2].getText(), equalTo("No matches here."));
    assertThat(fragments[2].isHighlighted(), equalTo(false));
}
Also used : DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) Snippet(org.apache.lucene.search.highlight.Snippet) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

Snippet (org.apache.lucene.search.highlight.Snippet)12 IndexSearcher (org.apache.lucene.search.IndexSearcher)5 DefaultEncoder (org.apache.lucene.search.highlight.DefaultEncoder)5 BytesRef (org.apache.lucene.util.BytesRef)5 ArrayList (java.util.ArrayList)3 Document (org.apache.lucene.document.Document)3 Field (org.apache.lucene.document.Field)3 FieldType (org.apache.lucene.document.FieldType)3 TextField (org.apache.lucene.document.TextField)3 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)3 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)3 TopDocs (org.apache.lucene.search.TopDocs)3 Directory (org.apache.lucene.store.Directory)3 IOException (java.io.IOException)2 BreakIterator (java.text.BreakIterator)2 Analyzer (org.apache.lucene.analysis.Analyzer)2 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)2 IndexReader (org.apache.lucene.index.IndexReader)2 Term (org.apache.lucene.index.Term)2 Query (org.apache.lucene.search.Query)2