Search in sources:

Example 6 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

the class CustomUnifiedHighlighterTests method assertHighlightOneDoc.

/**
 * Indexes {@code inputs} as the values of one document under {@code fieldName}, highlights that
 * field for {@code query}, and asserts that the produced snippets equal {@code expectedPassages}
 * (same count, same order, same text).
 *
 * @param fieldName        name of the field the values are indexed under and that is highlighted
 * @param inputs           values added to the single indexed document, one {@code Field} per value
 * @param analyzer         analyzer used both for the index writer config and for the highlighter
 * @param query            query whose matches are highlighted
 * @param locale           forwarded to the {@code CustomUnifiedHighlighter} constructor
 * @param breakIterator    forwarded to the {@code CustomUnifiedHighlighter} constructor
 * @param noMatchSize      forwarded to the {@code CustomUnifiedHighlighter} constructor
 * @param expectedPassages expected snippet texts; its length is also the number of passages requested
 * @throws Exception if indexing, searching or highlighting fails
 */
private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query, Locale locale, BreakIterator breakIterator, int noMatchSize, String[] expectedPassages) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setMergePolicy(newTieredMergePolicy(random()));
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    // Stored text field that also indexes offsets in the postings.
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    ft.freeze();
    Document doc = new Document();
    for (String input : inputs) {
        Field field = new Field(fieldName, "", ft);
        field.setStringValue(input);
        doc.add(field);
    }
    iw.addDocument(doc);
    DirectoryReader reader = iw.getReader();
    IndexSearcher searcher = newSearcher(reader);
    iw.close();
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
    assertThat(topDocs.totalHits, equalTo(1));
    // The highlighter is handed the raw field content: all values joined with the multi-value separator.
    String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
    CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue, noMatchSize);
    // Highlight the field that was actually indexed instead of a hard-coded "text".
    highlighter.setFieldMatcher((name) -> fieldName.equals(name));
    final Snippet[] snippets = highlighter.highlightField(fieldName, query, topDocs.scoreDocs[0].doc, expectedPassages.length);
    // assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
    assertEquals(expectedPassages.length, snippets.length);
    for (int i = 0; i < snippets.length; i++) {
        assertEquals(expectedPassages[i], snippets[i].getText());
    }
    reader.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) DirectoryReader(org.apache.lucene.index.DirectoryReader) Snippet(org.apache.lucene.search.highlight.Snippet) Document(org.apache.lucene.document.Document) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 7 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

the class CustomPassageFormatterTests method testHtmlEncodeFormat.

/**
 * Formats two adjacent passages with a {@code SimpleHTMLEncoder}-backed formatter and checks that
 * the HTML already present in the content is escaped, the matched term is wrapped in
 * {@code <em>} tags, and the snippets come back without surrounding whitespace.
 */
public void testHtmlEncodeFormat() {
    final String text = "<b>This is a really cool highlighter.</b> Postings highlighter gives nice snippets back.";
    final String term = "highlighter";
    final BytesRef termBytes = new BytesRef(term);
    final int termLength = term.length();

    // First passage: from the beginning of the text through the first occurrence of the term.
    final int firstMatchStart = text.indexOf(term);
    final int firstMatchEnd = firstMatchStart + termLength;
    final Passage first = new Passage();
    first.startOffset = 0;
    // Extend 6 chars past the match (".</b> ") so the passage ends on whitespace that must be trimmed.
    first.endOffset = firstMatchEnd + 6;
    first.addMatch(firstMatchStart, firstMatchEnd, termBytes);

    // Second passage: from where the first one ends to the end of the text, matching the last occurrence.
    final int secondMatchStart = text.lastIndexOf(term);
    final int secondMatchEnd = secondMatchStart + termLength;
    final Passage second = new Passage();
    second.startOffset = first.endOffset;
    second.endOffset = text.length();
    second.addMatch(secondMatchStart, secondMatchEnd, termBytes);

    final CustomPassageFormatter formatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
    final Snippet[] fragments = formatter.format(new Passage[] { first, second }, text);

    assertThat(fragments, notNullValue());
    assertThat(fragments.length, equalTo(2));
    assertThat(fragments[0].getText(), equalTo("&lt;b&gt;This is a really cool <em>highlighter</em>.&lt;&#x2F;b&gt;"));
    assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
}
Also used : SimpleHTMLEncoder(org.apache.lucene.search.highlight.SimpleHTMLEncoder) Snippet(org.apache.lucene.search.highlight.Snippet) BytesRef(org.apache.lucene.util.BytesRef)

Example 8 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

the class CustomPostingsHighlighterTests method testCustomPostingsHighlighter.

/**
 * Indexes one document with four values in the "body" field, highlights the term "highlighting"
 * with a {@code CustomPostingsHighlighter}, and checks that one snippet per value is returned,
 * in the order in which the values were added.
 *
 * @throws Exception if indexing, searching or highlighting fails
 */
public void testCustomPostingsHighlighter() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    final String[] values = {
        // good position but only one match
        "This is a test. Just a test1 highlighting from postings highlighter.",
        // two matches, not the best snippet due to its length though
        "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.",
        // two matches and short, will be scored highest
        "This is highlighting the third short highlighting value.",
        // one match, same as first but at the end, will be scored lower due to its position
        "Just a test4 highlighting from postings highlighter."
    };
    final String[] expectedSnippets = {
        "Just a test1 <b>highlighting</b> from postings highlighter.",
        "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.",
        "This is <b>highlighting</b> the third short <b>highlighting</b> value.",
        "Just a test4 <b>highlighting</b> from postings highlighter."
    };

    // Every value becomes its own "body" field instance on the same document.
    Document doc = new Document();
    for (String value : values) {
        Field bodyField = new Field("body", "", offsetsType);
        doc.add(bodyField);
        bodyField.setStringValue(value);
    }
    iw.addDocument(doc);
    IndexReader ir = iw.getReader();
    iw.close();

    IndexSearcher searcher = newSearcher(ir);
    Query query = new TermQuery(new Term("body", "highlighting"));
    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
    assertThat(topDocs.totalHits, equalTo(1));
    int docId = topDocs.scoreDocs[0].doc;

    // The highlighter receives the values joined by the paragraph separator.
    StringBuilder joinedValues = new StringBuilder();
    for (int i = 0; i < values.length; i++) {
        if (i > 0) {
            joinedValues.append(HighlightUtils.PARAGRAPH_SEPARATOR);
        }
        joinedValues.append(values[i]);
    }
    String fieldValue = joinedValues.toString();

    CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
    Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
    assertThat(snippets.length, equalTo(4));
    for (int i = 0; i < expectedSnippets.length; i++) {
        assertThat(snippets[i].getText(), equalTo(expectedSnippets[i]));
    }
    ir.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) Term(org.apache.lucene.index.Term) Snippet(org.apache.lucene.search.highlight.Snippet) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 9 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

the class CustomPassageFormatter method format.

/**
 * Converts each passage into a {@link Snippet}: the passage's slice of {@code content} with every
 * match wrapped in {@code preTag}/{@code postTag}, one trailing value separator removed, and the
 * result trimmed.
 *
 * @param passages passages to format; the returned array has one snippet per passage, same order
 * @param content  the text the passage and match offsets refer to
 * @return one snippet per passage carrying the formatted text, the passage score and whether the
 *         passage had at least one match
 */
@Override
public Snippet[] format(Passage[] passages, String content) {
    Snippet[] snippets = new Snippet[passages.length];
    for (int j = 0; j < passages.length; j++) {
        Passage passage = passages[j];
        StringBuilder sb = new StringBuilder();
        // pos is the content offset up to which text has already been written to sb
        int pos = passage.getStartOffset();
        // loop-invariant lookups hoisted out of the per-match loop
        final int numMatches = passage.getNumMatches();
        final int[] matchStarts = passage.getMatchStarts();
        final int[] matchEnds = passage.getMatchEnds();
        for (int i = 0; i < numMatches; i++) {
            int start = matchStarts[i];
            int end = matchEnds[i];
            // it's possible to have overlapping terms
            if (start > pos) {
                // plain text between the previous match (or passage start) and this match
                append(sb, content, pos, start);
            }
            if (end > pos) {
                sb.append(preTag);
                // for an overlapping match only the part not yet written is emitted
                append(sb, content, Math.max(pos, start), end);
                sb.append(postTag);
                pos = end;
            }
        }
        // it's possible a "term" from the analyzer could span a sentence boundary.
        append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
        // we remove the separator if present at the end of the snippet (we used it as separator between values);
        // the length check avoids a StringIndexOutOfBoundsException when nothing was appended for this passage
        int lastIndex = sb.length() - 1;
        if (lastIndex >= 0) {
            char lastChar = sb.charAt(lastIndex);
            if (lastChar == HighlightUtils.PARAGRAPH_SEPARATOR || lastChar == HighlightUtils.NULL_SEPARATOR) {
                sb.deleteCharAt(lastIndex);
            }
        }
        // and we trim the snippets too
        snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), numMatches > 0);
    }
    return snippets;
}
Also used : Snippet(org.apache.lucene.search.highlight.Snippet)

Example 10 with Snippet

use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.

the class CustomPassageFormatter method format.

/**
 * Builds one {@link Snippet} per passage. For each passage the corresponding slice of
 * {@code content} is written out with matches surrounded by {@code preTag} and {@code postTag};
 * a single trailing value separator is dropped and the text is trimmed.
 *
 * @param passages passages to format; snippets are returned in the same order
 * @param content  the text that passage and match offsets index into
 * @return an array with one snippet per passage (formatted text, passage score, and a flag that is
 *         true when the passage has at least one match)
 */
@Override
public Snippet[] format(Passage[] passages, String content) {
    Snippet[] snippets = new Snippet[passages.length];
    for (int j = 0; j < passages.length; j++) {
        Passage passage = passages[j];
        StringBuilder sb = new StringBuilder();
        // offset in content up to which output has been produced so far
        int pos = passage.getStartOffset();
        // fetched once per passage rather than once per match
        final int numMatches = passage.getNumMatches();
        final int[] matchStarts = passage.getMatchStarts();
        final int[] matchEnds = passage.getMatchEnds();
        for (int i = 0; i < numMatches; i++) {
            int start = matchStarts[i];
            int end = matchEnds[i];
            // it's possible to have overlapping terms
            if (start > pos) {
                // unhighlighted text preceding this match
                append(sb, content, pos, start);
            }
            if (end > pos) {
                sb.append(preTag);
                // when matches overlap, start from pos so already-written text is not repeated
                append(sb, content, Math.max(pos, start), end);
                sb.append(postTag);
                pos = end;
            }
        }
        // it's possible a "term" from the analyzer could span a sentence boundary.
        append(sb, content, pos, Math.max(pos, passage.getEndOffset()));
        // we remove the separator if present at the end of the snippet (we used it as separator between values);
        // guard against an empty builder, where charAt(-1) would throw StringIndexOutOfBoundsException
        int lastIndex = sb.length() - 1;
        if (lastIndex >= 0) {
            char lastChar = sb.charAt(lastIndex);
            if (lastChar == HighlightUtils.PARAGRAPH_SEPARATOR || lastChar == HighlightUtils.NULL_SEPARATOR) {
                sb.deleteCharAt(lastIndex);
            }
        }
        // and we trim the snippets too
        snippets[j] = new Snippet(sb.toString().trim(), passage.getScore(), numMatches > 0);
    }
    return snippets;
}
Also used : Snippet(org.apache.lucene.search.highlight.Snippet)

Aggregations

Snippet (org.apache.lucene.search.highlight.Snippet)12 IndexSearcher (org.apache.lucene.search.IndexSearcher)5 DefaultEncoder (org.apache.lucene.search.highlight.DefaultEncoder)5 BytesRef (org.apache.lucene.util.BytesRef)5 ArrayList (java.util.ArrayList)3 Document (org.apache.lucene.document.Document)3 Field (org.apache.lucene.document.Field)3 FieldType (org.apache.lucene.document.FieldType)3 TextField (org.apache.lucene.document.TextField)3 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)3 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)3 TopDocs (org.apache.lucene.search.TopDocs)3 Directory (org.apache.lucene.store.Directory)3 IOException (java.io.IOException)2 BreakIterator (java.text.BreakIterator)2 Analyzer (org.apache.lucene.analysis.Analyzer)2 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)2 IndexReader (org.apache.lucene.index.IndexReader)2 Term (org.apache.lucene.index.Term)2 Query (org.apache.lucene.search.Query)2