Search in sources:

Example 1 with DefaultEncoder

Use of org.apache.lucene.search.highlight.DefaultEncoder in the elasticsearch project (by elastic).

From the class CustomPassageFormatterTests, the method testSimpleFormat:

/**
 * Verifies that {@code CustomPassageFormatter} turns three passages — two containing a single
 * match each and one containing none — into correctly tagged, whitespace-trimmed snippets.
 */
public void testSimpleFormat() {
    String content = "This is a really cool highlighter. Unified highlighter gives nice snippets back. No matches here.";
    CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());
    String match = "highlighter";
    BytesRef matchBytesRef = new BytesRef(match);

    // First passage: first sentence, plus trailing whitespace that the formatter must trim.
    int firstStart = content.indexOf(match);
    int firstEnd = firstStart + match.length();
    Passage first = new Passage();
    first.setStartOffset(0);
    first.setEndOffset(firstEnd + 2); // include whitespace at the end to make sure it gets trimmed
    first.addMatch(firstStart, firstEnd, matchBytesRef);

    // Second passage: second sentence, containing the last occurrence of the match.
    int secondStart = content.lastIndexOf(match);
    int secondEnd = secondStart + match.length();
    Passage second = new Passage();
    second.setStartOffset(first.getEndOffset());
    second.setEndOffset(secondEnd + 26);
    second.addMatch(secondStart, secondEnd, matchBytesRef);

    // Third passage: the remainder of the content, with no matches at all.
    Passage third = new Passage();
    third.setStartOffset(second.getEndOffset());
    third.setEndOffset(content.length());

    Passage[] passages = new Passage[] { first, second, third };
    Snippet[] fragments = passageFormatter.format(passages, content);

    assertThat(fragments, notNullValue());
    assertThat(fragments.length, equalTo(3));
    assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
    assertThat(fragments[0].isHighlighted(), equalTo(true));
    assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
    assertThat(fragments[1].isHighlighted(), equalTo(true));
    assertThat(fragments[2].getText(), equalTo("No matches here."));
    assertThat(fragments[2].isHighlighted(), equalTo(false));
}
Also used : DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) Snippet(org.apache.lucene.search.highlight.Snippet) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with DefaultEncoder

Use of org.apache.lucene.search.highlight.DefaultEncoder in the elasticsearch project (by elastic).

From the class CustomUnifiedHighlighterTests, the method assertHighlightOneDoc:

/**
 * Indexes {@code inputs} as multiple values of {@code fieldName} in a single document, runs the
 * unified highlighter over it with the given query, locale and break iterator, and asserts that
 * the produced snippets equal {@code expectedPassages} in order.
 *
 * Fix: the field matcher and the {@code highlightField} call previously hard-coded the field name
 * {@code "text"}, silently ignoring the {@code fieldName} parameter that the fields were indexed
 * under; both now use {@code fieldName} consistently.
 *
 * @param fieldName        name the values are indexed and highlighted under
 * @param inputs           one or more values of the field
 * @param noMatchSize      number of leading characters to return when the query matches nothing
 * @param expectedPassages expected snippet texts, in order
 */
private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query, Locale locale, BreakIterator breakIterator, int noMatchSize, String[] expectedPassages) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setMergePolicy(newTieredMergePolicy(random()));
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    // Offsets must be indexed for postings-based highlighting.
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    ft.freeze();
    Document doc = new Document();
    for (String input : inputs) {
        Field field = new Field(fieldName, "", ft);
        field.setStringValue(input);
        doc.add(field);
    }
    iw.addDocument(doc);
    DirectoryReader reader = iw.getReader();
    IndexSearcher searcher = newSearcher(reader);
    iw.close();
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
    assertThat(topDocs.totalHits, equalTo(1));
    // Multi-valued fields are joined with the multi-value separator before highlighting.
    String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
    CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue, noMatchSize);
    // Use the caller-supplied field name rather than a hard-coded "text" (bug fix).
    highlighter.setFieldMatcher((name) -> fieldName.equals(name));
    final Snippet[] snippets = highlighter.highlightField(fieldName, query, topDocs.scoreDocs[0].doc, expectedPassages.length);
    assertEquals(snippets.length, expectedPassages.length);
    for (int i = 0; i < snippets.length; i++) {
        assertEquals(snippets[i].getText(), expectedPassages[i]);
    }
    reader.close();
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) DirectoryReader(org.apache.lucene.index.DirectoryReader) Snippet(org.apache.lucene.search.highlight.Snippet) Document(org.apache.lucene.document.Document) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 3 with DefaultEncoder

Use of org.apache.lucene.search.highlight.DefaultEncoder in the elasticsearch project (by elastic).

From the class CustomPostingsHighlighterTests, the method testCustomPostingsHighlighter:

/**
 * Indexes one document whose "body" field has four values chosen to exercise snippet scoring,
 * then checks that {@code CustomPostingsHighlighter} returns the snippets in index (not score)
 * order with the expected {@code <b>} tags.
 */
public void testCustomPostingsHighlighter() throws Exception {
    Directory directory = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig(new MockAnalyzer(random()));
    config.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory, config);

    // Postings-based highlighting needs offsets in the index.
    FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
    offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

    // 1) good position but only one match
    final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter.";
    // 2) two matches, but not the best snippet due to its length
    final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.";
    // 3) two matches and short, will be scored highest
    final String thirdValue = "This is highlighting the third short highlighting value.";
    // 4) one match, same as the first but at the end, scored lower due to its position
    final String fourthValue = "Just a test4 highlighting from postings highlighter.";

    Document doc = new Document();
    for (String value : new String[] { firstValue, secondValue, thirdValue, fourthValue }) {
        Field bodyField = new Field("body", "", offsetsType);
        bodyField.setStringValue(value);
        doc.add(bodyField);
    }
    writer.addDocument(doc);
    IndexReader ir = writer.getReader();
    writer.close();

    String firstHlValue = "Just a test1 <b>highlighting</b> from postings highlighter.";
    String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.";
    String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
    String fourthHlValue = "Just a test4 <b>highlighting</b> from postings highlighter.";

    IndexSearcher searcher = newSearcher(ir);
    Query query = new TermQuery(new Term("body", "highlighting"));
    TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
    assertThat(topDocs.totalHits, equalTo(1));
    int docId = topDocs.scoreDocs[0].doc;

    // The raw value seen by the highlighter is all field values joined by the paragraph separator.
    String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
    CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
    Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);

    assertThat(snippets.length, equalTo(4));
    assertThat(snippets[0].getText(), equalTo(firstHlValue));
    assertThat(snippets[1].getText(), equalTo(secondHlValue));
    assertThat(snippets[2].getText(), equalTo(thirdHlValue));
    assertThat(snippets[3].getText(), equalTo(fourthHlValue));

    ir.close();
    directory.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) Term(org.apache.lucene.index.Term) Snippet(org.apache.lucene.search.highlight.Snippet) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TopDocs(org.apache.lucene.search.TopDocs) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 4 with DefaultEncoder

Use of org.apache.lucene.search.highlight.DefaultEncoder in the lucene-solr project (by apache).

From the class FastVectorHighlighterTest, the method matchedFieldsTestCase:

/**
 * Indexes {@code fieldValue} under "field" plus several differently analyzed shadow fields,
 * highlights doc 0 with the FastVectorHighlighter, and asserts the first fragment equals
 * {@code expected}.
 *
 * @param useMatchedFields when true, highlight "field" using matches gathered from all the
 *                         shadow fields; when false, use only "field" itself
 * @param fieldMatch       passed through to {@link FieldQuery}; restricts query terms to
 *                         their own field when true
 * @param fieldValue       text indexed into every field variant
 * @param expected         expected text of the single best fragment (25 chars, 1 fragment)
 * @param queryClauses     clauses combined with MUST into one BooleanQuery
 */
private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses) throws IOException {
    Document doc = new Document();
    // Stored variant: term vectors with positions and offsets are required by FVH.
    FieldType stored = new FieldType(TextField.TYPE_STORED);
    stored.setStoreTermVectorOffsets(true);
    stored.setStoreTermVectorPositions(true);
    stored.setStoreTermVectors(true);
    stored.freeze();
    // Unstored variant for the shadow fields: same term vectors, no stored value.
    FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
    matched.setStoreTermVectorOffsets(true);
    matched.setStoreTermVectorPositions(true);
    matched.setStoreTermVectors(true);
    matched.freeze();
    // Whitespace tokenized with English stop words
    doc.add(new Field("field", fieldValue, stored));
    // Whitespace tokenized without stop words
    doc.add(new Field("field_exact", fieldValue, matched));
    // Whitespace tokenized without toLower
    doc.add(new Field("field_super_exact", fieldValue, matched));
    // Each letter is a token
    doc.add(new Field("field_characters", fieldValue, matched));
    // Every three letters is a token
    doc.add(new Field("field_tripples", fieldValue, matched));
    // Sliced at 10 chars then analyzed just like field
    doc.add(new Field("field_sliced", fieldValue.substring(0, Math.min(fieldValue.length() - 1, 10)), matched));
    // Hacky field containing "der" and "red" at pos = 0
    doc.add(new Field("field_der_red", new CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));
    // Per-field analyzers matching the descriptions above.
    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
    fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
    fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
    fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
    fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
    fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
    // This is required even though we provide a token stream
    fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
    Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {

        public Analyzer getWrappedAnalyzer(String fieldName) {
            return fieldAnalyzers.get(fieldName);
        }
    };
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    writer.addDocument(doc);
    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    FragListBuilder fragListBuilder = new SimpleFragListBuilder();
    FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
    IndexReader reader = DirectoryReader.open(writer);
    String[] preTags = new String[] { "<b>" };
    String[] postTags = new String[] { "</b>" };
    Encoder encoder = new DefaultEncoder();
    int docId = 0;
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    for (Query clause : queryClauses) {
        query.add(clause, Occur.MUST);
    }
    FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
    String[] bestFragments;
    if (useMatchedFields) {
        // Gather matches from every field variant but render against "field"'s stored value.
        Set<String> matchedFields = new HashSet<>();
        matchedFields.add("field");
        matchedFields.add("field_exact");
        matchedFields.add("field_super_exact");
        matchedFields.add("field_characters");
        matchedFields.add("field_tripples");
        matchedFields.add("field_sliced");
        matchedFields.add("field_der_red");
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    } else {
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    }
    assertEquals(expected, bestFragments[0]);
    reader.close();
    writer.close();
    dir.close();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) CommonTermsQuery(org.apache.lucene.queries.CommonTermsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) CustomScoreQuery(org.apache.lucene.queries.CustomScoreQuery) TermQuery(org.apache.lucene.search.TermQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) BoostQuery(org.apache.lucene.search.BoostQuery) CharacterRunAutomaton(org.apache.lucene.util.automaton.CharacterRunAutomaton) Document(org.apache.lucene.document.Document) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StoredField(org.apache.lucene.document.StoredField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) Encoder(org.apache.lucene.search.highlight.Encoder) DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Directory(org.apache.lucene.store.Directory) HashSet(java.util.HashSet) RegExp(org.apache.lucene.util.automaton.RegExp) TreeMap(java.util.TreeMap) FieldType(org.apache.lucene.document.FieldType) DelegatingAnalyzerWrapper(org.apache.lucene.analysis.DelegatingAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader)

Example 5 with DefaultEncoder

Use of org.apache.lucene.search.highlight.DefaultEncoder in the elasticsearch project (by elastic).

From the class CustomPassageFormatterTests, the method testSimpleFormat:

/**
 * Verifies that {@code CustomPassageFormatter} turns three passages — two containing a single
 * match each and one containing none — into correctly tagged, whitespace-trimmed snippets.
 */
public void testSimpleFormat() {
    String content = "This is a really cool highlighter. Postings highlighter gives nice snippets back. No matches here.";
    CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());
    String match = "highlighter";
    BytesRef matchBytesRef = new BytesRef(match);

    // First passage: first sentence, plus trailing whitespace that the formatter must trim.
    int firstStart = content.indexOf(match);
    int firstEnd = firstStart + match.length();
    Passage first = new Passage();
    first.startOffset = 0;
    first.endOffset = firstEnd + 2; // include whitespace at the end to make sure it gets trimmed
    first.addMatch(firstStart, firstEnd, matchBytesRef);

    // Second passage: second sentence, containing the last occurrence of the match.
    int secondStart = content.lastIndexOf(match);
    int secondEnd = secondStart + match.length();
    Passage second = new Passage();
    second.startOffset = first.endOffset;
    second.endOffset = secondEnd + 26;
    second.addMatch(secondStart, secondEnd, matchBytesRef);

    // Third passage: the remainder of the content, with no matches at all.
    Passage third = new Passage();
    third.startOffset = second.endOffset;
    third.endOffset = content.length();

    Passage[] passages = new Passage[] { first, second, third };
    Snippet[] fragments = passageFormatter.format(passages, content);

    assertThat(fragments, notNullValue());
    assertThat(fragments.length, equalTo(3));
    assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
    assertThat(fragments[0].isHighlighted(), equalTo(true));
    assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
    assertThat(fragments[1].isHighlighted(), equalTo(true));
    assertThat(fragments[2].getText(), equalTo("No matches here."));
    assertThat(fragments[2].isHighlighted(), equalTo(false));
}
Also used : DefaultEncoder(org.apache.lucene.search.highlight.DefaultEncoder) Snippet(org.apache.lucene.search.highlight.Snippet) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

DefaultEncoder (org.apache.lucene.search.highlight.DefaultEncoder)7 Document (org.apache.lucene.document.Document)5 Field (org.apache.lucene.document.Field)5 FieldType (org.apache.lucene.document.FieldType)5 TextField (org.apache.lucene.document.TextField)5 Snippet (org.apache.lucene.search.highlight.Snippet)5 Directory (org.apache.lucene.store.Directory)5 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)4 IndexReader (org.apache.lucene.index.IndexReader)4 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)3 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)3 IndexSearcher (org.apache.lucene.search.IndexSearcher)3 Query (org.apache.lucene.search.Query)3 TermQuery (org.apache.lucene.search.TermQuery)3 TopDocs (org.apache.lucene.search.TopDocs)3 StoredField (org.apache.lucene.document.StoredField)2 IndexWriter (org.apache.lucene.index.IndexWriter)2 Term (org.apache.lucene.index.Term)2 BooleanQuery (org.apache.lucene.search.BooleanQuery)2 Encoder (org.apache.lucene.search.highlight.Encoder)2