Example usage of org.apache.lucene.search.highlight.DefaultEncoder from the elastic/elasticsearch project, in class CustomPassageFormatterTests, method testSimpleFormat.
/**
 * Builds three passages over a fixed content string and verifies that the
 * formatter wraps each match in the configured tags, trims the padded trailing
 * whitespace, and reports which fragments actually contain a highlight.
 */
public void testSimpleFormat() {
    String content = "This is a really cool highlighter. Unified highlighter gives nice snippets back. No matches here.";
    CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());

    String term = "highlighter";
    BytesRef termBytes = new BytesRef(term);

    // First passage: covers the first occurrence; the end offset is padded two
    // chars past the sentence so the formatter must trim trailing whitespace.
    int matchStart = content.indexOf(term);
    int matchEnd = matchStart + term.length();
    Passage first = new Passage();
    first.setStartOffset(0);
    first.setEndOffset(matchEnd + 2);
    first.addMatch(matchStart, matchEnd, termBytes);

    // Second passage: covers the last occurrence of the term.
    matchStart = content.lastIndexOf(term);
    matchEnd = matchStart + term.length();
    Passage second = new Passage();
    second.setStartOffset(first.getEndOffset());
    second.setEndOffset(matchEnd + 26);
    second.addMatch(matchStart, matchEnd, termBytes);

    // Third passage: the tail of the content, deliberately without matches.
    Passage third = new Passage();
    third.setStartOffset(second.getEndOffset());
    third.setEndOffset(content.length());

    Snippet[] fragments = passageFormatter.format(new Passage[] { first, second, third }, content);
    assertThat(fragments, notNullValue());
    assertThat(fragments.length, equalTo(3));
    assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
    assertThat(fragments[0].isHighlighted(), equalTo(true));
    assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
    assertThat(fragments[1].isHighlighted(), equalTo(true));
    assertThat(fragments[2].getText(), equalTo("No matches here."));
    assertThat(fragments[2].isHighlighted(), equalTo(false));
}
Example usage of org.apache.lucene.search.highlight.DefaultEncoder from the elastic/elasticsearch project, in class CustomUnifiedHighlighterTests, method assertHighlightOneDoc.
/**
 * Indexes {@code inputs} as one multi-valued document under {@code fieldName},
 * highlights that document with a {@code CustomUnifiedHighlighter} built from
 * the given analyzer, locale, break iterator, and no-match size, and asserts
 * the produced snippets equal {@code expectedPassages} (same count and order).
 */
private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer analyzer, Query query, Locale locale,
        BreakIterator breakIterator, int noMatchSize, String[] expectedPassages) throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);
    iwc.setMergePolicy(newTieredMergePolicy(random()));
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
    // Offsets must be indexed for the unified highlighter to locate matches.
    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    ft.freeze();
    Document doc = new Document();
    for (String input : inputs) {
        Field field = new Field(fieldName, "", ft);
        field.setStringValue(input);
        doc.add(field);
    }
    iw.addDocument(doc);
    DirectoryReader reader = iw.getReader();
    IndexSearcher searcher = newSearcher(reader);
    iw.close();
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 1, Sort.INDEXORDER);
    assertThat(topDocs.totalHits, equalTo(1));
    // Rebuild the raw multi-valued source string the highlighter works on.
    String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR));
    CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer,
            new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), locale, breakIterator, rawValue, noMatchSize);
    // Fix: match and highlight the field the document was actually indexed
    // under, instead of the hard-coded "text", which silently produced no
    // highlights whenever fieldName differed from "text".
    highlighter.setFieldMatcher((name) -> fieldName.equals(name));
    final Snippet[] snippets = highlighter.highlightField(fieldName, query, topDocs.scoreDocs[0].doc, expectedPassages.length);
    assertEquals(snippets.length, expectedPassages.length);
    for (int i = 0; i < snippets.length; i++) {
        assertEquals(snippets[i].getText(), expectedPassages[i]);
    }
    reader.close();
    dir.close();
}
Example usage of org.apache.lucene.search.highlight.DefaultEncoder from the elastic/elasticsearch project, in class CustomPostingsHighlighterTests, method testCustomPostingsHighlighter.
/**
 * Indexes one document whose multi-valued "body" field holds four values and
 * verifies that CustomPostingsHighlighter returns one snippet per value with
 * the query term wrapped in the configured tags. The comments on each value
 * describe its expected relative passage score; the assertions below check the
 * snippets come back in value (index) order, not score order.
 */
public void testCustomPostingsHighlighter() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig(new MockAnalyzer(random()));
iwc.setMergePolicy(newLogMergePolicy());
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);
// offsets must be indexed in the postings for the postings highlighter to work
FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
//good position but only one match
final String firstValue = "This is a test. Just a test1 highlighting from postings highlighter.";
Field body = new Field("body", "", offsetsType);
Document doc = new Document();
doc.add(body);
body.setStringValue(firstValue);
//two matches, not the best snippet due to its length though
final String secondValue = "This is the second highlighting value to perform highlighting on a longer text that gets scored lower.";
Field body2 = new Field("body", "", offsetsType);
doc.add(body2);
body2.setStringValue(secondValue);
//two matches and short, will be scored highest
final String thirdValue = "This is highlighting the third short highlighting value.";
Field body3 = new Field("body", "", offsetsType);
doc.add(body3);
body3.setStringValue(thirdValue);
//one match, same as first but at the end, will be scored lower due to its position
final String fourthValue = "Just a test4 highlighting from postings highlighter.";
Field body4 = new Field("body", "", offsetsType);
doc.add(body4);
body4.setStringValue(fourthValue);
iw.addDocument(doc);
IndexReader ir = iw.getReader();
iw.close();
// expected highlighted form of each value, one snippet per value
String firstHlValue = "Just a test1 <b>highlighting</b> from postings highlighter.";
String secondHlValue = "This is the second <b>highlighting</b> value to perform <b>highlighting</b> on a longer text that gets scored lower.";
String thirdHlValue = "This is <b>highlighting</b> the third short <b>highlighting</b> value.";
String fourthHlValue = "Just a test4 <b>highlighting</b> from postings highlighter.";
IndexSearcher searcher = newSearcher(ir);
Query query = new TermQuery(new Term("body", "highlighting"));
TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
assertThat(topDocs.totalHits, equalTo(1));
int docId = topDocs.scoreDocs[0].doc;
// rebuild the raw source exactly as indexed: values joined by the paragraph separator
String fieldValue = firstValue + HighlightUtils.PARAGRAPH_SEPARATOR + secondValue + HighlightUtils.PARAGRAPH_SEPARATOR + thirdValue + HighlightUtils.PARAGRAPH_SEPARATOR + fourthValue;
CustomPostingsHighlighter highlighter = new CustomPostingsHighlighter(null, new CustomPassageFormatter("<b>", "</b>", new DefaultEncoder()), fieldValue, false);
// ask for up to 5 passages; only 4 values contain matches
Snippet[] snippets = highlighter.highlightField("body", query, searcher, docId, 5);
assertThat(snippets.length, equalTo(4));
assertThat(snippets[0].getText(), equalTo(firstHlValue));
assertThat(snippets[1].getText(), equalTo(secondHlValue));
assertThat(snippets[2].getText(), equalTo(thirdHlValue));
assertThat(snippets[3].getText(), equalTo(fourthHlValue));
ir.close();
dir.close();
}
Example usage of org.apache.lucene.search.highlight.DefaultEncoder from the apache/lucene-solr project, in class FastVectorHighlighterTest, method matchedFieldsTestCase.
/**
 * Indexes a single document containing {@code fieldValue} under several
 * differently-analyzed copies and asserts on the best fragment produced by
 * the FastVectorHighlighter for the stored "field".
 *
 * @param useMatchedFields when true, highlight "field" using term vectors
 *                         gathered from the whole set of matched fields
 * @param fieldMatch       passed to FieldQuery: restrict query terms to the
 *                         field they were issued against
 * @param fieldValue       the text stored and analyzed in every field
 * @param expected         the expected best fragment text
 * @param queryClauses     clauses combined with MUST into the highlight query
 */
private void matchedFieldsTestCase(boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses) throws IOException {
    Document doc = new Document();
    // Stored field with term vectors: the field whose text gets highlighted.
    FieldType stored = new FieldType(TextField.TYPE_STORED);
    stored.setStoreTermVectorOffsets(true);
    stored.setStoreTermVectorPositions(true);
    stored.setStoreTermVectors(true);
    stored.freeze();
    // Unstored variants that only contribute term vectors for matching.
    FieldType matched = new FieldType(TextField.TYPE_NOT_STORED);
    matched.setStoreTermVectorOffsets(true);
    matched.setStoreTermVectorPositions(true);
    matched.setStoreTermVectors(true);
    matched.freeze();
    // Whitespace tokenized with English stop words
    doc.add(new Field("field", fieldValue, stored));
    // Whitespace tokenized without stop words
    doc.add(new Field("field_exact", fieldValue, matched));
    // Whitespace tokenized without toLower
    doc.add(new Field("field_super_exact", fieldValue, matched));
    // Each letter is a token
    doc.add(new Field("field_characters", fieldValue, matched));
    // Every three letters is a token
    doc.add(new Field("field_tripples", fieldValue, matched));
    // Sliced at 10 chars then analyzed just like field
    doc.add(new Field("field_sliced", fieldValue.substring(0, Math.min(fieldValue.length() - 1, 10)), matched));
    // Hacky field containing "der" and "red" at pos = 0
    doc.add(new Field("field_der_red", new CannedTokenStream(token("der", 1, 0, 3), token("red", 0, 0, 3)), matched));
    final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>();
    fieldAnalyzers.put("field", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET));
    fieldAnalyzers.put("field_exact", new MockAnalyzer(random()));
    fieldAnalyzers.put("field_super_exact", new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false));
    fieldAnalyzers.put("field_characters", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp(".").toAutomaton()), true));
    fieldAnalyzers.put("field_tripples", new MockAnalyzer(random(), new CharacterRunAutomaton(new RegExp("...").toAutomaton()), true));
    fieldAnalyzers.put("field_sliced", fieldAnalyzers.get("field"));
    // This is required even though we provide a token stream
    fieldAnalyzers.put("field_der_red", fieldAnalyzers.get("field"));
    Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) {
        @Override // was missing on the original anonymous class
        public Analyzer getWrappedAnalyzer(String fieldName) {
            return fieldAnalyzers.get(fieldName);
        }
    };
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    writer.addDocument(doc);
    FastVectorHighlighter highlighter = new FastVectorHighlighter();
    FragListBuilder fragListBuilder = new SimpleFragListBuilder();
    FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder();
    IndexReader reader = DirectoryReader.open(writer);
    String[] preTags = new String[] { "<b>" };
    String[] postTags = new String[] { "</b>" };
    Encoder encoder = new DefaultEncoder();
    int docId = 0;
    BooleanQuery.Builder query = new BooleanQuery.Builder();
    for (Query clause : queryClauses) {
        query.add(clause, Occur.MUST);
    }
    FieldQuery fieldQuery = new FieldQuery(query.build(), reader, true, fieldMatch);
    String[] bestFragments;
    if (useMatchedFields) {
        Set<String> matchedFields = new HashSet<>();
        matchedFields.add("field");
        matchedFields.add("field_exact");
        matchedFields.add("field_super_exact");
        matchedFields.add("field_characters");
        matchedFields.add("field_tripples");
        matchedFields.add("field_sliced");
        matchedFields.add("field_der_red");
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    } else {
        bestFragments = highlighter.getBestFragments(fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder);
    }
    assertEquals(expected, bestFragments[0]);
    reader.close();
    writer.close();
    dir.close();
}
Example usage of org.apache.lucene.search.highlight.DefaultEncoder from the elastic/elasticsearch project, in class CustomPassageFormatterTests, method testSimpleFormat (postings highlighter variant).
/**
 * Builds three passages over a fixed content string and verifies that the
 * formatter wraps each match in the configured tags, trims the padded trailing
 * whitespace, and reports which fragments actually contain a highlight.
 *
 * NOTE(review): unlike the unified-highlighter variant of this test, this one
 * writes Passage offsets through public fields rather than setters —
 * presumably a different Lucene Passage API version; confirm before merging
 * the two tests.
 */
public void testSimpleFormat() {
String content = "This is a really cool highlighter. Postings highlighter gives nice snippets back. No matches here.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());
Passage[] passages = new Passage[3];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
// first passage: first occurrence of the term
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.startOffset = 0;
//lets include the whitespace at the end to make sure we trim it
passage1.endOffset = end + 2;
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
// second passage: last occurrence of the term
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.startOffset = passage1.endOffset;
passage2.endOffset = end + 26;
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
// third passage: tail of the content, deliberately without matches
Passage passage3 = new Passage();
passage3.startOffset = passage2.endOffset;
passage3.endOffset = content.length();
passages[2] = passage3;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(3));
assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
assertThat(fragments[0].isHighlighted(), equalTo(true));
assertThat(fragments[1].getText(), equalTo("Postings <em>highlighter</em> gives nice snippets back."));
assertThat(fragments[1].isHighlighted(), equalTo(true));
assertThat(fragments[2].getText(), equalTo("No matches here."));
assertThat(fragments[2].isHighlighted(), equalTo(false));
}
Aggregations