use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.
the class PostingsHighlighter method highlight.
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
FieldMapper fieldMapper = highlighterContext.mapper;
SearchContextHighlight.Field field = highlighterContext.field;
if (canHighlight(fieldMapper) == false) {
throw new IllegalArgumentException("the field [" + highlighterContext.fieldName + "] should be indexed with positions and offsets in the postings list to be used with postings highlighter");
}
SearchContext context = highlighterContext.context;
FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
if (!hitContext.cache().containsKey(CACHE_KEY)) {
hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
}
HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
if (mapperHighlighterEntry == null) {
Encoder encoder = field.fieldOptions().encoder().equals("html") ? Encoders.HTML : Encoders.DEFAULT;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
}
List<Snippet> snippets = new ArrayList<>();
int numberOfFragments;
try {
Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
CustomPostingsHighlighter highlighter;
if (field.fieldOptions().numberOfFragments() == 0) {
//we use a control char to separate values, which is the only char that the custom break iterator breaks the text on,
//so we don't lose the distinction between the different values of a field and we get back a snippet per value
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.NULL_SEPARATOR);
CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(HighlightUtils.NULL_SEPARATOR);
highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, breakIterator, fieldValue, field.fieldOptions().noMatchSize() > 0);
//we are highlighting the whole content, one snippet per value
numberOfFragments = fieldValues.size();
} else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
String fieldValue = mergeFieldValues(fieldValues, HighlightUtils.PARAGRAPH_SEPARATOR);
highlighter = new CustomPostingsHighlighter(analyzer, mapperHighlighterEntry.passageFormatter, fieldValue, field.fieldOptions().noMatchSize() > 0);
numberOfFragments = field.fieldOptions().numberOfFragments();
}
IndexSearcher searcher = new IndexSearcher(hitContext.reader());
Snippet[] fieldSnippets = highlighter.highlightField(fieldMapper.fieldType().name(), highlighterContext.query, searcher, hitContext.docId(), numberOfFragments);
for (Snippet fieldSnippet : fieldSnippets) {
if (Strings.hasText(fieldSnippet.getText())) {
snippets.add(fieldSnippet);
}
}
} catch (IOException e) {
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
if (field.fieldOptions().scoreOrdered()) {
//let's sort the snippets by score if needed
CollectionUtil.introSort(snippets, new Comparator<Snippet>() {
@Override
public int compare(Snippet o1, Snippet o2) {
return (int) Math.signum(o2.getScore() - o1.getScore());
}
});
}
String[] fragments = new String[snippets.size()];
for (int i = 0; i < fragments.length; i++) {
fragments[i] = snippets.get(i).getText();
}
if (fragments.length > 0) {
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
}
return null;
}
use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.
the class PostingsHighlighter method filterSnippets.
static List<Snippet> filterSnippets(List<Snippet> snippets, int numberOfFragments) {
//We need to filter the snippets as due to no_match_size we could have
//either highlighted snippets or non highlighted ones and we don't want to mix those up
List<Snippet> filteredSnippets = new ArrayList<>(snippets.size());
for (Snippet snippet : snippets) {
if (snippet.isHighlighted()) {
filteredSnippets.add(snippet);
}
}
//otherwise we return the first non highlighted one if available
if (filteredSnippets.size() == 0) {
if (snippets.size() > 0) {
Snippet snippet = snippets.get(0);
//we need to return the first sentence of the content rather than the whole content
if (numberOfFragments == 0) {
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.ROOT);
String text = snippet.getText();
bi.setText(text);
int next = bi.next();
if (next != BreakIterator.DONE) {
String newText = text.substring(0, next).trim();
snippet = new Snippet(newText, snippet.getScore(), snippet.isHighlighted());
}
}
filteredSnippets.add(snippet);
}
}
return filteredSnippets;
}
use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.
the class UnifiedHighlighter method highlight.
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
FieldMapper fieldMapper = highlighterContext.mapper;
SearchContextHighlight.Field field = highlighterContext.field;
SearchContext context = highlighterContext.context;
FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
if (!hitContext.cache().containsKey(CACHE_KEY)) {
hitContext.cache().put(CACHE_KEY, new HighlighterEntry());
}
HighlighterEntry highlighterEntry = (HighlighterEntry) hitContext.cache().get(CACHE_KEY);
MapperHighlighterEntry mapperHighlighterEntry = highlighterEntry.mappers.get(fieldMapper);
if (mapperHighlighterEntry == null) {
Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
mapperHighlighterEntry = new MapperHighlighterEntry(passageFormatter);
}
List<Snippet> snippets = new ArrayList<>();
int numberOfFragments;
try {
Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
List<Object> fieldValues = HighlightUtils.loadFieldValues(field, fieldMapper, context, hitContext);
fieldValues = fieldValues.stream().map(obj -> {
if (obj instanceof BytesRef) {
return fieldMapper.fieldType().valueForDisplay(obj).toString();
} else {
return obj;
}
}).collect(Collectors.toList());
IndexSearcher searcher = new IndexSearcher(hitContext.reader());
CustomUnifiedHighlighter highlighter;
if (field.fieldOptions().numberOfFragments() == 0) {
// we use a control char to separate values, which is the only char that the custom break iterator
// breaks the text on, so we don't lose the distinction between the different values of a field and we
// get back a snippet per value
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator breakIterator = new org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize());
// we are highlighting the whole content, one snippet per value
numberOfFragments = fieldValues.size();
} else {
//using paragraph separator we make sure that each field value holds a discrete passage for highlighting
String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
BreakIterator bi = getBreakIterator(field);
highlighter = new CustomUnifiedHighlighter(searcher, analyzer, mapperHighlighterEntry.passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, fieldValue, field.fieldOptions().noMatchSize());
numberOfFragments = field.fieldOptions().numberOfFragments();
}
if (field.fieldOptions().requireFieldMatch()) {
final String fieldName = highlighterContext.fieldName;
highlighter.setFieldMatcher((name) -> fieldName.equals(name));
} else {
highlighter.setFieldMatcher((name) -> true);
}
Snippet[] fieldSnippets = highlighter.highlightField(highlighterContext.fieldName, highlighterContext.query, hitContext.docId(), numberOfFragments);
for (Snippet fieldSnippet : fieldSnippets) {
if (Strings.hasText(fieldSnippet.getText())) {
snippets.add(fieldSnippet);
}
}
} catch (IOException e) {
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
snippets = filterSnippets(snippets, field.fieldOptions().numberOfFragments());
if (field.fieldOptions().scoreOrdered()) {
//let's sort the snippets by score if needed
CollectionUtil.introSort(snippets, (o1, o2) -> Double.compare(o2.getScore(), o1.getScore()));
}
String[] fragments = new String[snippets.size()];
for (int i = 0; i < fragments.length; i++) {
fragments[i] = snippets.get(i).getText();
}
if (fragments.length > 0) {
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
}
return null;
}
use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.
the class CustomPassageFormatterTests method testHtmlEncodeFormat.
public void testHtmlEncodeFormat() {
String content = "<b>This is a really cool highlighter.</b> Unified highlighter gives nice snippets back.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new SimpleHTMLEncoder());
Passage[] passages = new Passage[2];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.setStartOffset(0);
//lets include the whitespace at the end to make sure we trim it
passage1.setEndOffset(end + 6);
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.setStartOffset(passage1.getEndOffset());
passage2.setEndOffset(content.length());
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(2));
assertThat(fragments[0].getText(), equalTo("<b>This is a really cool <em>highlighter</em>.</b>"));
assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
}
use of org.apache.lucene.search.highlight.Snippet in project elasticsearch by elastic.
the class CustomPassageFormatterTests method testSimpleFormat.
public void testSimpleFormat() {
String content = "This is a really cool highlighter. Unified highlighter gives nice snippets back. No matches here.";
CustomPassageFormatter passageFormatter = new CustomPassageFormatter("<em>", "</em>", new DefaultEncoder());
Passage[] passages = new Passage[3];
String match = "highlighter";
BytesRef matchBytesRef = new BytesRef(match);
Passage passage1 = new Passage();
int start = content.indexOf(match);
int end = start + match.length();
passage1.setStartOffset(0);
//lets include the whitespace at the end to make sure we trim it
passage1.setEndOffset(end + 2);
passage1.addMatch(start, end, matchBytesRef);
passages[0] = passage1;
Passage passage2 = new Passage();
start = content.lastIndexOf(match);
end = start + match.length();
passage2.setStartOffset(passage1.getEndOffset());
passage2.setEndOffset(end + 26);
passage2.addMatch(start, end, matchBytesRef);
passages[1] = passage2;
Passage passage3 = new Passage();
passage3.setStartOffset(passage2.getEndOffset());
passage3.setEndOffset(content.length());
passages[2] = passage3;
Snippet[] fragments = passageFormatter.format(passages, content);
assertThat(fragments, notNullValue());
assertThat(fragments.length, equalTo(3));
assertThat(fragments[0].getText(), equalTo("This is a really cool <em>highlighter</em>."));
assertThat(fragments[0].isHighlighted(), equalTo(true));
assertThat(fragments[1].getText(), equalTo("Unified <em>highlighter</em> gives nice snippets back."));
assertThat(fragments[1].isHighlighted(), equalTo(true));
assertThat(fragments[2].getText(), equalTo("No matches here."));
assertThat(fragments[2].isHighlighted(), equalTo(false));
}
Aggregations