Use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.
From the class AnnotatedTextHighlighterTests, method testAnnotatedTextStructuredMatch.
/**
 * Verifies that a structured token (here a URL) used as an annotation value can be
 * highlighted when querying marked-up content of an "annotated_text" type field.
 *
 * Two field values carry the same URL annotation; both are expected back with the
 * annotation rewritten to include the search-hit marker for the matched URL.
 */
public void testAnnotatedTextStructuredMatch() throws Exception {
    final String targetUrl = "https://en.wikipedia.org/wiki/Key_Word_in_Context";
    // Annotation values are URL-encoded inside the markup.
    final String encodedTarget = URLEncoder.encode(targetUrl, "UTF-8");

    // The annotation as indexed, and the same annotation as it should look once highlighted.
    final String plainAnnotation = "[highlighting](" + encodedTarget + ")";
    final String hitAnnotation = "[highlighting]("
        + AnnotatedPassageFormatter.SEARCH_HIT_TYPE + "=" + encodedTarget
        + "&" + encodedTarget + ")";

    final String firstValue = "This is a test. Just a test1 " + plainAnnotation + " from [annotated](bar) highlighter.";
    final String secondValue = "This is the second " + plainAnnotation
        + " value to perform highlighting on a longer text that gets scored lower.";
    final String[] markedUpInputs = { firstValue, secondValue };

    final String firstExpected = "This is a test. Just a test1 " + hitAnnotation + " from [annotated](bar) highlighter.";
    final String secondExpected = "This is the second " + hitAnnotation
        + " value to perform highlighting on a longer text that gets scored lower.";
    final String[] expectedPassages = { firstExpected, secondExpected };

    // Query for the raw (unencoded) URL as a single term.
    final Query urlQuery = new TermQuery(new Term("text", targetUrl));
    // Break only on the multi-value separator so each field value is one passage.
    final BreakIterator perValueBreaker = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);

    assertHighlightOneDoc("text", markedUpInputs, urlQuery, Locale.ROOT, perValueBreaker, 0, expectedPassages);
}
Use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.
From the class AnnotatedTextHighlighterTests, method testAnnotatedTextSingleFieldWithBreakIterator.
/**
 * Highlights a single annotated value using a break iterator that additionally
 * splits on '.', and expects one highlighted passage per resulting sentence.
 */
public void testAnnotatedTextSingleFieldWithBreakIterator() throws Exception {
    final String[] markedUpInputs = {
        "[Donald Trump](Donald+Trump) visited Singapore. Kim shook hands with Donald"
    };
    final String[] expectedPassages = {
        "[Donald](_hit_term=donald) Trump visited Singapore",
        "Kim shook hands with [Donald](_hit_term=donald)"
    };

    final Query donaldQuery = new TermQuery(new Term("text", "donald"));

    // Start from the multi-value separator iterator, then wrap it so '.' also splits passages.
    final BreakIterator valueBreaker = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
    final BreakIterator sentenceBreaker = new SplittingBreakIterator(valueBreaker, '.');

    assertHighlightOneDoc("text", markedUpInputs, donaldQuery, Locale.ROOT, sentenceBreaker, 0, expectedPassages);
}
Use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.
From the class UnifiedHighlighter, method buildHighlighter.
/**
 * Assembles the {@link CustomUnifiedHighlighter} for the field described by
 * {@code fieldContext}.
 *
 * The encoder, analyzer, passage formatter, offset source and break iterator are
 * all derived from the field's highlight options and mapping. When no fragmenting
 * is requested ({@code numberOfFragments == 0}) or the field is not tokenized, the
 * text is only broken on the multi-value separator character.
 *
 * @param fieldContext highlight context of the field being highlighted
 * @return a highlighter configured for that field
 * @throws IOException declared for callers; propagated from the helpers invoked here
 */
CustomUnifiedHighlighter buildHighlighter(FieldHighlightContext fieldContext) throws IOException {
    // "html" selects the HTML encoder; any other value falls back to the default encoder.
    final Encoder encoder;
    if (fieldContext.field.fieldOptions().encoder().equals("html")) {
        encoder = HighlightUtils.Encoders.HTML;
    } else {
        encoder = HighlightUtils.Encoders.DEFAULT;
    }

    final int maxAnalyzedOffset = fieldContext.context.getIndexSettings().getHighlightMaxAnalyzedOffset();

    // Keyword fields expose their ignore_above setting through the mapper; other field
    // types keep the "no limit" default.
    int keywordIgnoreAbove = Integer.MAX_VALUE;
    if (fieldContext.fieldType instanceof KeywordFieldMapper.KeywordFieldType) {
        final KeywordFieldMapper keywordMapper = (KeywordFieldMapper) fieldContext.context.mapperService()
            .documentMapper()
            .mappers()
            .getMapper(fieldContext.fieldName);
        keywordIgnoreAbove = keywordMapper.ignoreAbove();
    }

    final int requestedFragments = fieldContext.field.fieldOptions().numberOfFragments();
    final Analyzer analyzer = getAnalyzer(fieldContext.context.mapperService().documentMapper());
    final PassageFormatter formatter = getPassageFormatter(fieldContext.hitContext, fieldContext.field, encoder);
    final IndexSearcher searcher = fieldContext.context.searcher();
    final OffsetSource offsetSource = getOffsetSource(fieldContext.fieldType);

    // Zero requested fragments is translated into an effectively unbounded limit;
    // any other request is passed through unchanged.
    final int fragmentLimit = requestedFragments == 0 ? Integer.MAX_VALUE - 1 : requestedFragments;

    final BreakIterator passageBreaker;
    if (requestedFragments == 0 || fieldContext.fieldType.getTextSearchInfo().isTokenized() == false) {
        /*
         * Non-tokenized fields should not use any configured break iterator (the
         * boundary scanner type is ignored). A control char separates the values,
         * and it is the only char this break iterator breaks the text on, so the
         * distinction between the different values of a field is kept and one
         * snippet per value comes back.
         */
        passageBreaker = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
    } else {
        // Otherwise use the break iterator selected for this field by getBreakIterator.
        passageBreaker = getBreakIterator(fieldContext.field);
    }

    return new CustomUnifiedHighlighter(
        searcher,
        analyzer,
        offsetSource,
        formatter,
        fieldContext.field.fieldOptions().boundaryScannerLocale(),
        passageBreaker,
        fieldContext.context.getIndexName(),
        fieldContext.fieldName,
        fieldContext.query,
        fieldContext.field.fieldOptions().noMatchSize(),
        fragmentLimit,
        fieldMatcher(fieldContext),
        keywordIgnoreAbove,
        maxAnalyzedOffset
    );
}
Use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.
From the class AnnotatedTextHighlighterTests, method testBadAnnotation.
/**
 * Feeds the highlighter a value whose annotation is missing its closing
 * parenthesis and expects the malformed markup to come back untouched, with
 * only the queried term ("singapore") wrapped as a search hit.
 */
public void testBadAnnotation() throws Exception {
    final String malformedInput = "Missing bracket for [Donald Trump](Donald+Trump visited Singapore";
    final String highlightedOutput = "Missing bracket for [Donald Trump](Donald+Trump visited [Singapore](_hit_term=singapore)";

    final String[] markedUpInputs = { malformedInput };
    final String[] expectedPassages = { highlightedOutput };

    final Query singaporeQuery = new TermQuery(new Term("text", "singapore"));
    // Break only on the multi-value separator so the whole value is one passage.
    final BreakIterator perValueBreaker = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);

    assertHighlightOneDoc("text", markedUpInputs, singaporeQuery, Locale.ROOT, perValueBreaker, 0, expectedPassages);
}
Use of org.apache.lucene.search.uhighlight.CustomSeparatorBreakIterator in project OpenSearch by opensearch-project.
From the class AnnotatedTextHighlighterTests, method testAnnotatedTextMultiFieldWithBreakIterator.
/**
 * Highlights two annotated values using a break iterator that additionally
 * splits on '.'. The first value yields two passages (one per sentence) and
 * the second value yields one, with its unrelated annotation left intact.
 */
public void testAnnotatedTextMultiFieldWithBreakIterator() throws Exception {
    final String[] markedUpInputs = {
        "[Donald Trump](Donald+Trump) visited Singapore. Kim shook hands with Donald",
        "Donald duck is a [Disney](Disney+Inc) invention"
    };
    final String[] expectedPassages = {
        "[Donald](_hit_term=donald) Trump visited Singapore",
        "Kim shook hands with [Donald](_hit_term=donald)",
        "[Donald](_hit_term=donald) duck is a [Disney](Disney+Inc) invention"
    };

    final Query donaldQuery = new TermQuery(new Term("text", "donald"));

    // Start from the multi-value separator iterator, then wrap it so '.' also splits passages.
    final BreakIterator valueBreaker = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR);
    final BreakIterator sentenceBreaker = new SplittingBreakIterator(valueBreaker, '.');

    assertHighlightOneDoc("text", markedUpInputs, donaldQuery, Locale.ROOT, sentenceBreaker, 0, expectedPassages);
}
Aggregations