use of org.apache.lucene.search.highlight.Highlighter in project lucene-solr by apache.
the class HighlightCustomQueryTest method highlightField.
/**
* This method intended for use with
* <tt>testHighlightingWithDefaultField()</tt>
*/
private String highlightField(Query query, String fieldName, String text) throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName, text);
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
MyQueryScorer scorer = new MyQueryScorer(query, fieldName, FIELD_NAME);
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
return rv.length() == 0 ? text : rv;
}
use of org.apache.lucene.search.highlight.Highlighter in project lucene-solr by apache.
the class TermVectorReusingLeafReader method getPhraseHighlighter.
/**
* Return a phrase {@link org.apache.lucene.search.highlight.Highlighter} appropriate for this field.
* @param query The current Query
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
* @param tokenStream document text tokenStream that implements reset() efficiently (e.g. CachingTokenFilter).
* If it's used, call reset() first.
* @throws IOException If there is a low-level I/O error.
*/
protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request, TokenStream tokenStream) throws IOException {
SolrParams params = request.getParams();
Highlighter highlighter = new Highlighter(getFormatter(fieldName, params), getEncoder(fieldName, params), getSpanQueryScorer(query, fieldName, tokenStream, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
return highlighter;
}
use of org.apache.lucene.search.highlight.Highlighter in project lucene-solr by apache.
the class TermVectorReusingLeafReader method doHighlightingByHighlighter.
/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException {
final SolrParams params = req.getParams();
final String fieldName = schemaField.getName();
final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
// Technically this is the max *fragments* (snippets), not max values:
int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
if (mvToExamine <= 0 || mvToMatch <= 0) {
return null;
}
int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
if (maxCharsToAnalyze < 0) {
//e.g. -1
maxCharsToAnalyze = Integer.MAX_VALUE;
}
List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
if (fieldValues.isEmpty()) {
return null;
}
// preserve order of values in a multiValued list
boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
List<TextFragment> frags = new ArrayList<>();
//Try term vectors, which is faster
// note: offsets are minimally sufficient for this HL.
final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
// We need to wrap in OffsetWindowTokenFilter if multi-valued
final OffsetWindowTokenFilter tvWindowStream;
if (tvStream != null && fieldValues.size() > 1) {
tvWindowStream = new OffsetWindowTokenFilter(tvStream);
} else {
tvWindowStream = null;
}
for (String thisText : fieldValues) {
if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
break;
}
TokenStream tstream;
if (tvWindowStream != null) {
// if we have a multi-valued field with term vectors, then get the next offset window
tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
} else if (tvStream != null) {
// single-valued with term vectors
tstream = tvStream;
} else {
// fall back to analyzer
tstream = createAnalyzerTStream(schemaField, thisText);
}
Highlighter highlighter;
if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
// We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
// needs to implement reset() efficiently.
//If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
// It should be okay if OffsetLimit won't get applied in this case.
final TokenStream tempTokenStream;
if (tstream != tvStream) {
if (maxCharsToAnalyze >= thisText.length()) {
tempTokenStream = new CachingTokenFilter(tstream);
} else {
tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
}
} else {
tempTokenStream = tstream;
}
// get highlighter
highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
// if the CachingTokenFilter was consumed then use it going forward.
if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
tstream = tempTokenStream;
}
//tstream.reset(); not needed; getBestTextFragments will reset it.
} else {
// use "the old way"
highlighter = getHighlighter(query, fieldName, req);
}
highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
maxCharsToAnalyze -= thisText.length();
// Highlight!
try {
TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (//can happen via mergeContiguousFragments
bestTextFragment == null)
continue;
// normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
if (bestTextFragment.getScore() > 0 || preserveMulti) {
frags.add(bestTextFragment);
if (bestTextFragment.getScore() > 0)
// note: limits fragments (for multi-valued fields), not quite the number of values
--mvToMatch;
}
}
} catch (InvalidTokenOffsetsException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}
// Put the fragments onto the Solr response (docSummaries)
if (frags.size() > 0) {
// sort such that the fragments with the highest score come first
if (!preserveMulti) {
Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
}
// Truncate list to hl.snippets, but not when hl.preserveMulti
if (frags.size() > numFragments && !preserveMulti) {
frags = frags.subList(0, numFragments);
}
return getResponseForFragments(frags, req);
}
//no highlights for this field
return null;
}
use of org.apache.lucene.search.highlight.Highlighter in project OpenOLAT by OpenOLAT.
the class SearchResultsImpl method doHighlight.
/**
* Highlight (bold,color) query words in result-document. Set HighlightResult for content or description.
* @param query
* @param analyzer
* @param doc
* @param resultDocument
* @throws IOException
*/
private void doHighlight(Query query, Analyzer analyzer, Document doc, ResultDocument resultDocument) throws IOException {
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG), new SimpleHTMLEncoder(), new QueryScorer(query));
// Get 3 best fragments of content and seperate with a "..."
try {
// highlight content
String content = doc.get(AbstractOlatDocument.CONTENT_FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(AbstractOlatDocument.CONTENT_FIELD_NAME, new StringReader(content));
String highlightResult = highlighter.getBestFragments(tokenStream, content, 3, HIGHLIGHT_SEPARATOR);
// if no highlightResult is in content => look in description
if (highlightResult.length() == 0) {
String description = doc.get(AbstractOlatDocument.DESCRIPTION_FIELD_NAME);
tokenStream = analyzer.tokenStream(AbstractOlatDocument.DESCRIPTION_FIELD_NAME, new StringReader(description));
highlightResult = highlighter.getBestFragments(tokenStream, description, 3, HIGHLIGHT_SEPARATOR);
resultDocument.setHighlightingDescription(true);
}
resultDocument.setHighlightResult(highlightResult);
// highlight title
String title = doc.get(AbstractOlatDocument.TITLE_FIELD_NAME);
title = title.trim();
if (title.length() > 128) {
title = FilterFactory.getHtmlTagAndDescapingFilter().filter(title);
title = Formatter.truncate(title, 128);
}
tokenStream = analyzer.tokenStream(AbstractOlatDocument.TITLE_FIELD_NAME, new StringReader(title));
String highlightTitle = highlighter.getBestFragments(tokenStream, title, 3, " ");
resultDocument.setHighlightTitle(highlightTitle);
} catch (InvalidTokenOffsetsException e) {
log.warn("", e);
}
}
use of org.apache.lucene.search.highlight.Highlighter in project SearchServices by Alfresco.
the class AlfrescoSolrHighlighter method doHighlightingByHighlighter.
/**
* Highlights and returns the highlight object for this field -- a String[] by default. Null if none.
*/
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException {
final SolrParams params = req.getParams();
final String fieldName = schemaField.getName();
final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
// Technically this is the max *fragments* (snippets), not max values:
int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
if (mvToExamine <= 0 || mvToMatch <= 0) {
return null;
}
int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
if (maxCharsToAnalyze < 0) {
// e.g. -1
maxCharsToAnalyze = Integer.MAX_VALUE;
}
List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
if (fieldValues.isEmpty()) {
return null;
}
// preserve order of values in a multiValued list
boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
List<TextFragment> frags = new ArrayList<>();
// Try term vectors, which is faster
// note: offsets are minimally sufficient for this HL.
final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
// We need to wrap in OffsetWindowTokenFilter if multi-valued
final OffsetWindowTokenFilter tvWindowStream;
if (tvStream != null && fieldValues.size() > 1) {
tvWindowStream = new OffsetWindowTokenFilter(tvStream);
} else {
tvWindowStream = null;
}
for (String thisText : fieldValues) {
if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
break;
}
TokenStream tstream;
if (tvWindowStream != null) {
// if we have a multi-valued field with term vectors, then get the next offset window
tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
} else if (tvStream != null) {
// single-valued with term vectors
tstream = tvStream;
} else {
// fall back to analyzer
tstream = createAnalyzerTStream(schemaField, thisText);
}
Highlighter highlighter;
if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
// We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
// needs to implement reset() efficiently.
// If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
// It should be okay if OffsetLimit won't get applied in this case.
final TokenStream tempTokenStream;
if (tstream != tvStream) {
if (maxCharsToAnalyze >= thisText.length()) {
tempTokenStream = new CachingTokenFilter(tstream);
} else {
tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
}
} else {
tempTokenStream = tstream;
}
// get highlighter
highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
// if the CachingTokenFilter was consumed then use it going forward.
if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
tstream = tempTokenStream;
}
// tstream.reset(); not needed; getBestTextFragments will reset it.
} else {
// use "the old way"
highlighter = getHighlighter(query, fieldName, req);
}
highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
maxCharsToAnalyze -= thisText.length();
// Highlight!
try {
TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (// can happen via mergeContiguousFragments
bestTextFragment == null)
continue;
// normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
if (bestTextFragment.getScore() > 0 || preserveMulti) {
frags.add(bestTextFragment);
if (bestTextFragment.getScore() > 0)
// note: limits fragments (for multi-valued fields), not quite the number of values
--mvToMatch;
}
}
} catch (InvalidTokenOffsetsException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}
// Put the fragments onto the Solr response (docSummaries)
if (frags.size() > 0) {
// sort such that the fragments with the highest score come first
if (!preserveMulti) {
Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
}
// Truncate list to hl.snippets, but not when hl.preserveMulti
if (frags.size() > numFragments && !preserveMulti) {
frags = frags.subList(0, numFragments);
}
return getResponseForFragments(frags, req);
}
// no highlights for this field
return null;
}
Aggregations