use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project elasticsearch by elastic.
the class PlainHighlighter method highlight.
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
SearchContextHighlight.Field field = highlighterContext.field;
SearchContext context = highlighterContext.context;
FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
FieldMapper mapper = highlighterContext.mapper;
Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
if (!hitContext.cache().containsKey(CACHE_KEY)) {
Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>();
hitContext.cache().put(CACHE_KEY, mappers);
}
@SuppressWarnings("unchecked") Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache = (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext.cache().get(CACHE_KEY);
org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
if (entry == null) {
QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query, field.fieldOptions().requireFieldMatch() ? mapper.fieldType().name() : null);
queryScorer.setExpandMultiTermQuery(true);
Fragmenter fragmenter;
if (field.fieldOptions().numberOfFragments() == 0) {
fragmenter = new NullFragmenter();
} else if (field.fieldOptions().fragmenter() == null) {
fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
} else if ("simple".equals(field.fieldOptions().fragmenter())) {
fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
} else if ("span".equals(field.fieldOptions().fragmenter())) {
fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
} else {
throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter() + "] for the field [" + highlighterContext.fieldName + "]");
}
Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0]);
entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
entry.setTextFragmenter(fragmenter);
// always highlight across all data
entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
cache.put(mapper, entry);
}
// a HACK to make highlighter do highlighting, even though its using the single frag list builder
int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1 : field.fieldOptions().numberOfFragments();
ArrayList<TextFragment> fragsList = new ArrayList<>();
List<Object> textsToHighlight;
Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
try {
textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
for (Object textToHighlight : textsToHighlight) {
String text;
if (textToHighlight instanceof BytesRef) {
text = mapper.fieldType().valueForDisplay(textToHighlight).toString();
} else {
text = textToHighlight.toString();
}
try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
// can't perform highlighting if the stream has no terms (binary token stream) or no offsets
continue;
}
TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
fragsList.add(bestTextFragment);
}
}
}
}
} catch (Exception e) {
if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) {
// the plain highlighter will parse the source and try to analyze it.
return null;
} else {
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
}
if (field.fieldOptions().scoreOrdered()) {
CollectionUtil.introSort(fragsList, new Comparator<TextFragment>() {
@Override
public int compare(TextFragment o1, TextFragment o2) {
return Math.round(o2.getScore() - o1.getScore());
}
});
}
String[] fragments;
// number_of_fragments is set to 0 but we have a multivalued field
if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
fragments = new String[fragsList.size()];
for (int i = 0; i < fragsList.size(); i++) {
fragments[i] = fragsList.get(i).toString();
}
} else {
// refine numberOfFragments if needed
numberOfFragments = fragsList.size() < numberOfFragments ? fragsList.size() : numberOfFragments;
fragments = new String[numberOfFragments];
for (int i = 0; i < fragments.length; i++) {
fragments[i] = fragsList.get(i).toString();
}
}
if (fragments.length > 0) {
return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
}
int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
if (noMatchSize > 0 && textsToHighlight.size() > 0) {
// Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
String fieldContents = textsToHighlight.get(0).toString();
int end;
try {
end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(), fieldContents);
} catch (Exception e) {
throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
}
if (end > 0) {
return new HighlightField(highlighterContext.fieldName, new Text[] { new Text(fieldContents.substring(0, end)) });
}
}
return null;
}
use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project lucene-solr by apache.
the class HighlightCustomQueryTest method highlightField.
/**
* This method intended for use with
* <tt>testHighlightingWithDefaultField()</tt>
*/
private String highlightField(Query query, String fieldName, String text) throws IOException, InvalidTokenOffsetsException {
TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET).tokenStream(fieldName, text);
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
MyQueryScorer scorer = new MyQueryScorer(query, fieldName, FIELD_NAME);
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
return rv.length() == 0 ? text : rv;
}
use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project OpenOLAT by OpenOLAT.
the class SearchResultsImpl method doHighlight.
/**
* Highlight (bold,color) query words in result-document. Set HighlightResult for content or description.
* @param query
* @param analyzer
* @param doc
* @param resultDocument
* @throws IOException
*/
private void doHighlight(Query query, Analyzer analyzer, Document doc, ResultDocument resultDocument) throws IOException {
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(HIGHLIGHT_PRE_TAG, HIGHLIGHT_POST_TAG), new SimpleHTMLEncoder(), new QueryScorer(query));
// Get 3 best fragments of content and seperate with a "..."
try {
// highlight content
String content = doc.get(AbstractOlatDocument.CONTENT_FIELD_NAME);
TokenStream tokenStream = analyzer.tokenStream(AbstractOlatDocument.CONTENT_FIELD_NAME, new StringReader(content));
String highlightResult = highlighter.getBestFragments(tokenStream, content, 3, HIGHLIGHT_SEPARATOR);
// if no highlightResult is in content => look in description
if (highlightResult.length() == 0) {
String description = doc.get(AbstractOlatDocument.DESCRIPTION_FIELD_NAME);
tokenStream = analyzer.tokenStream(AbstractOlatDocument.DESCRIPTION_FIELD_NAME, new StringReader(description));
highlightResult = highlighter.getBestFragments(tokenStream, description, 3, HIGHLIGHT_SEPARATOR);
resultDocument.setHighlightingDescription(true);
}
resultDocument.setHighlightResult(highlightResult);
// highlight title
String title = doc.get(AbstractOlatDocument.TITLE_FIELD_NAME);
title = title.trim();
if (title.length() > 128) {
title = FilterFactory.getHtmlTagAndDescapingFilter().filter(title);
title = Formatter.truncate(title, 128);
}
tokenStream = analyzer.tokenStream(AbstractOlatDocument.TITLE_FIELD_NAME, new StringReader(title));
String highlightTitle = highlighter.getBestFragments(tokenStream, title, 3, " ");
resultDocument.setHighlightTitle(highlightTitle);
} catch (InvalidTokenOffsetsException e) {
log.warn("", e);
}
}
use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project ansj_seg by NLPchina.
the class HeightLightTest method toHighlighter.
/**
* 高亮设置
*
* @param query
* @param doc
* @return
*/
private static String toHighlighter(Analyzer analyzer, Query query, Document doc) {
String field = "text";
try {
SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
TokenStream tokenStream1 = indexAnalyzer.tokenStream("text", new StringReader(doc.get(field)));
String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field));
return highlighterStr == null ? doc.get(field) : highlighterStr;
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTokenOffsetsException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
use of org.apache.lucene.search.highlight.SimpleHTMLFormatter in project ansj_seg by NLPchina.
the class IndexAndTest method toHighlighter.
/**
* 高亮设置
*
* @param query
* @param doc
* @param field
* @return
*/
private String toHighlighter(Analyzer analyzer, Query query, Document doc) {
String field = "text";
try {
SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<font color=\"red\">", "</font>");
Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
TokenStream tokenStream1 = analyzer.tokenStream("text", new StringReader(doc.get(field)));
String highlighterStr = highlighter.getBestFragment(tokenStream1, doc.get(field));
return highlighterStr == null ? doc.get(field) : highlighterStr;
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (InvalidTokenOffsetsException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return null;
}
Aggregations