Use of org.apache.lucene.search.highlight.TextFragment in project elasticsearch by elastic.
The class PlainHighlighter, method highlight:
@Override
public HighlightField highlight(HighlighterContext highlighterContext) {
    SearchContextHighlight.Field field = highlighterContext.field;
    SearchContext context = highlighterContext.context;
    FetchSubPhase.HitContext hitContext = highlighterContext.hitContext;
    FieldMapper mapper = highlighterContext.mapper;
    Encoder encoder = field.fieldOptions().encoder().equals("html") ? HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
    if (!hitContext.cache().containsKey(CACHE_KEY)) {
        Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> mappers = new HashMap<>();
        hitContext.cache().put(CACHE_KEY, mappers);
    }
    @SuppressWarnings("unchecked")
    Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter> cache =
        (Map<FieldMapper, org.apache.lucene.search.highlight.Highlighter>) hitContext.cache().get(CACHE_KEY);
    org.apache.lucene.search.highlight.Highlighter entry = cache.get(mapper);
    if (entry == null) {
        QueryScorer queryScorer = new CustomQueryScorer(highlighterContext.query,
            field.fieldOptions().requireFieldMatch() ? mapper.fieldType().name() : null);
        queryScorer.setExpandMultiTermQuery(true);
        Fragmenter fragmenter;
        if (field.fieldOptions().numberOfFragments() == 0) {
            fragmenter = new NullFragmenter();
        } else if (field.fieldOptions().fragmenter() == null) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else if ("simple".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleFragmenter(field.fieldOptions().fragmentCharSize());
        } else if ("span".equals(field.fieldOptions().fragmenter())) {
            fragmenter = new SimpleSpanFragmenter(queryScorer, field.fieldOptions().fragmentCharSize());
        } else {
            throw new IllegalArgumentException("unknown fragmenter option [" + field.fieldOptions().fragmenter()
                + "] for the field [" + highlighterContext.fieldName + "]");
        }
        Formatter formatter = new SimpleHTMLFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0]);
        entry = new org.apache.lucene.search.highlight.Highlighter(formatter, encoder, queryScorer);
        entry.setTextFragmenter(fragmenter);
        // always highlight across all data
        entry.setMaxDocCharsToAnalyze(Integer.MAX_VALUE);
        cache.put(mapper, entry);
    }
    // a HACK to make the highlighter do highlighting, even though it is using the single-fragment list builder
    int numberOfFragments = field.fieldOptions().numberOfFragments() == 0 ? 1 : field.fieldOptions().numberOfFragments();
    ArrayList<TextFragment> fragsList = new ArrayList<>();
    List<Object> textsToHighlight;
    Analyzer analyzer = context.mapperService().documentMapper(hitContext.hit().getType()).mappers().indexAnalyzer();
    try {
        textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext);
        for (Object textToHighlight : textsToHighlight) {
            String text;
            if (textToHighlight instanceof BytesRef) {
                text = mapper.fieldType().valueForDisplay(textToHighlight).toString();
            } else {
                text = textToHighlight.toString();
            }
            try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
                if (!tokenStream.hasAttribute(CharTermAttribute.class) || !tokenStream.hasAttribute(OffsetAttribute.class)) {
                    // can't perform highlighting if the stream has no terms (binary token stream) or no offsets
                    continue;
                }
                TextFragment[] bestTextFragments = entry.getBestTextFragments(tokenStream, text, false, numberOfFragments);
                for (TextFragment bestTextFragment : bestTextFragments) {
                    if (bestTextFragment != null && bestTextFragment.getScore() > 0) {
                        fragsList.add(bestTextFragment);
                    }
                }
            }
        }
    } catch (Exception e) {
        if (ExceptionsHelper.unwrap(e, BytesRefHash.MaxBytesLengthExceededException.class) != null) {
            // the plain highlighter parses the source and re-analyzes it; a huge term can exceed the
            // byte limit during analysis, in which case we simply skip highlighting for this field
            return null;
        } else {
            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
    }
    if (field.fieldOptions().scoreOrdered()) {
        // sort descending by score; Float.compare avoids the precision loss of rounding a float difference
        CollectionUtil.introSort(fragsList, (o1, o2) -> Float.compare(o2.getScore(), o1.getScore()));
    }
    String[] fragments;
    // number_of_fragments is set to 0 but we have a multi-valued field
    if (field.fieldOptions().numberOfFragments() == 0 && textsToHighlight.size() > 1 && fragsList.size() > 0) {
        fragments = new String[fragsList.size()];
        for (int i = 0; i < fragsList.size(); i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    } else {
        // refine numberOfFragments if needed
        numberOfFragments = Math.min(fragsList.size(), numberOfFragments);
        fragments = new String[numberOfFragments];
        for (int i = 0; i < fragments.length; i++) {
            fragments[i] = fragsList.get(i).toString();
        }
    }
    if (fragments.length > 0) {
        return new HighlightField(highlighterContext.fieldName, Text.convertFromStringArray(fragments));
    }
    int noMatchSize = highlighterContext.field.fieldOptions().noMatchSize();
    if (noMatchSize > 0 && textsToHighlight.size() > 0) {
        // Pull an excerpt from the beginning of the string but make sure to split the string on a term boundary.
        String fieldContents = textsToHighlight.get(0).toString();
        int end;
        try {
            end = findGoodEndForNoHighlightExcerpt(noMatchSize, analyzer, mapper.fieldType().name(), fieldContents);
        } catch (Exception e) {
            throw new FetchPhaseExecutionException(context, "Failed to highlight field [" + highlighterContext.fieldName + "]", e);
        }
        if (end > 0) {
            return new HighlightField(highlighterContext.fieldName, new Text[] { new Text(fieldContents.substring(0, end)) });
        }
    }
    return null;
}
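The core Lucene calls driven by the method above can be exercised standalone. The following is a minimal, hedged sketch, not Elasticsearch code: the field name "body", the sample text, and the class name are invented for illustration, and it assumes the lucene-highlighter and analyzer modules are on the classpath.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TextFragment;

public class PlainHighlightSketch {
    public static void main(String[] args) throws Exception {
        QueryScorer scorer = new QueryScorer(new TermQuery(new Term("body", "fox")), "body");
        Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter("<em>", "</em>"), new SimpleHTMLEncoder(), scorer);
        // SimpleSpanFragmenter sizes fragments around the spans the scorer found
        highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
        String text = "The quick brown fox jumps over the lazy dog.";
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("body", text);
            // false = do not merge contiguous fragments; ask for at most 3
            TextFragment[] frags = highlighter.getBestTextFragments(stream, text, false, 3);
            for (TextFragment frag : frags) {
                if (frag != null && frag.getScore() > 0) { // same score filter as the method above
                    System.out.println(frag);
                }
            }
        }
    }
}

Note that the same QueryScorer instance is passed both to the Highlighter and to the SimpleSpanFragmenter, so fragment boundaries line up with the scored spans; the Elasticsearch method above follows the same pattern.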
Use of org.apache.lucene.search.highlight.TextFragment in project lucene-solr by apache.
The class TermVectorReusingLeafReader, method doHighlightingByHighlighter:
/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query,
                                             IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();
    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
        (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }
    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) { // e.g. -1 means unlimited
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }
    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }
    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
    List<TextFragment> frags = new ArrayList<>();

    // Try term vectors first, which is faster.
    // Note: offsets are minimally sufficient for this highlighter.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
    // We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }
    for (String thisText : fieldValues) {
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }
        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            // single-valued with term vectors
            tstream = tvStream;
        } else {
            // fall back to analyzing the stored value
            tstream = createAnalyzerTStream(schemaField, thisText);
        }
        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does,
            // the tokenStream needs to implement reset() efficiently.
            // If the tokenStream comes straight from the term vectors, then CachingTokenFilter is unnecessary.
            // It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }
            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
            // if the CachingTokenFilter was consumed then use it going forward
            if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
            // tstream.reset() is not needed; getBestTextFragments will reset it
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        maxCharsToAnalyze -= thisText.length();
        // Highlight!
        try {
            TextFragment[] bestTextFragments =
                highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment == null) {
                    // can happen via mergeContiguousFragments
                    continue;
                }
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0) {
                        // note: limits fragments (for multi-valued fields), not quite the number of values
                        --mvToMatch;
                    }
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }
        // Truncate the list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    // no highlights for this field
    return null;
}
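The fast path above reuses stored term vectors instead of re-analyzing the field text. A standalone sketch of that technique (not Solr code; the class and method names are invented, and -1 is Lucene's "no start-offset limit" argument) might look like this:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;

public final class TermVectorHighlightSketch {
    // Re-create a TokenStream from stored term vectors instead of re-running the analyzer.
    // Returns an empty array when the field has no term vectors with offsets.
    public static TextFragment[] highlightFromTermVectors(IndexReader reader, int docId, Query query,
                                                          String field, String storedText) throws Exception {
        Fields tvFields = reader.getTermVectors(docId);
        TokenStream stream = TokenSources.getTermVectorTokenStreamOrNull(field, tvFields, -1);
        if (stream == null) {
            return new TextFragment[0];
        }
        Highlighter highlighter = new Highlighter(new QueryScorer(query, field));
        return highlighter.getBestTextFragments(stream, storedText, true, 3);
    }
}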
Use of org.apache.lucene.search.highlight.TextFragment in project jackrabbit-oak by apache.
The class LuceneIndex, method getExcerpt:
private String getExcerpt(Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc) throws IOException {
    StringBuilder excerpt = new StringBuilder();
    for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
        String name = field.name();
        // only full-text or analyzed fields
        if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
            String text = field.stringValue();
            TokenStream tokenStream = analyzer.tokenStream(name, text);
            try {
                TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 2);
                if (textFragments != null && textFragments.length > 0) {
                    for (TextFragment fragment : textFragments) {
                        if (excerpt.length() > 0) {
                            excerpt.append("...");
                        }
                        excerpt.append(fragment.toString());
                    }
                    break;
                }
            } catch (InvalidTokenOffsetsException e) {
                LOG.error("highlighting failed", e);
            }
        }
    }
    return excerpt.toString();
}
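The highlighter field used above is constructed elsewhere in LuceneIndex. A hypothetical construction, shown only to make the snippet self-explanatory (the tag choice and scorer are assumptions, not Oak's actual configuration):

import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLEncoder;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

Highlighter highlighter = new Highlighter(
    new SimpleHTMLFormatter("<strong>", "</strong>"), // wrap matches; the actual tags are an assumption
    new SimpleHTMLEncoder(),                          // escape HTML already present in the stored text
    new QueryScorer(query));                          // query is the user's full-text query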
Use of org.apache.lucene.search.highlight.TextFragment in project jackrabbit-oak by apache.
The class LucenePropertyIndex, method getExcerpt:
private String getExcerpt(Query query, Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc,
                          FieldInfos fieldInfos) throws IOException {
    StringBuilder excerpt = new StringBuilder();
    int docID = doc.doc;
    List<String> names = new LinkedList<>();
    for (IndexableField field : searcher.getIndexReader().document(docID).getFields()) {
        String name = field.name();
        // the postings highlighter can be used on analyzed fields with docs, freqs, positions and offsets stored
        if (name.startsWith(ANALYZED_FIELD_PREFIX) && fieldInfos.hasProx() && fieldInfos.hasOffsets()) {
            names.add(name);
        }
    }
    if (names.size() > 0) {
        int[] maxPassages = new int[names.size()];
        for (int i = 0; i < maxPassages.length; i++) {
            maxPassages[i] = 1;
        }
        try {
            Map<String, String[]> stringMap = postingsHighlighter.highlightFields(
                names.toArray(new String[names.size()]), query, searcher, new int[] { docID }, maxPassages);
            for (Map.Entry<String, String[]> entry : stringMap.entrySet()) {
                String value = Arrays.toString(entry.getValue());
                if (value.contains("<b>")) {
                    if (excerpt.length() > 0) {
                        excerpt.append("...");
                    }
                    excerpt.append(value);
                }
            }
        } catch (Exception e) {
            LOG.error("postings highlighting failed", e);
        }
    }
    // fall back to the term highlighter if no excerpt could be retrieved using the postings highlighter
    if (excerpt.length() == 0) {
        for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
            String name = field.name();
            // only full-text or analyzed fields
            if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
                String text = field.stringValue();
                TokenStream tokenStream = analyzer.tokenStream(name, text);
                try {
                    TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 1);
                    if (textFragments != null && textFragments.length > 0) {
                        for (TextFragment fragment : textFragments) {
                            if (excerpt.length() > 0) {
                                excerpt.append("...");
                            }
                            excerpt.append(fragment.toString());
                        }
                        break;
                    }
                } catch (InvalidTokenOffsetsException e) {
                    LOG.error("highlighting failed", e);
                }
            }
        }
    }
    return excerpt.toString();
}
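The postings highlighter referenced above is Lucene's PostingsHighlighter (available in Lucene 4.1 through 7.x). A minimal standalone sketch of its use, with the class name, helper name, and hit count invented for illustration:

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.postingshighlight.PostingsHighlighter;

public final class PostingsHighlightSketch {
    // One <b>...</b>-marked passage per top hit for a single field; requires the field
    // to be indexed with IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.
    public static String[] excerptsFor(IndexSearcher searcher, Query query, String field) throws Exception {
        PostingsHighlighter highlighter = new PostingsHighlighter();
        TopDocs topDocs = searcher.search(query, 10);
        return highlighter.highlight(field, query, searcher, topDocs);
    }
}

The default passage formatter wraps matches in <b> tags, which is why the Oak code above checks the highlighted value with value.contains("<b>").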
Use of org.apache.lucene.search.highlight.TextFragment in project jackrabbit-oak by apache.
The class LucenePropertyIndex, method getExcerpt (per-column overload):
private Map<String, String> getExcerpt(Query query, Set<String> excerptFields, Analyzer analyzer, IndexSearcher searcher, ScoreDoc doc, FieldInfos fieldInfos) throws IOException {
Set<String> excerptFieldNames = Sets.newHashSet();
Map<String, String> fieldNameToColumnNameMap = Maps.newHashMap();
Map<String, String> columnNameToExcerpts = Maps.newHashMap();
Set<String> nodeExcerptColumns = Sets.newHashSet();
excerptFields.forEach(columnName -> {
String fieldName;
if (REP_EXCERPT.equals(columnName)) {
fieldName = EXCERPT_NODE_FIELD_NAME;
} else {
fieldName = columnName.substring(REP_EXCERPT.length() + 1, columnName.length() - 1);
}
if (!EXCERPT_NODE_FIELD_NAME.equals(fieldName)) {
excerptFieldNames.add(fieldName);
fieldNameToColumnNameMap.put(fieldName, columnName);
} else {
nodeExcerptColumns.add(columnName);
}
});
final boolean requireNodeLevelExcerpt = nodeExcerptColumns.size() > 0;
int docID = doc.doc;
List<String> names = new LinkedList<String>();
for (IndexableField field : searcher.getIndexReader().document(docID).getFields()) {
String name = field.name();
// postings highlighter can be used on analyzed fields with docs, freqs, positions and offsets stored.
if (name.startsWith(ANALYZED_FIELD_PREFIX) && fieldInfos.hasProx() && fieldInfos.hasOffsets()) {
names.add(name);
}
}
if (!requireNodeLevelExcerpt) {
names.retainAll(excerptFieldNames);
}
if (names.size() > 0) {
int[] maxPassages = new int[names.size()];
for (int i = 0; i < maxPassages.length; i++) {
maxPassages[i] = 1;
}
try {
Map<String, String[]> stringMap = postingsHighlighter.highlightFields(names.toArray(new String[names.size()]), query, searcher, new int[] { docID }, maxPassages);
for (Map.Entry<String, String[]> entry : stringMap.entrySet()) {
String value = Arrays.toString(entry.getValue());
if (value.contains("<b>")) {
String fieldName = entry.getKey();
String columnName = fieldNameToColumnNameMap.get(fieldName);
columnNameToExcerpts.put(columnName, value);
}
}
} catch (Exception e) {
LOG.error("postings highlighting failed", e);
}
}
// fallback if no excerpt could be retrieved using postings highlighter
if (columnNameToExcerpts.size() == 0) {
for (IndexableField field : searcher.getIndexReader().document(doc.doc).getFields()) {
String name = field.name();
// only full text or analyzed fields
if (name.startsWith(FieldNames.FULLTEXT) || name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
String text = field.stringValue();
TokenStream tokenStream = analyzer.tokenStream(name, text);
try {
TextFragment[] textFragments = highlighter.getBestTextFragments(tokenStream, text, true, 1);
if (textFragments != null && textFragments.length > 0) {
for (TextFragment fragment : textFragments) {
String columnName = null;
if (name.startsWith(FieldNames.ANALYZED_FIELD_PREFIX)) {
columnName = fieldNameToColumnNameMap.get(name.substring(FieldNames.ANALYZED_FIELD_PREFIX.length()));
}
if (columnName == null && requireNodeLevelExcerpt) {
columnName = name;
}
if (columnName != null) {
columnNameToExcerpts.put(columnName, fragment.toString());
}
}
if (excerptFieldNames.size() == 0) {
break;
}
}
} catch (InvalidTokenOffsetsException e) {
LOG.error("higlighting failed", e);
}
}
}
}
if (requireNodeLevelExcerpt) {
String nodeExcerpt = Joiner.on("...").join(columnNameToExcerpts.values());
nodeExcerptColumns.forEach(nodeExcerptColumnName -> {
columnNameToExcerpts.put(nodeExcerptColumnName, nodeExcerpt);
});
}
columnNameToExcerpts.keySet().retainAll(excerptFields);
return columnNameToExcerpts;
}
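Several of the methods above share the same tail pattern: keep only scored TextFragments and concatenate them with an ellipsis separator. A standalone helper capturing that shared pattern (a sketch, not code from any of these projects):

import org.apache.lucene.search.highlight.TextFragment;

public final class FragmentJoiner {
    public static String joinHighlighted(TextFragment[] fragments, String separator) {
        StringBuilder sb = new StringBuilder();
        for (TextFragment fragment : fragments) {
            if (fragment == null || fragment.getScore() <= 0) {
                continue; // a zero score means the fragment contains no query match
            }
            if (sb.length() > 0) {
                sb.append(separator);
            }
            sb.append(fragment.toString()); // the fragment text with highlight markup applied
        }
        return sb.toString();
    }
}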