Use of org.apache.lucene.search.highlight.TextFragment in project SearchServices by Alfresco.
The class AlfrescoSolrHighlighter, method doHighlightingByHighlighter:
/**
 * Highlights and returns the highlight object for this field -- a String[] by default. Null if none.
 */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();
    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }
    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) {
        // e.g. -1
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }
    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }
    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
    List<TextFragment> frags = new ArrayList<>();
    // Try term vectors, which is faster.
    // note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
    // We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }
    for (String thisText : fieldValues) {
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }
        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            // single-valued with term vectors
            tstream = tvStream;
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }
        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does,
            // the tokenStream needs to implement reset() efficiently.
            // If the tokenStream comes straight from the term vectors, then CachingTokenFilter is unnecessary.
            // It should be okay if OffsetLimit doesn't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }
            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
            // if the CachingTokenFilter was consumed then use it going forward
            if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
            // tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        maxCharsToAnalyze -= thisText.length();
        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                // a null fragment can happen via mergeContiguousFragments
                if (bestTextFragment == null) {
                    continue;
                }
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0) {
                        // note: limits fragments (for multi-valued fields), not quite the number of values
                        --mvToMatch;
                    }
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }
        // Truncate the list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    // no highlights for this field
    return null;
}
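The method above layers Solr-specific plumbing (term vectors, offset windows, multi-value budgets) over a small core of Lucene highlighter calls. Here is a minimal, self-contained sketch of that core, stripped of the Alfresco/Solr context; the field name, query, and text below are illustrative assumptions, not values from the class above:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;

public class HighlightSketch {
    public static void main(String[] args) throws Exception {
        String fieldName = "content"; // assumed field name
        String text = "Lucene highlighting splits text into scored fragments.";
        Query query = new TermQuery(new Term(fieldName, "fragments"));
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("<em>", "</em>"),
                new QueryScorer(query, fieldName));
        highlighter.setMaxDocCharsToAnalyze(51200); // cf. HighlightParams.MAX_CHARS above
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            // No term vectors here, so analyze the stored text directly,
            // as the "fall back to analyzer" branch above does.
            TokenStream tstream = analyzer.tokenStream(fieldName, text);
            TextFragment[] frags = highlighter.getBestTextFragments(tstream, text, true, 3);
            for (TextFragment frag : frags) {
                if (frag != null && frag.getScore() > 0) {
                    System.out.println(frag); // e.g. "... into scored <em>fragments</em>."
                }
            }
        }
    }
}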
Use of org.apache.lucene.search.highlight.TextFragment in project jena by apache.
The class TextIndexLucene, method frags2string:
private String frags2string(final TextFragment[] frags, final HighlightOpts opts) {
    final StringBuilder sb = new StringBuilder();
    String sep = "";
    for (final TextFragment f : frags) {
        final String fragStr = f.toString();
        log.trace("found fragment {}", f);
        sb.append(sep);
        sb.append(opts.joinHi ? fragStr.replaceAll(opts.patternExpr, "$1") : fragStr);
        sep = opts.fragSep;
    }
    return sb.toString();
}
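frags2string concatenates fragments with opts.fragSep and, when opts.joinHi is set, collapses back-to-back highlight tags with a regex replacement. A hypothetical illustration of that replacement; the pattern below is an assumption chosen for demonstration, not the expression Jena actually builds into opts.patternExpr:

public class JoinHiSketch {
    public static void main(String[] args) {
        // Assumed stand-in for opts.patternExpr: capture the gap between two
        // adjacent highlight spans so "$1" keeps the whitespace and drops the tags.
        String patternExpr = "</em>(\\s+)<em>";
        String fragStr = "a <em>big</em> <em>data</em> example";
        System.out.println(fragStr.replaceAll(patternExpr, "$1"));
        // prints: a <em>big data</em> example
    }
}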
Use of org.apache.lucene.search.highlight.TextFragment in project zeppelin by apache.
The class LuceneSearch, method doSearch:
private List<Map<String, String>> doSearch(IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter highlighter) {
    List<Map<String, String>> matchingParagraphs = new ArrayList<>();
    ScoreDoc[] hits;
    try {
        hits = searcher.search(query, 20).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            LOGGER.debug("doc={} score={}", hits[i].doc, hits[i].score);
            int id = hits[i].doc;
            Document doc = searcher.doc(id);
            String path = doc.get(ID_FIELD);
            if (path != null) {
                LOGGER.debug("{}. {}", (i + 1), path);
                String title = doc.get("title");
                if (title != null) {
                    LOGGER.debug(" Title: {}", doc.get("title"));
                }
                String text = doc.get(SEARCH_FIELD_TEXT);
                String header = doc.get(SEARCH_FIELD_TITLE);
                String fragment = "";
                if (text != null) {
                    TokenStream tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, analyzer);
                    TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, true, 3);
                    LOGGER.debug(" {} fragments found for query '{}'", frags.length, query);
                    for (TextFragment frag : frags) {
                        if ((frag != null) && (frag.getScore() > 0)) {
                            LOGGER.debug(" Fragment: {}", frag);
                        }
                    }
                    fragment = (frags != null && frags.length > 0) ? frags[0].toString() : "";
                }
                if (header != null) {
                    TokenStream tokenTitle = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TITLE, analyzer);
                    TextFragment[] frgTitle = highlighter.getBestTextFragments(tokenTitle, header, true, 3);
                    header = (frgTitle != null && frgTitle.length > 0) ? frgTitle[0].toString() : "";
                } else {
                    header = "";
                }
                // path is <noteId>/paragraph/<paragraphId>
                matchingParagraphs.add(ImmutableMap.of("id", path, "name", title,
                        "snippet", fragment, "text", text, "header", header));
            } else {
                LOGGER.info("{}. No {} for this document", i + 1, ID_FIELD);
            }
        }
    } catch (IOException | InvalidTokenOffsetsException e) {
        LOGGER.error("Exception on searching for {}", query, e);
    }
    return matchingParagraphs;
}
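A hedged sketch of how doSearch's collaborators might be wired up by a caller. The query method below is hypothetical (assumed to live in the same class as doSearch, which is why it can call the private method), and the Directory argument stands in for however the class opens its index:

import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;

// Hypothetical companion method, not Zeppelin's actual code:
private List<Map<String, String>> query(Directory directory, String queryStr) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    Query query = new QueryParser(SEARCH_FIELD_TEXT, analyzer).parse(queryStr);
    // The usual highlighter pairing: the QueryScorer ranks fragments against
    // the query, the formatter wraps each match in the chosen tags.
    Highlighter highlighter = new Highlighter(
            new SimpleHTMLFormatter("<b>", "</b>"),
            new QueryScorer(query));
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        return doSearch(new IndexSearcher(reader), query, analyzer, highlighter);
    }
}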
Use of org.apache.lucene.search.highlight.TextFragment in project SSM by Intel-bigdata.
The class LuceneSearch, method doSearch:
private List<Map<String, String>> doSearch(IndexSearcher searcher, Query query, Analyzer analyzer, Highlighter highlighter) {
    List<Map<String, String>> matchingParagraphs = Lists.newArrayList();
    ScoreDoc[] hits;
    try {
        hits = searcher.search(query, 20).scoreDocs;
        for (int i = 0; i < hits.length; i++) {
            LOG.debug("doc={} score={}", hits[i].doc, hits[i].score);
            int id = hits[i].doc;
            Document doc = searcher.doc(id);
            String path = doc.get(ID_FIELD);
            if (path != null) {
                LOG.debug("{}. {}", i + 1, path);
                String title = doc.get("title");
                if (title != null) {
                    LOG.debug(" Title: {}", doc.get("title"));
                }
                String text = doc.get(SEARCH_FIELD_TEXT);
                String header = doc.get(SEARCH_FIELD_TITLE);
                String fragment = "";
                if (text != null) {
                    TokenStream tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TEXT, analyzer);
                    TextFragment[] frag = highlighter.getBestTextFragments(tokenStream, text, true, 3);
                    LOG.debug(" {} fragments found for query '{}'", frag.length, query);
                    for (int j = 0; j < frag.length; j++) {
                        if ((frag[j] != null) && (frag[j].getScore() > 0)) {
                            LOG.debug(" Fragment: {}", frag[j].toString());
                        }
                    }
                    fragment = (frag != null && frag.length > 0) ? frag[0].toString() : "";
                }
                if (header != null) {
                    TokenStream tokenTitle = TokenSources.getTokenStream(searcher.getIndexReader(), id, SEARCH_FIELD_TITLE, analyzer);
                    TextFragment[] frgTitle = highlighter.getBestTextFragments(tokenTitle, header, true, 3);
                    header = (frgTitle != null && frgTitle.length > 0) ? frgTitle[0].toString() : "";
                } else {
                    header = "";
                }
                // path is <noteId>/paragraph/<paragraphId>
                matchingParagraphs.add(ImmutableMap.of("id", path, "name", title,
                        "snippet", fragment, "text", text, "header", header));
            } else {
                LOG.info("{}. No {} for this document", i + 1, ID_FIELD);
            }
        }
    } catch (IOException | InvalidTokenOffsetsException e) {
        LOG.error("Exception on searching for {}", query, e);
    }
    return matchingParagraphs;
}
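Both doSearch variants rely on searcher.doc(id).get(...) returning the raw field text, which only works when those fields were stored at index time. A sketch of the indexing side under that assumption; the method, writer setup, and values are illustrative, not SSM's actual code:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;

// Assumed indexing routine: Field.Store.YES is what lets the search side
// read the text back for highlighting via searcher.doc(id).get(...).
void indexParagraph(IndexWriter writer) throws java.io.IOException {
    Document doc = new Document();
    doc.add(new StringField(ID_FIELD, "note1/paragraph/p1", Field.Store.YES));
    doc.add(new TextField(SEARCH_FIELD_TITLE, "Example title", Field.Store.YES));
    doc.add(new TextField(SEARCH_FIELD_TEXT, "Example paragraph body.", Field.Store.YES));
    writer.addDocument(doc);
}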
Use of org.apache.lucene.search.highlight.TextFragment in project jena by apache.
The class TextIndexLucene, method highlightResults:
private List<TextHit> highlightResults(ScoreDoc[] sDocs, IndexSearcher indexSearcher, Query query, List<String> fields, String highlight, String queryLang) throws IOException, InvalidTokenOffsetsException {
    List<TextHit> results = new ArrayList<>();
    HighlightOpts opts = new HighlightOpts(highlight);
    SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(opts.start, opts.end);
    Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(opts.fragSize));
    for (ScoreDoc sd : sDocs) {
        Document doc = indexSearcher.doc(sd.doc);
        String entity = doc.get(docDef.getEntityField());
        Node literal = null;
        String field = getDocField(doc, fields);
        String lexical = doc.get(field);
        Collection<Node> props = docDef.getPredicates(field);
        // pick one - should be only one normally
        Node prop = props.isEmpty() ? null : props.iterator().next();
        String docLang = doc.get(docDef.getLangField());
        String effectiveField = queryLang != null ? field + "_" + Util.getEffectiveLang(docLang, queryLang) : field;
        log.trace("highlightResults[{}]: {}, field: {}, lexical: {}, docLang: {}, effectiveField: {}", sd.doc, doc, field, lexical, docLang, effectiveField);
        if (lexical != null) {
            TokenStream tokenStream = indexAnalyzer.tokenStream(effectiveField, lexical);
            log.trace("tokenStream: {}", tokenStream.toString());
            TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, lexical, opts.joinFrags, opts.maxFrags);
            String rez = frags2string(frags, opts);
            log.trace("result: {}, #frags: {}", rez, frags.length);
            literal = NodeFactory.createLiteral(rez, docLang);
        }
        String graf = docDef.getGraphField() != null ? doc.get(docDef.getGraphField()) : null;
        Node graph = graf != null ? TextQueryFuncs.stringToNode(graf) : null;
        Node subject = TextQueryFuncs.stringToNode(entity);
        TextHit hit = new TextHit(subject, sd.score, literal, graph, prop);
        results.add(hit);
    }
    return results;
}
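The HighlightOpts fields map directly onto standard Lucene highlighter knobs: opts.start/opts.end feed the formatter, opts.fragSize the fragmenter, and opts.joinFrags/opts.maxFrags the getBestTextFragments call. A minimal, self-contained sketch of that mapping with assumed literal values standing in for the parsed options:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;

public class FragmenterSketch {
    public static void main(String[] args) throws Exception {
        // "**" and 40 are assumed stand-ins for opts.start/opts.end/opts.fragSize.
        Highlighter highlighter = new Highlighter(
                new SimpleHTMLFormatter("**", "**"),
                new QueryScorer(new TermQuery(new Term("label", "graph"))));
        highlighter.setTextFragmenter(new SimpleFragmenter(40)); // ~40-char fragments
        String lexical = "Apache Jena is a framework for building semantic web and linked data applications around an RDF graph.";
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            TokenStream ts = analyzer.tokenStream("label", lexical);
            // false/2 mirror opts.joinFrags and opts.maxFrags above
            TextFragment[] frags = highlighter.getBestTextFragments(ts, lexical, false, 2);
            for (TextFragment f : frags) {
                if (f != null && f.getScore() > 0) {
                    System.out.println(f);
                }
            }
        }
    }
}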