Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by Apache.
From the class AnalyzingInfixSuggester, the method highlight:
/** Override this method to customize the Object
 *  representing a single highlighted suggestion; the
 *  result is set on each {@link
 *  org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey} member. */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
  try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    StringBuilder sb = new StringBuilder();
    int upto = 0;
    while (ts.incrementToken()) {
      String token = termAtt.toString();
      int startOffset = offsetAtt.startOffset();
      int endOffset = offsetAtt.endOffset();
      if (upto < startOffset) {
        addNonMatch(sb, text.substring(upto, startOffset));
        upto = startOffset;
      } else if (upto > startOffset) {
        continue;
      }
      if (matchedTokens.contains(token)) {
        // Token matches.
        addWholeMatch(sb, text.substring(startOffset, endOffset), token);
        upto = endOffset;
      } else if (prefixToken != null && token.startsWith(prefixToken)) {
        addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
        upto = endOffset;
      }
    }
    ts.end();
    int endOffset = offsetAtt.endOffset();
    if (upto < endOffset) {
      addNonMatch(sb, text.substring(upto));
    }
    return sb.toString();
  }
}
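
Because highlight(), addNonMatch(), addWholeMatch() and addPrefixMatch() are all protected, the markup can be changed in a subclass without re-implementing the offset walking above. A minimal sketch, assuming the AnalyzingInfixSuggester(Directory, Analyzer) constructor; the subclass name and the <em> tags are illustrative, not part of the source:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.store.Directory;

import java.io.IOException;

public class EmHighlightSuggester extends AnalyzingInfixSuggester {

  public EmHighlightSuggester(Directory dir, Analyzer analyzer) throws IOException {
    super(dir, analyzer);
  }

  @Override
  protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
    // Wrap fully matched tokens in <em> instead of the default markup.
    sb.append("<em>").append(surface).append("</em>");
  }

  @Override
  protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
    // Highlight only the typed prefix of the final, partially entered token.
    int prefixLen = Math.min(prefixToken.length(), surface.length());
    sb.append("<em>").append(surface, 0, prefixLen).append("</em>");
    sb.append(surface.substring(prefixLen));
  }
}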
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by Apache.
From the class AnalyzingInfixSuggester, the method lookup:
/**
 * This is an advanced method providing the capability to send down to the suggester any
 * arbitrary Lucene query to be used to filter the result of the suggester.
 *
 * @param key the keyword being looked for
 * @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester. {@link #addContextToQuery} could be used to build this contextQuery.
 * @param num number of items to return
 * @param allTermsRequired whether all searched terms must match
 * @param doHighlight if true, the matching terms will be highlighted in the search results
 * @return the result of the suggester
 * @throws IOException if there is an IO exception while reading data from the index
 */
public List<LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
  if (searcherMgr == null) {
    throw new IllegalStateException("suggester was not built");
  }
  final BooleanClause.Occur occur;
  if (allTermsRequired) {
    occur = BooleanClause.Occur.MUST;
  } else {
    occur = BooleanClause.Occur.SHOULD;
  }
  BooleanQuery.Builder query;
  Set<String> matchedTokens;
  String prefixToken = null;
  try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
    //long t0 = System.currentTimeMillis();
    ts.reset();
    final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    String lastToken = null;
    query = new BooleanQuery.Builder();
    int maxEndOffset = -1;
    matchedTokens = new HashSet<>();
    while (ts.incrementToken()) {
      if (lastToken != null) {
        matchedTokens.add(lastToken);
        query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
      }
      lastToken = termAtt.toString();
      if (lastToken != null) {
        maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
      }
    }
    ts.end();
    if (lastToken != null) {
      Query lastQuery;
      if (maxEndOffset == offsetAtt.endOffset()) {
        // Use PrefixQuery (or the ngram equivalent) when
        // there were no trailing discarded chars in the
        // string (e.g. whitespace), so that if the query does
        // not end with a space we show prefix matches for
        // that token:
        lastQuery = getLastTokenQuery(lastToken);
        prefixToken = lastToken;
      } else {
        // Use TermQuery for an exact match if there were
        // trailing discarded chars (e.g. whitespace), so
        // that if the query ends with a space we only show
        // exact matches for that term:
        matchedTokens.add(lastToken);
        lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
      }
      if (lastQuery != null) {
        query.add(lastQuery, occur);
      }
    }
    if (contextQuery != null) {
      boolean allMustNot = true;
      for (BooleanClause clause : contextQuery.clauses()) {
        if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) {
          allMustNot = false;
          break;
        }
      }
      if (allMustNot) {
        // All are MUST_NOT: add the contextQuery to the main query instead (not as a sub-query)
        for (BooleanClause clause : contextQuery.clauses()) {
          query.add(clause);
        }
      } else if (allTermsRequired == false) {
        // We must carefully upgrade the query clauses to MUST:
        BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
        newQuery.add(query.build(), BooleanClause.Occur.MUST);
        newQuery.add(contextQuery, BooleanClause.Occur.MUST);
        query = newQuery;
      } else {
        // Add contextQuery as a sub-query
        query.add(contextQuery, BooleanClause.Occur.MUST);
      }
    }
  }
  // TODO: we could allow blended sort here, combining
  // weight w/ score. Now we ignore score and sort only
  // by weight:
  Query finalQuery = finishQuery(query, allTermsRequired);
  //System.out.println("finalQuery=" + finalQuery);
  // Sort by weight, descending:
  TopFieldCollector c = TopFieldCollector.create(SORT, num, true, false, false);
  // We sorted postings by weight during indexing, so we
  // only retrieve the first num hits now:
  Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
  List<LookupResult> results = null;
  SearcherManager mgr;
  IndexSearcher searcher;
  synchronized (searcherMgrLock) {
    // acquire & release on same SearcherManager, via local reference
    mgr = searcherMgr;
    searcher = mgr.acquire();
  }
  try {
    //System.out.println("got searcher=" + searcher);
    searcher.search(finalQuery, c2);
    TopFieldDocs hits = c.topDocs();
    // Slower way if postings are not pre-sorted by weight:
    // hits = searcher.search(query, null, num, SORT);
    results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
  } finally {
    mgr.release(searcher);
  }
  return results;
}
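
A usage sketch for the overload above, assuming the suggester has already been built and that its suggestions were indexed with contexts; the "scifi"/"fantasy" context values and the helper class are illustrative. As the javadoc suggests, addContextToQuery assembles the contextQuery:

import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.List;

public class ContextLookupExample {
  static List<LookupResult> suggest(AnalyzingInfixSuggester suggester, String userInput) throws IOException {
    BooleanQuery.Builder contexts = new BooleanQuery.Builder();
    // With only SHOULD clauses (and no MUST), at least one of the contexts must match.
    suggester.addContextToQuery(contexts, new BytesRef("scifi"), BooleanClause.Occur.SHOULD);
    suggester.addContextToQuery(contexts, new BytesRef("fantasy"), BooleanClause.Occur.SHOULD);
    // allTermsRequired = true, doHighlight = true
    return suggester.lookup(userInput, contexts.build(), 10, true, true);
  }
}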
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by Apache.
From the class AnalysisRequestHandlerBase, the method getQueryTokenSet:
/**
 * Analyzes the given text using the given analyzer and returns the produced tokens.
 *
 * @param query The query to analyze.
 * @param analyzer The analyzer to use.
 */
protected Set<BytesRef> getQueryTokenSet(String query, Analyzer analyzer) {
  try (TokenStream tokenStream = analyzer.tokenStream("", query)) {
    final Set<BytesRef> tokens = new HashSet<>();
    final TermToBytesRefAttribute bytesAtt = tokenStream.getAttribute(TermToBytesRefAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      tokens.add(BytesRef.deepCopyOf(bytesAtt.getBytesRef()));
    }
    tokenStream.end();
    return tokens;
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  }
}
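
The same reset/incrementToken/end pattern works outside the handler. A standalone sketch, assuming a WhitespaceAnalyzer purely for illustration (any Analyzer whose stream exposes TermToBytesRefAttribute would do):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

public class QueryTokenSetExample {
  public static void main(String[] args) throws IOException {
    Set<BytesRef> tokens = new HashSet<>();
    try (Analyzer analyzer = new WhitespaceAnalyzer();
         TokenStream ts = analyzer.tokenStream("", "quick brown fox quick")) {
      TermToBytesRefAttribute bytesAtt = ts.getAttribute(TermToBytesRefAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // getBytesRef() returns a reused buffer, so deep-copy before storing it in the set.
        tokens.add(BytesRef.deepCopyOf(bytesAtt.getBytesRef()));
      }
      ts.end();
    }
    System.out.println(tokens.size() + " distinct tokens"); // 3: quick, brown, fox
  }
}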
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by Apache.
From the class AnalysisRequestHandlerBase, the method analyzeValue:
/**
 * Analyzes the given value using the given Analyzer.
 *
 * @param value Value to analyze
 * @param context The {@link AnalysisContext analysis context}.
 *
 * @return NamedList containing the tokens produced by analyzing the given value
 */
protected NamedList<? extends Object> analyzeValue(String value, AnalysisContext context) {
  Analyzer analyzer = context.getAnalyzer();
  if (!TokenizerChain.class.isInstance(analyzer)) {
    try (TokenStream tokenStream = analyzer.tokenStream(context.getFieldName(), value)) {
      NamedList<List<NamedList>> namedList = new NamedList<>();
      namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(analyzeTokenStream(tokenStream), context));
      return namedList;
    } catch (IOException e) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e);
    }
  }
  TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
  CharFilterFactory[] cfiltfacs = tokenizerChain.getCharFilterFactories();
  TokenizerFactory tfac = tokenizerChain.getTokenizerFactory();
  TokenFilterFactory[] filtfacs = tokenizerChain.getTokenFilterFactories();
  NamedList<Object> namedList = new NamedList<>();
  if (0 < cfiltfacs.length) {
    String source = value;
    for (CharFilterFactory cfiltfac : cfiltfacs) {
      Reader reader = new StringReader(source);
      reader = cfiltfac.create(reader);
      source = writeCharStream(namedList, reader);
    }
  }
  TokenStream tokenStream = tfac.create();
  ((Tokenizer) tokenStream).setReader(tokenizerChain.initReader(null, new StringReader(value)));
  List<AttributeSource> tokens = analyzeTokenStream(tokenStream);
  namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
  ListBasedTokenStream listBasedTokenStream = new ListBasedTokenStream(tokenStream, tokens);
  for (TokenFilterFactory tokenFilterFactory : filtfacs) {
    for (final AttributeSource tok : tokens) {
      tok.getAttribute(TokenTrackingAttribute.class).freezeStage();
    }
    // overwrite the vars "tokenStream", "tokens", and "listBasedTokenStream"
    tokenStream = tokenFilterFactory.create(listBasedTokenStream);
    tokens = analyzeTokenStream(tokenStream);
    namedList.add(tokenStream.getClass().getName(), convertTokensToNamedLists(tokens, context));
    listBasedTokenStream = new ListBasedTokenStream(listBasedTokenStream, tokens);
  }
  return namedList;
}
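
For the TokenizerChain branch, the returned NamedList ends up with one entry per analysis stage, keyed by that stage's TokenStream class name (the keying is visible in the namedList.add calls above). A rough sketch of the resulting shape; the class names, the "text" key and the token contents are illustrative placeholders, and the char-filter entries added by writeCharStream are omitted:

import org.apache.solr.common.util.NamedList;

import java.util.ArrayList;
import java.util.List;

public class AnalysisResultShape {
  @SuppressWarnings({"rawtypes", "unchecked"})
  static NamedList<Object> exampleShape() {
    NamedList<Object> stages = new NamedList<>();
    // One NamedList per token; the attribute keys shown here are placeholders.
    List<NamedList> tokens = new ArrayList<>();
    NamedList token = new NamedList();
    token.add("text", "quick");
    tokens.add(token);
    // Tokenizer stage first, then one entry per token filter, each keyed by its class name.
    stages.add("org.apache.lucene.analysis.standard.StandardTokenizer", tokens);
    stages.add("org.apache.lucene.analysis.core.LowerCaseFilter", tokens);
    return stages;
  }
}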
Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by Apache.
From the class TermVectorReusingLeafReader, the method doHighlightingByHighlighter:
/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException {
  final SolrParams params = req.getParams();
  final String fieldName = schemaField.getName();
  final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
  // Technically this is the max *fragments* (snippets), not max values:
  int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
  if (mvToExamine <= 0 || mvToMatch <= 0) {
    return null;
  }
  int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
  if (maxCharsToAnalyze < 0) {
    // e.g. -1
    maxCharsToAnalyze = Integer.MAX_VALUE;
  }
  List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
  if (fieldValues.isEmpty()) {
    return null;
  }
  // preserve order of values in a multiValued list
  boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
  int numFragments = getMaxSnippets(fieldName, params);
  boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
  List<TextFragment> frags = new ArrayList<>();
  // Try term vectors, which is faster
  // note: offsets are minimally sufficient for this HL.
  final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
  final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
  // We need to wrap in OffsetWindowTokenFilter if multi-valued
  final OffsetWindowTokenFilter tvWindowStream;
  if (tvStream != null && fieldValues.size() > 1) {
    tvWindowStream = new OffsetWindowTokenFilter(tvStream);
  } else {
    tvWindowStream = null;
  }
  for (String thisText : fieldValues) {
    if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
      break;
    }
    TokenStream tstream;
    if (tvWindowStream != null) {
      // if we have a multi-valued field with term vectors, then get the next offset window
      tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
    } else if (tvStream != null) {
      // single-valued with term vectors
      tstream = tvStream;
    } else {
      // fall back to analyzer
      tstream = createAnalyzerTStream(schemaField, thisText);
    }
    Highlighter highlighter;
    if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
      // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
      // needs to implement reset() efficiently.
      // If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
      // It should be okay if OffsetLimit won't get applied in this case.
      final TokenStream tempTokenStream;
      if (tstream != tvStream) {
        if (maxCharsToAnalyze >= thisText.length()) {
          tempTokenStream = new CachingTokenFilter(tstream);
        } else {
          tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
        }
      } else {
        tempTokenStream = tstream;
      }
      // get highlighter
      highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
      // if the CachingTokenFilter was consumed then use it going forward.
      if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
        tstream = tempTokenStream;
      }
      // tstream.reset(); not needed; getBestTextFragments will reset it.
    } else {
      // use "the old way"
      highlighter = getHighlighter(query, fieldName, req);
    }
    highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
    maxCharsToAnalyze -= thisText.length();
    // Highlight!
    try {
      TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
      for (TextFragment bestTextFragment : bestTextFragments) {
        if (bestTextFragment == null) {
          // can happen via mergeContiguousFragments
          continue;
        }
        // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
        if (bestTextFragment.getScore() > 0 || preserveMulti) {
          frags.add(bestTextFragment);
          if (bestTextFragment.getScore() > 0) {
            // note: limits fragments (for multi-valued fields), not quite the number of values
            --mvToMatch;
          }
        }
      }
    } catch (InvalidTokenOffsetsException e) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    }
  }
  // Put the fragments onto the Solr response (docSummaries)
  if (frags.size() > 0) {
    // sort such that the fragments with the highest score come first
    if (!preserveMulti) {
      Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
    }
    // Truncate list to hl.snippets, but not when hl.preserveMulti
    if (frags.size() > numFragments && !preserveMulti) {
      frags = frags.subList(0, numFragments);
    }
    return getResponseForFragments(frags, req);
  }
  // no highlights for this field
  return null;
}
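
The per-field knobs read through HighlightParams above correspond to the standard hl.* request parameters. A hedged sketch of setting them with SolrJ; the query and field names are illustrative, while the parameter names are the standard Solr highlighting params:

import org.apache.solr.client.solrj.SolrQuery;

public class HighlightParamsExample {
  static SolrQuery buildQuery() {
    SolrQuery q = new SolrQuery("features:lucene");
    q.setHighlight(true);
    q.set("hl.fl", "features");                // field(s) to highlight
    q.set("hl.snippets", "3");                 // numFragments above
    q.set("hl.maxAnalyzedChars", "51200");     // maxCharsToAnalyze above
    q.set("hl.preserveMulti", "true");         // keep multiValued order, return unscored snippets too
    q.set("hl.usePhraseHighlighter", "true");  // take the getPhraseHighlighter branch above
    q.set("hl.mergeContiguous", "false");      // mergeContiguousFragments above
    return q;
  }
}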