Example 1 with CachingTokenFilter

Use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-solr by apache.

In class TermVectorReusingLeafReader, method doHighlightingByHighlighter:

/** Highlights and returns the highlight object for this field -- a String[] by default. Null if none. */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();
    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }
    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) {
        //e.g. -1
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }
    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }
    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
    List<TextFragment> frags = new ArrayList<>();
    //Try term vectors, which is faster
    //  note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
    //  We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }
    for (String thisText : fieldValues) {
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }
        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            // single-valued with term vectors
            tstream = tvStream;
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }
        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
            // needs to implement reset() efficiently.
            //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
            //  It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }
            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
            // if the CachingTokenFilter was consumed then use it going forward.
            if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
        //tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        maxCharsToAnalyze -= thisText.length();
        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment == null) {
                    // can happen via mergeContiguousFragments
                    continue;
                }
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0)
                        // note: limits fragments (for multi-valued fields), not quite the number of values
                        --mvToMatch;
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }
        // Truncate list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    //no highlights for this field
    return null;
}
Also used: OffsetLimitTokenFilter (org.apache.lucene.search.highlight.OffsetLimitTokenFilter), TokenStream (org.apache.lucene.analysis.TokenStream), ArrayList (java.util.ArrayList), TextFragment (org.apache.lucene.search.highlight.TextFragment), Fields (org.apache.lucene.index.Fields), CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter), InvalidTokenOffsetsException (org.apache.lucene.search.highlight.InvalidTokenOffsetsException), SolrParams (org.apache.solr.common.params.SolrParams), MapSolrParams (org.apache.solr.common.params.MapSolrParams), SolrException (org.apache.solr.common.SolrException), Highlighter (org.apache.lucene.search.highlight.Highlighter), FastVectorHighlighter (org.apache.lucene.search.vectorhighlight.FastVectorHighlighter)
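
The comments in this method capture the core trick on this page: getPhraseHighlighter may fully consume the stream it is handed, and the CachingTokenFilter wrapper is what lets getBestTextFragments replay the very same tokens instead of re-analyzing the text. A minimal, self-contained sketch of that consume-then-replay behavior (not from the Solr source; the field name and sample text are placeholders):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CachingReplaySketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        TokenStream raw = analyzer.tokenStream("body", "cache once replay twice");
        CachingTokenFilter cached = new CachingTokenFilter(raw);
        CharTermAttribute term = cached.addAttribute(CharTermAttribute.class);
        // First consumer (the phrase highlighter in the method above): the first
        // incrementToken() call fills the cache from the wrapped stream.
        cached.reset();
        while (cached.incrementToken()) {
            // first pass: inspect tokens, build queries, etc.
        }
        cached.end();
        // Second consumer: isCached() now reports true, and reset() rewinds the
        // cache iterator instead of the already-exhausted underlying stream.
        if (cached.isCached()) {
            cached.reset();
            while (cached.incrementToken()) {
                System.out.println(term);
            }
            cached.end();
        }
        cached.close();
    }
}

This is exactly why the method swaps tstream to the CachingTokenFilter once isCached() reports true: after the first pass, the cache is the only stream that can still be reset.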

Example 2 with CachingTokenFilter

Use of org.apache.lucene.analysis.CachingTokenFilter in project SearchServices by Alfresco.

In class AlfrescoSolrHighlighter, method doHighlightingByHighlighter. This variant tracks the Solr implementation in Example 1 almost line for line; the visible difference is that the raw field value is passed through fixLocalisedText(...) before fragment extraction:

/**
 * Highlights and returns the highlight object for this field -- a String[] by default. Null if none.
 */
@SuppressWarnings("unchecked")
protected Object doHighlightingByHighlighter(Document doc, int docId, SchemaField schemaField, Query query, IndexReader reader, SolrQueryRequest req) throws IOException {
    final SolrParams params = req.getParams();
    final String fieldName = schemaField.getName();
    final int mvToExamine = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE, (schemaField.multiValued()) ? Integer.MAX_VALUE : 1);
    // Technically this is the max *fragments* (snippets), not max values:
    int mvToMatch = params.getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
    if (mvToExamine <= 0 || mvToMatch <= 0) {
        return null;
    }
    int maxCharsToAnalyze = params.getFieldInt(fieldName, HighlightParams.MAX_CHARS, DEFAULT_MAX_CHARS);
    if (maxCharsToAnalyze < 0) {
        // e.g. -1
        maxCharsToAnalyze = Integer.MAX_VALUE;
    }
    List<String> fieldValues = getFieldValues(doc, fieldName, mvToExamine, maxCharsToAnalyze, req);
    if (fieldValues.isEmpty()) {
        return null;
    }
    // preserve order of values in a multiValued list
    boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
    int numFragments = getMaxSnippets(fieldName, params);
    boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
    List<TextFragment> frags = new ArrayList<>();
    // Try term vectors, which is faster
    // note: offsets are minimally sufficient for this HL.
    final Fields tvFields = schemaField.storeTermOffsets() ? reader.getTermVectors(docId) : null;
    final TokenStream tvStream = TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
    // We need to wrap in OffsetWindowTokenFilter if multi-valued
    final OffsetWindowTokenFilter tvWindowStream;
    if (tvStream != null && fieldValues.size() > 1) {
        tvWindowStream = new OffsetWindowTokenFilter(tvStream);
    } else {
        tvWindowStream = null;
    }
    for (String thisText : fieldValues) {
        if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
            break;
        }
        TokenStream tstream;
        if (tvWindowStream != null) {
            // if we have a multi-valued field with term vectors, then get the next offset window
            tstream = tvWindowStream.advanceToNextWindowOfLength(thisText.length());
        } else if (tvStream != null) {
            // single-valued with term vectors
            tstream = tvStream;
        } else {
            // fall back to analyzer
            tstream = createAnalyzerTStream(schemaField, thisText);
        }
        Highlighter highlighter;
        if (params.getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
            // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
            // needs to implement reset() efficiently.
            // If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
            // It should be okay if OffsetLimit won't get applied in this case.
            final TokenStream tempTokenStream;
            if (tstream != tvStream) {
                if (maxCharsToAnalyze >= thisText.length()) {
                    tempTokenStream = new CachingTokenFilter(tstream);
                } else {
                    tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
                }
            } else {
                tempTokenStream = tstream;
            }
            // get highlighter
            highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
            // if the CachingTokenFilter was consumed then use it going forward.
            if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter) tempTokenStream).isCached()) {
                tstream = tempTokenStream;
            }
        // tstream.reset(); not needed; getBestTextFragments will reset it.
        } else {
            // use "the old way"
            highlighter = getHighlighter(query, fieldName, req);
        }
        highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
        maxCharsToAnalyze -= thisText.length();
        // Highlight!
        try {
            TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, fixLocalisedText(thisText), mergeContiguousFragments, numFragments);
            for (TextFragment bestTextFragment : bestTextFragments) {
                if (bestTextFragment == null) {
                    // can happen via mergeContiguousFragments
                    continue;
                }
                // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
                if (bestTextFragment.getScore() > 0 || preserveMulti) {
                    frags.add(bestTextFragment);
                    if (bestTextFragment.getScore() > 0)
                        // note: limits fragments (for multi-valued fields), not quite the number of values
                        --mvToMatch;
                }
            }
        } catch (InvalidTokenOffsetsException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
    }
    // Put the fragments onto the Solr response (docSummaries)
    if (frags.size() > 0) {
        // sort such that the fragments with the highest score come first
        if (!preserveMulti) {
            Collections.sort(frags, (arg0, arg1) -> Float.compare(arg1.getScore(), arg0.getScore()));
        }
        // Truncate list to hl.snippets, but not when hl.preserveMulti
        if (frags.size() > numFragments && !preserveMulti) {
            frags = frags.subList(0, numFragments);
        }
        return getResponseForFragments(frags, req);
    }
    // no highlights for this field
    return null;
}
Also used: OffsetLimitTokenFilter (org.apache.lucene.search.highlight.OffsetLimitTokenFilter), TokenStream (org.apache.lucene.analysis.TokenStream), ArrayList (java.util.ArrayList), TextFragment (org.apache.lucene.search.highlight.TextFragment), Fields (org.apache.lucene.index.Fields), CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter), InvalidTokenOffsetsException (org.apache.lucene.search.highlight.InvalidTokenOffsetsException), SolrParams (org.apache.solr.common.params.SolrParams), ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams), SolrException (org.apache.solr.common.SolrException), Highlighter (org.apache.lucene.search.highlight.Highlighter), FastVectorHighlighter (org.apache.lucene.search.vectorhighlight.FastVectorHighlighter), SolrHighlighter (org.apache.solr.highlight.SolrHighlighter), DefaultSolrHighlighter (org.apache.solr.highlight.DefaultSolrHighlighter)
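
Both variants also bound the analysis cost the same way: when a value is longer than maxCharsToAnalyze, the analyzer stream is truncated with OffsetLimitTokenFilter before it is cached. A small sketch of that composition in isolation (stock Lucene classes; the limit and sample text are placeholders):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.highlight.OffsetLimitTokenFilter;

public class OffsetLimitSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        // placeholder limit; the highlighter code uses the per-field maxCharsToAnalyze
        int offsetLimit = 10;
        TokenStream raw = analyzer.tokenStream("f", "alpha beta gamma delta");
        // OffsetLimitTokenFilter stops emitting tokens once roughly offsetLimit
        // characters' worth of tokens have passed, so the cache stays small even
        // when the stored field value is huge.
        TokenStream limited = new CachingTokenFilter(new OffsetLimitTokenFilter(raw, offsetLimit));
        CharTermAttribute term = limited.addAttribute(CharTermAttribute.class);
        limited.reset();
        while (limited.incrementToken()) {
            System.out.println(term); // the tail of the value ("delta") is cut off
        }
        limited.end();
        limited.close();
    }
}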

Example 3 with CachingTokenFilter

Use of org.apache.lucene.analysis.CachingTokenFilter in project pyramid by cheng-li.

In class PhraseCountQueryBuilder, method doToQuery:

protected Query doToQuery(QueryShardContext context) throws IOException {
    // Analyzer analyzer = context.getMapperService().searchAnalyzer();
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream source = analyzer.tokenStream(fieldName, value.toString())) {
        CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source));
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        if (termAtt == null) {
            return null;
        }
        List<CustomSpanTermQuery> clauses = new ArrayList<>();
        stream.reset();
        while (stream.incrementToken()) {
            Term term = new Term(fieldName, termAtt.getBytesRef());
            clauses.add(new CustomSpanTermQuery(term));
        }
        // honor the TokenStream contract: end() should follow the last incrementToken()
        stream.end();
        return new PhraseCountQuery(clauses.toArray(new CustomSpanTermQuery[clauses.size()]), slop, inOrder, weightedCount);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
Also used: WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), TokenStream (org.apache.lucene.analysis.TokenStream), ArrayList (java.util.ArrayList), Term (org.apache.lucene.index.Term), IOException (java.io.IOException), Analyzer (org.apache.lucene.analysis.Analyzer), CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
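
PhraseCountQuery and CustomSpanTermQuery are pyramid-specific classes, but the surrounding analyze-then-build loop is the standard way to turn a phrase into span clauses. For comparison, a hedged sketch of the same loop targeting stock Lucene SpanTermQuery/SpanNearQuery (the class and method names are mine, and the explicit BytesRef copy is a defensive addition):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.BytesRef;

public final class SpanPhraseSketch {

    /** Analyzes text into one SpanTermQuery per token and joins them with SpanNearQuery. */
    public static SpanQuery toSpanNear(String field, String text, int slop, boolean inOrder) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        try (TokenStream stream = analyzer.tokenStream(field, text)) {
            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
            if (termAtt == null) {
                return null; // the stream produces no terms for this field
            }
            List<SpanQuery> clauses = new ArrayList<>();
            stream.reset();
            while (stream.incrementToken()) {
                // deepCopyOf: the attribute's BytesRef instance is reused between tokens
                clauses.add(new SpanTermQuery(new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()))));
            }
            stream.end();
            if (clauses.isEmpty()) {
                return null; // nothing tokenized
            }
            return new SpanNearQuery(clauses.toArray(new SpanQuery[0]), slop, inOrder);
        }
    }
}

Note that the CachingTokenFilter in the original buys nothing in a single-pass loop like this; it pays off only when the same tokens must be consumed more than once, as in the highlighter examples above.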

Example 4 with CachingTokenFilter

Use of org.apache.lucene.analysis.CachingTokenFilter in project janusgraph by JanusGraph.

In class LuceneIndex, method customTokenize:

// adapted from SolrIndex
private List<List<String>> customTokenize(Analyzer analyzer, String fieldName, String value) {
    Map<Integer, List<String>> stemsByOffset = new HashMap<>();
    try (CachingTokenFilter stream = new CachingTokenFilter(analyzer.tokenStream(fieldName, value))) {
        final OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);
        final TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            int offset = offsetAtt.startOffset();
            String stem = termAtt.getBytesRef().utf8ToString();
            List<String> stemList = stemsByOffset.get(offset);
            if (stemList == null) {
                stemList = new ArrayList<>();
                stemsByOffset.put(offset, stemList);
            }
            stemList.add(stem);
        }
        return new ArrayList<>(stemsByOffset.values());
    } catch (IOException e) {
        throw new IllegalArgumentException(e.getMessage(), e);
    }
}
Also used: ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), IOException (java.io.IOException), DoublePoint (org.apache.lucene.document.DoublePoint), LongPoint (org.apache.lucene.document.LongPoint), IntPoint (org.apache.lucene.document.IntPoint), CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), List (java.util.List)
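
One subtlety in customTokenize: stemsByOffset is a HashMap, so the relative order of the returned groups is unspecified rather than following text order. A hedged variant (my rewrite, not the JanusGraph code) that keys a TreeMap by start offset so the groups come back in document order, uses computeIfAbsent in place of the get/put dance, and drops the CachingTokenFilter since the stream is consumed only once:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;

public final class OffsetGroupingSketch {

    /** Groups analyzed terms by start offset, returning the groups in text order. */
    public static List<List<String>> tokenizeGrouped(Analyzer analyzer, String fieldName, String value) {
        SortedMap<Integer, List<String>> stemsByOffset = new TreeMap<>();
        try (TokenStream stream = analyzer.tokenStream(fieldName, value)) {
            OffsetAttribute offsetAtt = stream.getAttribute(OffsetAttribute.class);
            TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // tokens stacked at the same start offset (synonyms, alternate stems) share a group
                stemsByOffset.computeIfAbsent(offsetAtt.startOffset(), k -> new ArrayList<>())
                        .add(termAtt.getBytesRef().utf8ToString());
            }
            stream.end();
        } catch (IOException e) {
            throw new IllegalArgumentException(e.getMessage(), e);
        }
        return new ArrayList<>(stemsByOffset.values());
    }
}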

Example 5 with CachingTokenFilter

Use of org.apache.lucene.analysis.CachingTokenFilter in project lucene-solr by apache.

In class TestTeeSinkTokenFilter, method testMultipleSources:

public void testMultipleSources() throws Exception {
    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
    final TokenStream source1 = new CachingTokenFilter(tee1);
    tee1.addAttribute(CheckClearAttributesAttribute.class);
    MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader(buffer2.toString()));
    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
    final TokenStream source2 = tee2;
    assertTokenStreamContents(source1, tokens1);
    assertTokenStreamContents(source2, tokens2);
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    String[] lowerCaseTokens = new String[tokens1.length];
    for (int i = 0; i < tokens1.length; i++) {
        lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
    }
    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), TokenStream (org.apache.lucene.analysis.TokenStream), CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter), StringReader (java.io.StringReader), LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
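
The test wires CachingTokenFilter and TeeSinkTokenFilter together; the point of the latter is analyze once, consume many times. A minimal sketch of the usual TeeSinkTokenFilter pattern (assuming the Lucene 6+ API, where newSinkTokenStream() takes no arguments; field and text are placeholders):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TeeSinkSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        TokenStream source = analyzer.tokenStream("f", "one stream two consumers");
        TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
        TokenStream sink1 = tee.newSinkTokenStream();
        TokenStream sink2 = tee.newSinkTokenStream();
        // Consuming the tee pushes every token state into each registered sink.
        tee.reset();
        tee.consumeAllTokens();
        tee.end();
        tee.close();
        // Each sink then replays the captured tokens independently.
        for (TokenStream sink : new TokenStream[] { sink1, sink2 }) {
            CharTermAttribute term = sink.addAttribute(CharTermAttribute.class);
            sink.reset();
            while (sink.incrementToken()) {
                System.out.println(term);
            }
            sink.end();
            sink.close();
        }
    }
}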

Aggregations

CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter): 12
TokenStream (org.apache.lucene.analysis.TokenStream): 8
ArrayList (java.util.ArrayList): 7
IOException (java.io.IOException): 6
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 4
Analyzer (org.apache.lucene.analysis.Analyzer): 3
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 3
StringReader (java.io.StringReader): 2
LinkedList (java.util.LinkedList): 2
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 2
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 2
Document (org.apache.lucene.document.Document): 2
Field (org.apache.lucene.document.Field): 2
StoredField (org.apache.lucene.document.StoredField): 2
TextField (org.apache.lucene.document.TextField): 2
Fields (org.apache.lucene.index.Fields): 2
BooleanQueryNode (org.apache.lucene.queryparser.flexible.core.nodes.BooleanQueryNode): 2
FieldQueryNode (org.apache.lucene.queryparser.flexible.core.nodes.FieldQueryNode): 2
FuzzyQueryNode (org.apache.lucene.queryparser.flexible.core.nodes.FuzzyQueryNode): 2