Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class FreeTextSuggester, method lookup.
/** Retrieve suggestions. */
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
throw new IllegalStateException("Lookup not supported at this time");
}
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
//System.out.println("lookup: key='" + key + "'");
// Run full analysis, but save only the
// last 1gram, last 2gram, etc.:
int maxEndOffset = -1;
boolean sawRealToken = false;
while (ts.incrementToken()) {
BytesRef tokenBytes = termBytesAtt.getBytesRef();
sawRealToken |= tokenBytes.length > 0;
// TODO: this is somewhat iffy; today, ShingleFilter
// sets posLen to the gram count; maybe we should make
// a separate dedicated att for this?
int gramCount = posLenAtt.getPositionLength();
assert gramCount <= grams;
// Safety: make sure the recalculated count "agrees":
if (countGrams(tokenBytes) != gramCount) {
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
}
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
BytesRefBuilder b = new BytesRefBuilder();
b.append(tokenBytes);
lastTokens[gramCount - 1] = b;
}
ts.end();
if (!sawRealToken) {
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
}
// Carefully fill last tokens with _ tokens;
// ShingleFilter apparently won't emit "only hole"
// tokens:
int endPosInc = posIncAtt.getPositionIncrement();
// Note this will also be true if input is the empty
// string (in which case we saw no tokens and
// maxEndOffset is still -1), which in fact works out OK
// because we fill the unigram with an empty BytesRef
// below:
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
if (lastTokenEnded) {
// The user ended the last token (e.g. with a trailing space), so "upgrade"
// each gram by appending the separator; this way "foo " suggests bigrams
// starting with "foo", not unigrams starting with "foo":
for (int i = grams - 1; i > 0; i--) {
BytesRefBuilder token = lastTokens[i - 1];
if (token == null) {
continue;
}
token.append(separator);
lastTokens[i] = token;
}
lastTokens[0] = new BytesRefBuilder();
}
Arc<Long> arc = new Arc<>();
BytesReader bytesReader = fst.getBytesReader();
// Try highest order models first, and if they return
// results, return that; else, fallback:
double backoff = 1.0;
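// ("Stupid backoff" smoothing: each time we fall back to a lower-order model
// below, candidate scores are multiplied by ALPHA, so lower-order predictions
// are penalized relative to higher-order ones.)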
List<LookupResult> results = new ArrayList<>(num);
// We only add a given suffix once, from the highest
// order model that saw it; for subsequent lower order
// models we skip it:
final Set<BytesRef> seen = new HashSet<>();
for (int gram = grams - 1; gram >= 0; gram--) {
BytesRefBuilder token = lastTokens[gram];
// Don't make unigram predictions from empty string:
if (token == null || (token.length() == 0 && key.length() > 0)) {
//System.out.println(" gram=" + gram + ": skip: not enough input");
continue;
}
if (endPosInc > 0 && gram <= endPosInc) {
//System.out.println(" break: only holes now");
break;
}
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
// TODO: we could add fuzziness here
// match the prefix portion exactly
//Pair<Long,BytesRef> prefixOutput = null;
Long prefixOutput = null;
try {
prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
if (prefixOutput == null) {
// This model never saw this prefix, e.g. the
// trigram model never saw context "purple mushroom"
backoff *= ALPHA;
continue;
}
// TODO: we could do this division at build time, and
// bake it into the FST?
// Denominator for computing scores from current
// model's predictions:
long contextCount = totTokens;
BytesRef lastTokenFragment = null;
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
BytesRef context = new BytesRef(token.bytes(), 0, i);
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
assert output != null;
contextCount = decodeWeight(output);
lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
final BytesRefBuilder finalLastToken = new BytesRefBuilder();
if (lastTokenFragment == null) {
finalLastToken.copyBytes(token.get());
} else {
finalLastToken.copyBytes(lastTokenFragment);
}
CharsRefBuilder spare = new CharsRefBuilder();
// complete top-N
TopResults<Long> completions = null;
try {
// Because we store multiple models in one FST
// (1gram, 2gram, 3gram), we must restrict the
// search so that it only considers the current
// model. For highest order model, this is not
// necessary since all completions in the FST
// must be from this model, but for lower order
// models we have to filter out the higher order
// ones:
// Must do num+seen.size() for queue depth because we may
// reject up to seen.size() paths in acceptResult():
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {
BytesRefBuilder scratchBytes = new BytesRefBuilder();
@Override
protected void addIfCompetitive(Util.FSTPath<Long> path) {
if (path.arc.label != separator) {
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
super.addIfCompetitive(path);
} else {
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
}
}
@Override
protected boolean acceptResult(IntsRef input, Long output) {
Util.toBytesRef(input, scratchBytes);
finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
int lenSav = finalLastToken.length();
finalLastToken.append(scratchBytes);
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
boolean ret = seen.contains(finalLastToken.get()) == false;
finalLastToken.setLength(lenSav);
return ret;
}
};
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
completions = searcher.search();
assert completions.isComplete;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
int prefixLength = token.length();
BytesRefBuilder suffix = new BytesRefBuilder();
nextCompletion: for (Result<Long> completion : completions) {
token.setLength(prefixLength);
// append suffix
Util.toBytesRef(completion.input, suffix);
token.append(suffix);
//System.out.println(" completion " + token.utf8ToString());
// Skip this path if a higher-order model already
// saw/predicted its last token:
BytesRef lastToken = token.get();
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
assert token.length() - i - 1 > 0;
lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
if (seen.contains(lastToken)) {
//System.out.println(" skip dup " + lastToken.utf8ToString());
continue nextCompletion;
}
seen.add(BytesRef.deepCopyOf(lastToken));
spare.copyUTF8Bytes(token.get());
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
results.add(result);
assert results.size() == seen.size();
//System.out.println(" add result=" + result);
}
backoff *= ALPHA;
}
Collections.sort(results, new Comparator<LookupResult>() {
@Override
public int compare(LookupResult a, LookupResult b) {
if (a.value > b.value) {
return -1;
} else if (a.value < b.value) {
return 1;
} else {
// Tie break by UTF16 sort order:
return ((String) a.key).compareTo((String) b.key);
}
}
});
if (results.size() > num) {
results.subList(num, results.size()).clear();
}
return results;
}
}
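For context, a minimal sketch of driving this lookup end to end, assuming a StandardAnalyzer and a plain-text corpus; the class name, corpus path, and query string below are illustrative, not part of the Lucene source:

  import java.nio.file.Files;
  import java.nio.file.Paths;
  import java.util.Set;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.search.spell.PlainTextDictionary;
  import org.apache.lucene.search.suggest.Lookup.LookupResult;
  import org.apache.lucene.search.suggest.analyzing.FreeTextSuggester;
  import org.apache.lucene.util.BytesRef;

  public class FreeTextLookupDemo {
    public static void main(String[] args) throws Exception {
      Analyzer analyzer = new StandardAnalyzer();
      // Defaults: bigram model, 0x1E as the gram separator byte.
      FreeTextSuggester suggester = new FreeTextSuggester(analyzer);
      // Build the ngram FST from a plain-text corpus, one entry per line (path is hypothetical):
      suggester.build(new PlainTextDictionary(Files.newBufferedReader(Paths.get("corpus.txt"))));
      // Contexts must be null; the lookup above rejects anything else:
      Set<BytesRef> contexts = null;
      for (LookupResult result : suggester.lookup("purple mush", contexts, 5)) {
        System.out.println(result.key + " => " + result.value);
      }
      analyzer.close();
    }
  }

The returned value is the scaled score computed above (Long.MAX_VALUE * backoff * count / contextCount), so it is meaningful only for ranking, not as an absolute weight.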
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class AnalyzingInfixSuggester, method highlight.
/** Override this method to customize the Object
* representing a single highlighted suggestion; the
* result is set on each {@link
* org.apache.lucene.search.suggest.Lookup.LookupResult#highlightKey} member. */
protected Object highlight(String text, Set<String> matchedTokens, String prefixToken) throws IOException {
try (TokenStream ts = queryAnalyzer.tokenStream("text", new StringReader(text))) {
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
StringBuilder sb = new StringBuilder();
int upto = 0;
while (ts.incrementToken()) {
String token = termAtt.toString();
int startOffset = offsetAtt.startOffset();
int endOffset = offsetAtt.endOffset();
if (upto < startOffset) {
addNonMatch(sb, text.substring(upto, startOffset));
upto = startOffset;
} else if (upto > startOffset) {
continue;
}
if (matchedTokens.contains(token)) {
// Token matches.
addWholeMatch(sb, text.substring(startOffset, endOffset), token);
upto = endOffset;
} else if (prefixToken != null && token.startsWith(prefixToken)) {
addPrefixMatch(sb, text.substring(startOffset, endOffset), token, prefixToken);
upto = endOffset;
}
}
ts.end();
int endOffset = offsetAtt.endOffset();
if (upto < endOffset) {
addNonMatch(sb, text.substring(upto));
}
return sb.toString();
}
}
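The markup itself comes from the addNonMatch, addWholeMatch, and addPrefixMatch hooks that this method calls; the defaults wrap matches in <b> tags. A hedged sketch of overriding two of them in a subclass (the class name and <em> markup are illustrative):

  import java.io.IOException;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
  import org.apache.lucene.store.Directory;

  public class EmphasisInfixSuggester extends AnalyzingInfixSuggester {

    public EmphasisInfixSuggester(Directory dir, Analyzer analyzer) throws IOException {
      super(dir, analyzer);
    }

    @Override
    protected void addWholeMatch(StringBuilder sb, String surface, String analyzed) {
      // A fully matched token: wrap its original (surface) form.
      sb.append("<em>").append(surface).append("</em>");
    }

    @Override
    protected void addPrefixMatch(StringBuilder sb, String surface, String analyzed, String prefixToken) {
      // Only the typed prefix matched: highlight just that leading portion.
      int prefixLen = Math.min(prefixToken.length(), surface.length());
      sb.append("<em>").append(surface, 0, prefixLen).append("</em>");
      sb.append(surface.substring(prefixLen));
    }
  }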
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class AnalyzingInfixSuggester, method lookup.
/**
* This is an advanced method providing the capability to send down to the suggester any
* arbitrary Lucene query to be used to filter the result of the suggester
*
* @param key the keyword being looked for
* @param contextQuery an arbitrary Lucene query to be used to filter the result of the suggester. {@link #addContextToQuery} could be used to build this contextQuery.
* @param num number of items to return
* @param allTermsRequired whether all searched terms must match
* @param doHighlight if true, the matching term will be highlighted in the search result
* @return the result of the suggester
* @throws IOException if there is an IO exception while reading data from the index
*/
public List<LookupResult> lookup(CharSequence key, BooleanQuery contextQuery, int num, boolean allTermsRequired, boolean doHighlight) throws IOException {
if (searcherMgr == null) {
throw new IllegalStateException("suggester was not built");
}
final BooleanClause.Occur occur;
if (allTermsRequired) {
occur = BooleanClause.Occur.MUST;
} else {
occur = BooleanClause.Occur.SHOULD;
}
BooleanQuery.Builder query;
Set<String> matchedTokens;
String prefixToken = null;
try (TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()))) {
//long t0 = System.currentTimeMillis();
ts.reset();
final CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
String lastToken = null;
query = new BooleanQuery.Builder();
int maxEndOffset = -1;
matchedTokens = new HashSet<>();
while (ts.incrementToken()) {
if (lastToken != null) {
matchedTokens.add(lastToken);
query.add(new TermQuery(new Term(TEXT_FIELD_NAME, lastToken)), occur);
}
lastToken = termAtt.toString();
if (lastToken != null) {
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
}
}
ts.end();
if (lastToken != null) {
Query lastQuery;
if (maxEndOffset == offsetAtt.endOffset()) {
// Use PrefixQuery (or the ngram equivalent) when
// there were no trailing discarded chars in the
// string (e.g. whitespace), so that if query does
// not end with a space we show prefix matches for
// that token:
lastQuery = getLastTokenQuery(lastToken);
prefixToken = lastToken;
} else {
// Use TermQuery for an exact match if there were
// trailing discarded chars (e.g. whitespace), so
// that if query ends with a space we only show
// exact matches for that term:
matchedTokens.add(lastToken);
lastQuery = new TermQuery(new Term(TEXT_FIELD_NAME, lastToken));
}
if (lastQuery != null) {
query.add(lastQuery, occur);
}
}
if (contextQuery != null) {
boolean allMustNot = true;
for (BooleanClause clause : contextQuery.clauses()) {
if (clause.getOccur() != BooleanClause.Occur.MUST_NOT) {
allMustNot = false;
break;
}
}
if (allMustNot) {
// All are MUST_NOT: add the contextQuery to the main query instead (not as sub-query)
for (BooleanClause clause : contextQuery.clauses()) {
query.add(clause);
}
} else if (allTermsRequired == false) {
// We must carefully upgrade the query clauses to MUST:
BooleanQuery.Builder newQuery = new BooleanQuery.Builder();
newQuery.add(query.build(), BooleanClause.Occur.MUST);
newQuery.add(contextQuery, BooleanClause.Occur.MUST);
query = newQuery;
} else {
// Add contextQuery as sub-query
query.add(contextQuery, BooleanClause.Occur.MUST);
}
}
}
// TODO: we could allow blended sort here, combining
// weight w/ score. Now we ignore score and sort only
// by weight:
Query finalQuery = finishQuery(query, allTermsRequired);
//System.out.println("finalQuery=" + finalQuery);
// Sort by weight, descending:
TopFieldCollector c = TopFieldCollector.create(SORT, num, true, false, false);
// We sorted postings by weight during indexing, so we
// only retrieve the first num hits now:
Collector c2 = new EarlyTerminatingSortingCollector(c, SORT, num);
List<LookupResult> results = null;
SearcherManager mgr;
IndexSearcher searcher;
synchronized (searcherMgrLock) {
// acquire & release on same SearcherManager, via local reference
mgr = searcherMgr;
searcher = mgr.acquire();
}
try {
//System.out.println("got searcher=" + searcher);
searcher.search(finalQuery, c2);
TopFieldDocs hits = c.topDocs();
// Slower way if postings are not pre-sorted by weight:
// hits = searcher.search(query, null, num, SORT);
results = createResults(searcher, hits, num, key, doHighlight, matchedTokens, prefixToken);
} finally {
mgr.release(searcher);
}
return results;
}
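A short sketch of calling this overload with a context filter; it assumes the suggester was built with per-entry contexts, and the suggester variable, context value, and query string are illustrative:

  import java.io.IOException;
  import java.util.List;
  import org.apache.lucene.search.BooleanClause;
  import org.apache.lucene.search.BooleanQuery;
  import org.apache.lucene.search.suggest.Lookup.LookupResult;
  import org.apache.lucene.search.suggest.analyzing.AnalyzingInfixSuggester;
  import org.apache.lucene.util.BytesRef;

  public class ContextLookupDemo {
    static List<LookupResult> suggestInBooks(AnalyzingInfixSuggester suggester, String userInput) throws IOException {
      // Require the "books" context; addContextToQuery adds the clause against the contexts field.
      BooleanQuery.Builder contextQuery = new BooleanQuery.Builder();
      suggester.addContextToQuery(contextQuery, new BytesRef("books"), BooleanClause.Occur.MUST);
      // allTermsRequired=true, doHighlight=true: highlightKey on each result carries the markup.
      return suggester.lookup(userInput, contextQuery.build(), 10, true, true);
    }
  }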
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class Highlighter, method getBestTextFragments (FragmentQueue is the helper priority queue it uses to rank fragments).
/**
* Low level api to get the most relevant (formatted) sections of the document.
* This method has been made public to allow visibility of score information held in TextFragment objects.
* Thanks to Jason Calabrese for help in redefining the interface.
* @throws IOException If there is a low-level I/O error
* @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
*/
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text, boolean mergeContiguousFragments, int maxNumFragments) throws IOException, InvalidTokenOffsetsException {
ArrayList<TextFragment> docFrags = new ArrayList<>();
StringBuilder newText = new StringBuilder();
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
if (fragmentScorer instanceof QueryScorer) {
((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
}
TokenStream newStream = fragmentScorer.init(tokenStream);
if (newStream != null) {
tokenStream = newStream;
}
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
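// fragQueue is a PriorityQueue ordered by fragment score; insertWithOverflow below
// keeps only the maxNumFragments best-scoring fragments, dropping the lowest on overflow.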
try {
String tokenText;
int startOffset;
int endOffset;
int lastEndOffset = 0;
textFragmenter.start(text, tokenStream);
TokenGroup tokenGroup = new TokenGroup(tokenStream);
tokenStream.reset();
for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
throw new InvalidTokenOffsetsException("Token " + termAtt.toString() + " exceeds length of provided text sized " + text.length());
}
if ((tokenGroup.getNumTokens() > 0) && (tokenGroup.isDistinct())) {
//the current token is distinct from previous tokens -
// markup the cached token group info
startOffset = tokenGroup.getStartOffset();
endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset);
String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset = Math.max(endOffset, lastEndOffset);
tokenGroup.clear();
//check if current token marks the start of a new fragment
if (textFragmenter.isNewFragment()) {
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
currentFrag.textEndPos = newText.length();
currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
fragmentScorer.startFragment(currentFrag);
docFrags.add(currentFrag);
}
}
tokenGroup.addToken(fragmentScorer.getTokenScore());
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
if (tokenGroup.getNumTokens() > 0) {
//flush the accumulated text (same code as in above loop)
startOffset = tokenGroup.getStartOffset();
endOffset = tokenGroup.getEndOffset();
tokenText = text.substring(startOffset, endOffset);
String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
//store any whitespace etc from between this and last group
if (startOffset > lastEndOffset)
newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
newText.append(markedUpText);
lastEndOffset = Math.max(lastEndOffset, endOffset);
}
//Test what remains of the original text beyond the point where we stopped analyzing
// if there is text beyond the last token considered, and that text is not too large...
if ((lastEndOffset < text.length()) && (text.length() <= maxDocCharsToAnalyze)) {
//append it to the last fragment
newText.append(encoder.encodeText(text.substring(lastEndOffset)));
}
currentFrag.textEndPos = newText.length();
//sort the most relevant sections of the text
for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext(); ) {
currentFrag = i.next();
//If you are running with a version of Lucene before 11th Sept 03
// you do not have PriorityQueue.insert() - so uncomment the code below
/*
if (currentFrag.getScore() >= minScore)
{
fragQueue.put(currentFrag);
if (fragQueue.size() > maxNumFragments)
{ // if hit queue overfull
fragQueue.pop(); // remove lowest in hit queue
minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
}
}
*/
//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
//fix to PriorityQueue. The correct method to use here is the new "insert" method
// USE ABOVE CODE IF THIS DOES NOT COMPILE!
fragQueue.insertWithOverflow(currentFrag);
}
//return the most relevant fragments
TextFragment[] frag = new TextFragment[fragQueue.size()];
for (int i = frag.length - 1; i >= 0; i--) {
frag[i] = fragQueue.pop();
}
//merge any contiguous fragments to improve readability
if (mergeContiguousFragments) {
mergeContiguousFragments(frag);
ArrayList<TextFragment> fragTexts = new ArrayList<>();
for (int i = 0; i < frag.length; i++) {
if ((frag[i] != null) && (frag[i].getScore() > 0)) {
fragTexts.add(frag[i]);
}
}
frag = fragTexts.toArray(new TextFragment[0]);
}
return frag;
} finally {
if (tokenStream != null) {
try {
tokenStream.end();
tokenStream.close();
} catch (Exception e) {
}
}
}
}
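For reference, a minimal sketch of how this low-level API is usually reached; the field name, analyzer, query, and text are illustrative assumptions. Note that the method ends and closes the token stream in its own finally block, so the caller does not need to:

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.search.Query;
  import org.apache.lucene.search.TermQuery;
  import org.apache.lucene.search.highlight.Highlighter;
  import org.apache.lucene.search.highlight.QueryScorer;
  import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
  import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
  import org.apache.lucene.search.highlight.TextFragment;

  public class HighlightDemo {
    public static void main(String[] args) throws Exception {
      String text = "Lucene is a search library; Lucene highlighting marks matched terms.";
      Query query = new TermQuery(new Term("body", "lucene"));
      QueryScorer scorer = new QueryScorer(query, "body");
      Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
      highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 40));
      Analyzer analyzer = new StandardAnalyzer();
      TokenStream ts = analyzer.tokenStream("body", text);
      // getBestTextFragments consumes, ends, and closes ts itself:
      TextFragment[] frags = highlighter.getBestTextFragments(ts, text, true, 2);
      for (TextFragment frag : frags) {
        if (frag != null && frag.getScore() > 0) {
          System.out.println(frag.toString());
        }
      }
      analyzer.close();
    }
  }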
Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.
The class MemoryIndex, method storeTerms.
private void storeTerms(Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {
int pos = -1;
int offset = 0;
if (info.numTokens > 0) {
pos = info.lastPosition + positionIncrementGap;
offset = info.lastOffset + offsetGap;
}
try (TokenStream stream = tokenStream) {
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
stream.reset();
while (stream.incrementToken()) {
// if (DEBUG) System.err.println("token='" + term + "'");
info.numTokens++;
final int posIncr = posIncrAttribute.getPositionIncrement();
if (posIncr == 0) {
info.numOverlapTokens++;
}
pos += posIncr;
int ord = info.terms.add(termAtt.getBytesRef());
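// info.terms is a BytesRefHash: add() returns a fresh ord for an unseen term, or
// -(existingOrd)-1 when the term was already added, in which case we keep appending
// to that term's existing postings slice: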
if (ord < 0) {
ord = (-ord) - 1;
postingsWriter.reset(info.sliceArray.end[ord]);
} else {
info.sliceArray.start[ord] = postingsWriter.startNewSlice();
}
info.sliceArray.freq[ord]++;
info.sumTotalTermFreq++;
postingsWriter.writeInt(pos);
if (storeOffsets) {
postingsWriter.writeInt(offsetAtt.startOffset() + offset);
postingsWriter.writeInt(offsetAtt.endOffset() + offset);
}
if (storePayloads) {
final BytesRef payload = payloadAtt.getPayload();
final int pIndex;
if (payload == null || payload.length == 0) {
pIndex = -1;
} else {
pIndex = payloadsBytesRefs.append(payload);
}
postingsWriter.writeInt(pIndex);
}
info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
}
stream.end();
if (info.numTokens > 0) {
info.lastPosition = pos;
info.lastOffset = offsetAtt.endOffset() + offset;
}
} catch (IOException e) {
throw new RuntimeException(e);
}
}
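storeTerms is private; it runs whenever a field is added to the in-memory index. A small sketch of the public path that reaches it (field name, text, and query are illustrative):

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.index.memory.MemoryIndex;
  import org.apache.lucene.search.TermQuery;

  public class MemoryIndexDemo {
    public static void main(String[] args) {
      // storeOffsets=true, storePayloads=false: exercises the offset branch in storeTerms above.
      MemoryIndex index = new MemoryIndex(true, false);
      Analyzer analyzer = new StandardAnalyzer();
      index.addField("content", "readings about salmon and other fish", analyzer);
      // Score the single in-memory "document" against an ad-hoc query:
      float score = index.search(new TermQuery(new Term("content", "salmon")));
      System.out.println("score=" + score);
      analyzer.close();
    }
  }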