
Example 21 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

the class TokenStreamToAutomaton method toAutomaton.

/** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes (or Unicode code points 
   *  if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton.Builder builder = new Automaton.Builder();
    builder.createState();
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
    in.reset();
    // Only temporarily holds states ahead of our current
    // position:
    final RollingBuffer<Position> positions = new Positions();
    int pos = -1;
    int freedPos = 0;
    Position posData = null;
    int maxOffset = 0;
    while (in.incrementToken()) {
        int posInc = posIncAtt.getPositionIncrement();
        if (preservePositionIncrements == false && posInc > 1) {
            posInc = 1;
        }
        assert pos > -1 || posInc > 0;
        if (posInc > 0) {
            // New node:
            pos += posInc;
            posData = positions.get(pos);
            assert posData.leaving == -1;
            if (posData.arriving == -1) {
                // No token ever arrived to this position
                if (pos == 0) {
                    // OK: this is the first token
                    posData.leaving = 0;
                } else {
                    // This means there's a hole (eg, StopFilter
                    // does this):
                    posData.leaving = builder.createState();
                    addHoles(builder, positions, pos);
                }
            } else {
                posData.leaving = builder.createState();
                builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
                if (posInc > 1) {
                    // A token spanned over a hole; add holes
                    // "under" it:
                    addHoles(builder, positions, pos);
                }
            }
            while (freedPos <= pos) {
                Position freePosData = positions.get(freedPos);
                // don't free this position yet if we may still need to fill holes over it:
                if (freePosData.arriving == -1 || freePosData.leaving == -1) {
                    break;
                }
                positions.freeBefore(freedPos);
                freedPos++;
            }
        }
        final int endPos = pos + posLengthAtt.getPositionLength();
        final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
        int[] termUnicode = null;
        final Position endPosData = positions.get(endPos);
        if (endPosData.arriving == -1) {
            endPosData.arriving = builder.createState();
        }
        int termLen;
        if (unicodeArcs) {
            final String utf16 = termUTF8.utf8ToString();
            termUnicode = new int[utf16.codePointCount(0, utf16.length())];
            termLen = termUnicode.length;
            for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
                termUnicode[j++] = cp = utf16.codePointAt(i);
            }
        } else {
            termLen = termUTF8.length;
        }
        int state = posData.leaving;
        for (int byteIDX = 0; byteIDX < termLen; byteIDX++) {
            final int nextState = byteIDX == termLen - 1 ? endPosData.arriving : builder.createState();
            int c;
            if (unicodeArcs) {
                c = termUnicode[byteIDX];
            } else {
                c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
            }
            builder.addTransition(state, nextState, c);
            state = nextState;
        }
        maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
    }
    in.end();
    int endState = -1;
    int endPosInc = posIncAtt.getPositionIncrement();
    if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
        endPosInc = 1;
    }
    if (endPosInc > 0) {
        // there were hole(s) after the last token
        endState = builder.createState();
        // add trailing holes now:
        int lastState = endState;
        while (true) {
            int state1 = builder.createState();
            builder.addTransition(lastState, state1, HOLE);
            endPosInc--;
            if (endPosInc == 0) {
                builder.setAccept(state1, true);
                break;
            }
            int state2 = builder.createState();
            builder.addTransition(state1, state2, POS_SEP);
            lastState = state2;
        }
    } else {
        endState = -1;
    }
    pos++;
    while (pos <= positions.getMaxPos()) {
        posData = positions.get(pos);
        if (posData.arriving != -1) {
            if (endState != -1) {
                builder.addTransition(posData.arriving, endState, POS_SEP);
            } else {
                builder.setAccept(posData.arriving, true);
            }
        }
        pos++;
    }
    return builder.finish();
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) Automaton(org.apache.lucene.util.automaton.Automaton) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef)
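
A minimal usage sketch for the method above, assuming lucene-core and lucene-analyzers-common on the classpath. The helper name, the analyzer parameter, and the "body" field are illustrative assumptions, not part of the original example.

// Hedged usage sketch; textToAutomaton and the "body" field are made up for illustration.
static Automaton textToAutomaton(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("body", text)) {
        // toAutomaton() resets and ends the stream itself; the caller only needs to close it.
        return new TokenStreamToAutomaton().toAutomaton(ts);
    }
}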

Example 22 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

the class QueryBuilder method analyzeTerm.

/** 
   * Creates a simple term query from the cached tokenstream contents.
   */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    if (!stream.incrementToken()) {
        throw new AssertionError();
    }
    return newTermQuery(new Term(field, termAtt.getBytesRef()));
}
Also used : TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) Term(org.apache.lucene.index.Term)
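
analyzeTerm is a protected helper that the public QueryBuilder entry points reach when analysis yields exactly one token. A minimal sketch of that public path, assuming a plain analyzer such as StandardAnalyzer (the field and text values are illustrative only):

// Hedged sketch: single-token text should end up in analyzeTerm via createBooleanQuery.
QueryBuilder builder = new QueryBuilder(new StandardAnalyzer());
Query q = builder.createBooleanQuery("title", "lucene");
// With a single analyzed token this is expected to come back as a TermQuery on title:lucene.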

Example 23 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

the class QueryBuilder method createSpanQuery.

/**
   * Creates a span query from the tokenstream.  In the case of a single token, a simple <code>SpanTermQuery</code> is
   * returned.  When there are multiple tokens, an ordered <code>SpanNearQuery</code> with a slop of 0 is returned.
   */
protected final SpanQuery createSpanQuery(TokenStream in, String field) throws IOException {
    TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class);
    if (termAtt == null) {
        return null;
    }
    List<SpanTermQuery> terms = new ArrayList<>();
    while (in.incrementToken()) {
        terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
    }
    if (terms.isEmpty()) {
        return null;
    } else if (terms.size() == 1) {
        return terms.get(0);
    } else {
        return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true);
    }
}
Also used : SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery)
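
Because createSpanQuery is protected final and, as shown above, does not reset or end the stream itself, one way to exercise it directly is through a small subclass. This is only a sketch; the subclass name and the explicit reset()/end() handling are assumptions for illustration, not part of the Lucene API.

// Hedged sketch: exposing the protected helper via an illustrative subclass.
class SpanAwareQueryBuilder extends QueryBuilder {
    SpanAwareQueryBuilder(Analyzer analyzer) {
        super(analyzer);
    }

    SpanQuery span(String field, String text) throws IOException {
        try (TokenStream ts = getAnalyzer().tokenStream(field, text)) {
            // createSpanQuery only consumes tokens; reset/end stay with the caller.
            ts.reset();
            SpanQuery query = createSpanQuery(ts, field);
            ts.end();
            return query;
        }
    }
}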

Example 24 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

the class QueryBuilder method analyzeMultiBoolean.

/** 
   * Creates a complex boolean query from the cached tokenstream contents.
   */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder q = newBooleanQuery();
    List<Term> currentQuery = new ArrayList<>();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (posIncrAtt.getPositionIncrement() != 0) {
            add(q, currentQuery, operator);
            currentQuery.clear();
        }
        currentQuery.add(new Term(field, termAtt.getBytesRef()));
    }
    add(q, currentQuery, operator);
    return q.build();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
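
Like analyzeTerm, this helper is reached through the public entry points; multi-token text with no synonyms and no graph takes this branch. A minimal sketch, assuming a plain analyzer (field, text, and operator are illustrative):

// Hedged sketch: multi-token text with the MUST operator typically lands in analyzeMultiBoolean.
QueryBuilder builder = new QueryBuilder(new StandardAnalyzer());
Query q = builder.createBooleanQuery("body", "quick brown fox", BooleanClause.Occur.MUST);
// Expected shape: a BooleanQuery with one MUST TermQuery clause per analyzed token.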

Example 25 with TermToBytesRefAttribute

use of org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute in project lucene-solr by apache.

the class QueryBuilder method createFieldQuery.

/**
   * Creates a query from a token stream.
   *
   * @param source     the token stream to create the query from
   * @param operator   default boolean operator used for this query
   * @param field      field to create queries against
   * @param quoted     true if phrases should be generated when terms occur at more than one position
   * @param phraseSlop slop factor for phrase/multiphrase queries
   */
protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, int phraseSlop) {
    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
    // Build an appropriate query based on the analysis chain.
    try (CachingTokenFilter stream = new CachingTokenFilter(source)) {
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
        if (termAtt == null) {
            return null;
        }
        // phase 1: read through the stream and assess the situation:
        // counting the number of tokens/positions and marking if we have any synonyms.
        int numTokens = 0;
        int positionCount = 0;
        boolean hasSynonyms = false;
        boolean isGraph = false;
        stream.reset();
        while (stream.incrementToken()) {
            numTokens++;
            int positionIncrement = posIncAtt.getPositionIncrement();
            if (positionIncrement != 0) {
                positionCount += positionIncrement;
            } else {
                hasSynonyms = true;
            }
            int positionLength = posLenAtt.getPositionLength();
            if (enableGraphQueries && positionLength > 1) {
                isGraph = true;
            }
        }
        if (numTokens == 0) {
            return null;
        } else if (numTokens == 1) {
            // single term
            return analyzeTerm(field, stream);
        } else if (isGraph) {
            // graph
            if (quoted) {
                return analyzeGraphPhrase(stream, field, phraseSlop);
            } else {
                return analyzeGraphBoolean(field, stream, operator);
            }
        } else if (quoted && positionCount > 1) {
            // phrase
            if (hasSynonyms) {
                // complex phrase with synonyms
                return analyzeMultiPhrase(field, stream, phraseSlop);
            } else {
                // simple phrase
                return analyzePhrase(field, stream, phraseSlop);
            }
        } else {
            // boolean
            if (positionCount == 1) {
                // only one position, with synonyms
                return analyzeBoolean(field, stream);
            } else {
                // complex case: multiple positions
                return analyzeMultiBoolean(field, stream, operator);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) CachingTokenFilter(org.apache.lucene.analysis.CachingTokenFilter) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
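
createFieldQuery is the dispatcher behind QueryBuilder's public methods; quoted=true together with more than one position selects the phrase branch. A minimal sketch of the public phrase path, assuming a plain analyzer and no synonyms (field, text, and slop are illustrative):

// Hedged sketch: createPhraseQuery routes through createFieldQuery with quoted=true.
QueryBuilder builder = new QueryBuilder(new StandardAnalyzer());
Query phrase = builder.createPhraseQuery("body", "quick fox", 1);
// With single-position tokens and no synonyms this is expected to be a PhraseQuery with slop 1.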

Aggregations

TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 34
BytesRef (org.apache.lucene.util.BytesRef): 17
TokenStream (org.apache.lucene.analysis.TokenStream): 16
IOException (java.io.IOException): 13
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 12
ArrayList (java.util.ArrayList): 10
Term (org.apache.lucene.index.Term): 9
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 6
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 5
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 4
Analyzer (org.apache.lucene.analysis.Analyzer): 3
CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter): 3
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 3
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 3
SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery): 3
StringReader (java.io.StringReader): 2
HashSet (java.util.HashSet): 2
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 2
BooleanQuery (org.apache.lucene.search.BooleanQuery): 2
MultiPhraseQuery (org.apache.lucene.search.MultiPhraseQuery): 2