
Example 6 with PositionLengthAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute in project lucene-solr by apache.

From the class QueryBuilder, method createFieldQuery.

/**
   * Creates a query from a token stream.
   *
   * @param source     the token stream to create the query from
   * @param operator   default boolean operator used for this query
   * @param field      field to create queries against
   * @param quoted     true if phrases should be generated when terms occur at more than one position
   * @param phraseSlop slop factor for phrase/multiphrase queries
   * @return the resulting query, or {@code null} if the stream has no term attribute or produces no tokens
   */
protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, int phraseSlop) {
    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
    // Build an appropriate query based on the analysis chain.
    try (CachingTokenFilter stream = new CachingTokenFilter(source)) {
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
        if (termAtt == null) {
            return null;
        }
        // phase 1: read through the stream and assess the situation:
        // counting the number of tokens/positions and marking if we have any synonyms.
        int numTokens = 0;
        int positionCount = 0;
        boolean hasSynonyms = false;
        boolean isGraph = false;
        stream.reset();
        while (stream.incrementToken()) {
            numTokens++;
            int positionIncrement = posIncAtt.getPositionIncrement();
            if (positionIncrement != 0) {
                positionCount += positionIncrement;
            } else {
                hasSynonyms = true;
            }
            int positionLength = posLenAtt.getPositionLength();
            if (enableGraphQueries && positionLength > 1) {
                isGraph = true;
            }
        }
        // phase 2: based on token count, the presence of synonyms, and the options,
        // formulate a single term, boolean, graph, or phrase query.
        if (numTokens == 0) {
            return null;
        } else if (numTokens == 1) {
            // single term
            return analyzeTerm(field, stream);
        } else if (isGraph) {
            // graph
            if (quoted) {
                return analyzeGraphPhrase(stream, field, phraseSlop);
            } else {
                return analyzeGraphBoolean(field, stream, operator);
            }
        } else if (quoted && positionCount > 1) {
            // phrase
            if (hasSynonyms) {
                // complex phrase with synonyms
                return analyzeMultiPhrase(field, stream, phraseSlop);
            } else {
                // simple phrase
                return analyzePhrase(field, stream, phraseSlop);
            }
        } else {
            // boolean
            if (positionCount == 1) {
                // only one position, with synonyms
                return analyzeBoolean(field, stream);
            } else {
                // complex case: multiple positions
                return analyzeMultiBoolean(field, stream, operator);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
Also used: PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) CachingTokenFilter(org.apache.lucene.analysis.CachingTokenFilter) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
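
In practice createFieldQuery is a protected hook: callers go through QueryBuilder's public helpers such as createBooleanQuery and createPhraseQuery, which analyze the text and delegate to it. The sketch below shows how those entry points might be used; the StandardAnalyzer, field name, and query text are illustrative assumptions, not taken from the example above.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;

static Query buildExampleQuery() {
    // Minimal sketch: analyzer, field name, and query text are assumed for illustration.
    Analyzer analyzer = new StandardAnalyzer();
    QueryBuilder builder = new QueryBuilder(analyzer);
    // Graph-aware analysis (the isGraph branch above) can be toggled on the builder.
    builder.setEnableGraphQueries(true);
    // Both helpers analyze the text and delegate to createFieldQuery internally.
    Query booleanQuery = builder.createBooleanQuery("body", "wifi network", BooleanClause.Occur.MUST);
    Query phraseQuery = builder.createPhraseQuery("body", "wifi network", 2);
    return booleanQuery != null ? booleanQuery : phraseQuery;
}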

Example 7 with PositionLengthAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute in project lucene-solr by apache.

From the class NGramTokenizerTest, method testNGrams.

static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
    // convert the string to code points
    final int[] codePoints = toCodePoints(s);
    final int[] offsets = new int[codePoints.length + 1];
    for (int i = 0; i < codePoints.length; ++i) {
        offsets[i + 1] = offsets[i] + Character.charCount(codePoints[i]);
    }
    final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {

        @Override
        protected boolean isTokenChar(int chr) {
            return nonTokenChars.indexOf(chr) < 0;
        }
    };
    grams.setReader(new StringReader(s));
    final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
    grams.reset();
    for (int start = 0; start < codePoints.length; ++start) {
        nextGram: for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
            // isTokenChar(String, int) is a static helper of the test class (not shown here)
            // that mirrors the anonymous tokenizer's isTokenChar override.
            if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
                // not on an edge
                continue nextGram;
            }
            for (int j = start; j < end; ++j) {
                if (!isTokenChar(nonTokenChars, codePoints[j])) {
                    continue nextGram;
                }
            }
            assertTrue(grams.incrementToken());
            assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
            assertEquals(1, posIncAtt.getPositionIncrement());
            assertEquals(1, posLenAtt.getPositionLength());
            assertEquals(offsets[start], offsetAtt.startOffset());
            assertEquals(offsets[end], offsetAtt.endOffset());
        }
    }
    assertFalse(grams.incrementToken());
    // end() exposes the final offset, which should equal the length of the input string.
    grams.end();
    assertEquals(s.length(), offsetAtt.startOffset());
    assertEquals(s.length(), offsetAtt.endOffset());
}
Also used: PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Tokenizer(org.apache.lucene.analysis.Tokenizer) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
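
For comparison, the attribute the test asserts on can be read from any token stream. A minimal sketch follows, assuming a plain WhitespaceAnalyzer and made-up field name and input text: it prints the position increment and position length of each token. A plain chain always reports a position length of 1, matching the assertion above, while graph-producing filters such as synonym filters can report values greater than 1.

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

static void printPositionLengths() throws IOException {
    // Assumed analyzer, field name, and text, for illustration only.
    try (WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
         TokenStream stream = analyzer.tokenStream("body", "fast wifi network")) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // posLen > 1 means the token spans several positions (e.g. a multi-word synonym).
            System.out.printf("%s posInc=%d posLen=%d%n",
                    termAtt, posIncAtt.getPositionIncrement(), posLenAtt.getPositionLength());
        }
        stream.end();
    }
}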

Aggregations

PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 7
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 7
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 5
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 5
BytesRef (org.apache.lucene.util.BytesRef): 4
IOException (java.io.IOException): 3
ArrayList (java.util.ArrayList): 2
TokenStream (org.apache.lucene.analysis.TokenStream): 2
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 2
Automaton (org.apache.lucene.util.automaton.Automaton): 2
StringReader (java.io.StringReader): 1
HashSet (java.util.HashSet): 1
CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter): 1
Tokenizer (org.apache.lucene.analysis.Tokenizer): 1
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 1
CodecUtil (org.apache.lucene.codecs.CodecUtil): 1
BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder): 1
CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder): 1
IntsRef (org.apache.lucene.util.IntsRef): 1
IntsRefBuilder (org.apache.lucene.util.IntsRefBuilder): 1