Search in sources :

Example 41 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

the class AnalysisRequestHandlerBase method analyzeTokenStream.

/**
 * Drains the supplied TokenStream and returns a snapshot of every token it emits.
 *
 * <p>The stream is reset, consumed to exhaustion, ended and finally closed by this
 * method; the caller must not reuse it afterwards.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return one cloned AttributeSource per token, in the order the stream produced them
 * @throws RuntimeException wrapping any IOException raised while consuming the stream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
    final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
    // for backwards compatibility, add all "common" attributes
    tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(TypeAttribute.class);

    final List<AttributeSource> collected = new ArrayList<>();
    try {
        tokenStream.reset();
        // absolutePosition is the running sum of the position increments seen so far.
        for (int absolutePosition = 0; tokenStream.incrementToken(); ) {
            absolutePosition += posIncrAtt.getPositionIncrement();
            // Record the position on the tracking attribute before cloning, so the
            // snapshot taken below carries it.
            trackerAtt.setActPosition(absolutePosition);
            collected.add(tokenStream.cloneAttributes());
        }
        // TODO should we capture?
        tokenStream.end();
    } catch (IOException ioe) {
        throw new RuntimeException("Error occured while iterating over tokenstream", ioe);
    } finally {
        // Close on every path; exceptions raised by close() are handled by the helper.
        IOUtils.closeWhileHandlingException(tokenStream);
    }
    return collected;
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) ArrayList(java.util.ArrayList) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 42 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

the class TestStopAnalyzer method testStopListPositions.

/**
 * Checks that a StopAnalyzer built from a custom stop set never emits a stop word and
 * that each surviving token reports the expected position increment (an increment
 * greater than 1 accounts for the stop words removed just before that token).
 */
public void testStopListPositions() throws IOException {
    CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
    String s = "This is a good test of the english stop analyzer with positions";
    int[] expectedIncr = { 1, 1, 1, 3, 1, 1, 1, 2, 1 };
    // Both resources are managed here so the analyzer is closed even when an assertion
    // fails; they are closed in reverse order (stream first, then analyzer).
    try (StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
        TokenStream stream = newStop.tokenStream("test", s)) {
        assertNotNull(stream);
        int i = 0;
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String text = termAtt.toString();
            assertFalse(stopWordsSet.contains(text));
            assertEquals(expectedIncr[i++], posIncrAtt.getPositionIncrement());
        }
        stream.end();
        // Guard against a stream that stops early: every expected increment must be seen.
        assertEquals(expectedIncr.length, i);
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 43 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

the class QueryBuilder method analyzeMultiBoolean.

/**
 * Builds a boolean query from the cached token stream, grouping terms by position.
 *
 * <p>Tokens with a position increment of 0 are collected into the same group as the
 * preceding token; every token with a non-zero increment first hands the current group
 * to {@code add(...)} and then starts a new one. The final group is handed over after
 * the stream is exhausted.
 *
 * @param field    field the generated terms are created against
 * @param stream   token stream to read; it is reset before being consumed
 * @param operator occurrence passed through to {@code add(...)} for each group
 * @return the query built from all groups
 * @throws IOException if the stream fails while being reset or consumed
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder builder = newBooleanQuery();
    List<Term> termsAtPosition = new ArrayList<>();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);

    stream.reset();
    for (boolean hasToken = stream.incrementToken(); hasToken; hasToken = stream.incrementToken()) {
        boolean startsNewPosition = posIncrAtt.getPositionIncrement() != 0;
        if (startsNewPosition) {
            // Flush whatever was gathered for the previous position before moving on.
            add(builder, termsAtPosition, operator);
            termsAtPosition.clear();
        }
        Term term = new Term(field, termAtt.getBytesRef());
        termsAtPosition.add(term);
    }
    // Flush the trailing group.
    add(builder, termsAtPosition, operator);
    return builder.build();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 44 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

the class QueryBuilder method createFieldQuery.

/**
 * Creates a query from a token stream.
 *
 * <p>The stream is wrapped in a CachingTokenFilter and read once to count tokens and
 * positions and to detect stacked tokens (position increment 0) and, when graph queries
 * are enabled, tokens spanning more than one position. The matching {@code analyze*}
 * builder is then invoked on the cached stream.
 *
 * @param source     the token stream to create the query from
 * @param operator   default boolean operator used for this query
 * @param field      field to create queries against
 * @param quoted     true if phrases should be generated when terms occur at more than one position
 * @param phraseSlop slop factor for phrase/multiphrase queries
 * @return the generated query, or {@code null} if the stream has no
 *         TermToBytesRefAttribute or produces no tokens
 * @throws RuntimeException wrapping any IOException raised while analyzing
 */
protected Query createFieldQuery(TokenStream source, BooleanClause.Occur operator, String field, boolean quoted, int phraseSlop) {
    assert operator == BooleanClause.Occur.SHOULD || operator == BooleanClause.Occur.MUST;
    // Build an appropriate query based on the analysis chain.
    try (CachingTokenFilter stream = new CachingTokenFilter(source)) {
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
        PositionLengthAttribute posLenAtt = stream.addAttribute(PositionLengthAttribute.class);
        if (termAtt == null) {
            return null;
        }

        // Pass 1: walk the stream once, gathering the facts the dispatch below needs.
        int tokenTotal = 0;
        int positionTotal = 0;
        boolean sawStackedToken = false;
        boolean sawGraphToken = false;
        stream.reset();
        while (stream.incrementToken()) {
            tokenTotal++;
            int increment = posIncAtt.getPositionIncrement();
            if (increment == 0) {
                // Same position as the previous token: treated as a synonym.
                sawStackedToken = true;
            } else {
                positionTotal += increment;
            }
            boolean spansSeveralPositions = posLenAtt.getPositionLength() > 1;
            if (spansSeveralPositions && enableGraphQueries) {
                sawGraphToken = true;
            }
        }

        // Pass 2: pick the builder. Every case returns, so order alone decides precedence.
        if (tokenTotal == 0) {
            return null;
        }
        if (tokenTotal == 1) {
            // single term
            return analyzeTerm(field, stream);
        }
        if (sawGraphToken) {
            // graph
            if (quoted) {
                return analyzeGraphPhrase(stream, field, phraseSlop);
            }
            return analyzeGraphBoolean(field, stream, operator);
        }
        if (quoted && positionTotal > 1) {
            // phrase
            if (sawStackedToken) {
                // complex phrase with synonyms
                return analyzeMultiPhrase(field, stream, phraseSlop);
            }
            // simple phrase
            return analyzePhrase(field, stream, phraseSlop);
        }
        // boolean
        if (positionTotal == 1) {
            // only one position, with synonyms
            return analyzeBoolean(field, stream);
        }
        // complex case: multiple positions
        return analyzeMultiBoolean(field, stream, operator);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) CachingTokenFilter(org.apache.lucene.analysis.CachingTokenFilter) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 45 with PositionIncrementAttribute

use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

the class TestIndexWriterExceptions method testTooManyTokens.

// kind of slow, but omits positions, so just CPU
/**
 * Feeds a single field more tokens than fit in an int and expects IndexWriter to reject
 * the document with an IllegalArgumentException mentioning "too many tokens".
 */
@Nightly
public void testTooManyTokens() throws Exception {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
    Document doc = new Document();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    doc.add(new Field("foo", new TokenStream() {

        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

        // Number of tokens emitted so far; long because it must go past Integer.MAX_VALUE.
        long num = 0;

        @Override
        public boolean incrementToken() throws IOException {
            // The addition must be done in long arithmetic: "Integer.MAX_VALUE + 1" is an
            // int expression that wraps to Integer.MIN_VALUE, which num can never equal,
            // so the stream would have no end of its own.
            if (num == Integer.MAX_VALUE + 1L) {
                return false;
            }
            clearAttributes();
            // Only the first token advances the position; all later ones stack on it.
            if (num == 0) {
                posIncAtt.setPositionIncrement(1);
            } else {
                posIncAtt.setPositionIncrement(0);
            }
            termAtt.append("a");
            num++;
            if (VERBOSE && num % 1000000 == 0) {
                System.out.println("indexed: " + num);
            }
            return true;
        }
    }, ft));
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
        iw.addDocument(doc);
    });
    assertTrue(expected.getMessage().contains("too many tokens"));
    iw.close();
    dir.close();
}
Also used : StringField(org.apache.lucene.document.StringField) SortedNumericDocValuesField(org.apache.lucene.document.SortedNumericDocValuesField) StoredField(org.apache.lucene.document.StoredField) NumericDocValuesField(org.apache.lucene.document.NumericDocValuesField) SortedSetDocValuesField(org.apache.lucene.document.SortedSetDocValuesField) BinaryDocValuesField(org.apache.lucene.document.BinaryDocValuesField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) FieldType(org.apache.lucene.document.FieldType) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Aggregations

PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 50; CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 33; TokenStream (org.apache.lucene.analysis.TokenStream): 28; OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 25; IOException (java.io.IOException): 14; ArrayList (java.util.ArrayList): 14; BytesRef (org.apache.lucene.util.BytesRef): 14; PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 11; TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 11; TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 11; StringReader (java.io.StringReader): 8; Term (org.apache.lucene.index.Term): 8; Token (org.apache.lucene.analysis.Token): 7; FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7; PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 7; List (java.util.List): 6; LinkedList (java.util.LinkedList): 4; CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 4; Document (org.apache.lucene.document.Document): 4; Iterator (java.util.Iterator): 3