Example 11 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

From class GraphTokenStreamFiniteStrings, method build().

/**
   * Build an automaton from the provided {@link TokenStream}.
   */
private Automaton build(final TokenStream in) throws IOException {
    Automaton.Builder builder = new Automaton.Builder();
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    in.reset();
    int pos = -1;
    int prevIncr = 1;
    int state = -1;
    while (in.incrementToken()) {
        int currentIncr = posIncAtt.getPositionIncrement();
        if (pos == -1 && currentIncr < 1) {
            throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
        }
        // always use inc 1 while building, but save original increment
        int incr = Math.min(1, currentIncr);
        if (incr > 0) {
            pos += incr;
        }
        int endPos = pos + posLengthAtt.getPositionLength();
        while (state < endPos) {
            state = builder.createState();
        }
        BytesRef term = termBytesAtt.getBytesRef();
        int id = getTermID(currentIncr, prevIncr, term);
        builder.addTransition(pos, endPos, id);
        // only save last increment on non-zero increment in case we have multiple stacked tokens
        if (currentIncr > 0) {
            prevIncr = currentIncr;
        }
    }
    in.end();
    if (state != -1) {
        builder.setAccept(state, true);
    }
    return builder.finish();
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) Automaton(org.apache.lucene.util.automaton.Automaton) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
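For context, build() relies on the standard TokenStream consumption contract: reset(), then incrementToken() until it returns false, then end() and close(). Below is a minimal sketch of that contract, recovering absolute positions by summing increments exactly as build() does when it maps tokens onto automaton states; the WhitespaceTokenizer input and the sample text are assumptions for illustration, not part of the example above.

// Sketch only. WhitespaceTokenizer lives in org.apache.lucene.analysis.core.
Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("quick brown fox"));
TokenStream ts = tok;
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
int pos = -1;
while (ts.incrementToken()) {
    // an increment of 0 stacks the token on the previous position
    pos += posIncAtt.getPositionIncrement();
    System.out.println(termAtt + " @ " + pos);
}
ts.end();
ts.close();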

Example 12 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

From class TestPositionIncrement, method testSetPosition().

public void testSetPosition() throws Exception {
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new Tokenizer() {

                // TODO: use CannedTokenStream
                private final String[] TOKENS = { "1", "2", "3", "4", "5" };

                private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };

                private int i = 0;

                PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

                CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

                OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

                @Override
                public boolean incrementToken() {
                    if (i == TOKENS.length)
                        return false;
                    clearAttributes();
                    termAtt.append(TOKENS[i]);
                    offsetAtt.setOffset(i, i);
                    posIncrAtt.setPositionIncrement(INCREMENTS[i]);
                    i++;
                    return true;
                }

                @Override
                public void reset() throws IOException {
                    super.reset();
                    this.i = 0;
                }
            });
        }
    };
    Directory store = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
    Document d = new Document();
    d.add(newTextField("field", "bogus", Field.Store.YES));
    writer.addDocument(d);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);
    PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
    pos.nextDoc();
    // first token should be at position 0
    assertEquals(0, pos.nextPosition());
    pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
    pos.nextDoc();
    // second token should be at position 2
    assertEquals(2, pos.nextPosition());
    PhraseQuery q;
    ScoreDoc[] hits;
    q = new PhraseQuery("field", "1", "2");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, using the builder with implicit positions
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"));
    builder.add(new Term("field", "2"));
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, just specifying positions explicitly.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 1);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // specifying correct positions should find the phrase.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 2);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "3");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // the phrase query does match when the correct positions are specified ("3" and "4" are stacked at the same position).
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "4"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    // a phrase query should fail for a term that does not exist in the index,
    // even if other queried terms exist at the same position.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "9"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // a multi-phrase query should succeed for a term that does not exist
    // because another of the supplied terms exists at the same position.
    MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
    mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
    hits = searcher.search(mqb.build(), 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "4", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    reader.close();
    store.close();
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) MockPayloadAnalyzer(org.apache.lucene.analysis.MockPayloadAnalyzer) Document(org.apache.lucene.document.Document) PostingsEnum(org.apache.lucene.index.PostingsEnum) Tokenizer(org.apache.lucene.analysis.Tokenizer) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) IOException(java.io.IOException) Term(org.apache.lucene.index.Term) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter)
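The TODO in the tokenizer above points at CannedTokenStream from the lucene test-framework. A hedged sketch of the same five tokens expressed that way, assuming the test-framework is on the classpath (the offsets mirror the setOffset(i, i) call above):

// Sketch only: same terms and increments as the anonymous Tokenizer above.
String[] terms = { "1", "2", "3", "4", "5" };
int[] increments = { 1, 2, 1, 0, 1 };
Token[] tokens = new Token[terms.length];
for (int i = 0; i < terms.length; i++) {
    tokens[i] = new Token(terms[i], i, i);          // term text, start/end offset
    tokens[i].setPositionIncrement(increments[i]);  // "4" stacks on "3" via increment 0
}
TokenStream ts = new CannedTokenStream(tokens);

Because "4" carries increment 0 it shares a position with "3"; that is why the phrase built from "3" and "4" at identical positions matches, and why the MultiPhraseQuery over { "3", "9" } matches even though "9" was never indexed.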

Example 13 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

From class TestDocumentWriter, method testTokenReuse().

public void testTokenReuse() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {

                boolean first = true;

                AttributeSource.State state;

                @Override
                public boolean incrementToken() throws IOException {
                    if (state != null) {
                        restoreState(state);
                        payloadAtt.setPayload(null);
                        posIncrAtt.setPositionIncrement(0);
                        termAtt.setEmpty().append("b");
                        state = null;
                        return true;
                    }
                    boolean hasNext = input.incrementToken();
                    if (!hasNext)
                        return false;
                    if (Character.isDigit(termAtt.buffer()[0])) {
                        posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
                    }
                    if (first) {
                        // set payload on first position only
                        payloadAtt.setPayload(new BytesRef(new byte[] { 100 }));
                        first = false;
                    }
                    // index a "synonym" for every token
                    state = captureState();
                    return true;
                }

                @Override
                public void reset() throws IOException {
                    super.reset();
                    first = true;
                    state = null;
                }

                final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

                final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

                final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
            });
        }
    };
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
    writer.addDocument(doc);
    writer.commit();
    SegmentCommitInfo info = writer.newestSegment();
    writer.close();
    SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));
    PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "f1", new BytesRef("a"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    int freq = termPositions.freq();
    assertEquals(3, freq);
    assertEquals(0, termPositions.nextPosition());
    assertNotNull(termPositions.getPayload());
    assertEquals(6, termPositions.nextPosition());
    assertNull(termPositions.getPayload());
    assertEquals(7, termPositions.nextPosition());
    assertNull(termPositions.getPayload());
    reader.close();
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) AttributeSource(org.apache.lucene.util.AttributeSource) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) BytesRef(org.apache.lucene.util.BytesRef)
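To see why the asserted positions are 0, 6 and 7: the filter emits a zero-increment synonym "b" after every original token, and the digit token "5" raises its own increment to 5. For the field value "a 5 a a" the stream therefore lands on positions a=0, b=0, 5=5, b=5, a=6, b=6, a=7, b=7, so "a" occurs three times, at positions 0, 6 and 7, and only the very first token carries a payload.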

Example 14 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

From class TestTypeTokenFilter, method testPositons().

private void testPositons(TypeTokenFilter stpf) throws IOException {
    TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
    CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
    stpf.reset();
    while (stpf.incrementToken()) {
        log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
        assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1", posIncrAtt.getPositionIncrement(), 3);
    }
    stpf.end();
    stpf.close();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
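The fixture that builds stpf is not shown here. A hedged sketch of a typical construction, with the input text and stop-type set chosen as assumptions so that two dropped numbers precede every word, giving each surviving token the increment of 3 the assertion expects:

// Sketch only: drop every <NUM>-typed token emitted by a StandardTokenizer.
// FilteringTokenFilter subclasses such as TypeTokenFilter fold the increments
// of removed tokens into the next surviving token.
Tokenizer input = new StandardTokenizer();  // labels numeric tokens with type <NUM>
input.setReader(new StringReader("1 2 one 3 4 two 5 6 three"));
Set<String> stopTypes = Collections.singleton("<NUM>");
TypeTokenFilter stpf = new TypeTokenFilter(input, stopTypes);
testPositons(stpf);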

Example 15 with PositionIncrementAttribute

Use of org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute in project lucene-solr by apache.

From class TestTeeSinkTokenFilter, method performance().

/**
   * Not an explicit test, just useful to print out some info on performance
   */
@SuppressWarnings("resource")
public void performance() throws Exception {
    int[] tokCount = { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
    for (int k = 0; k < tokCount.length; k++) {
        StringBuilder buffer = new StringBuilder();
        System.out.println("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++) {
            buffer.append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).append(' ');
        }
        //make sure we produce the same tokens
        TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
        TokenStream sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), 100);
        teeStream.consumeAllTokens();
        TokenStream stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), 100);
        CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
        CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
        for (int i = 0; stream.incrementToken(); i++) {
            assertTrue(sink.incrementToken());
            assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok));
        }
        //simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.length; j++) {
            int tfPos = 0;
            long start = System.currentTimeMillis();
            for (int i = 0; i < 20; i++) {
                stream = new StandardFilter(standardTokenizer(buffer));
                PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
                while (stream.incrementToken()) {
                    tfPos += posIncrAtt.getPositionIncrement();
                }
                stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), modCounts[j]);
                posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
                while (stream.incrementToken()) {
                    tfPos += posIncrAtt.getPositionIncrement();
                }
            }
            long finish = System.currentTimeMillis();
            System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
            int sinkPos = 0;
            //simulate one field with one sink
            start = System.currentTimeMillis();
            for (int i = 0; i < 20; i++) {
                teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
                sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), modCounts[j]);
                PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
                while (teeStream.incrementToken()) {
                    sinkPos += posIncrAtt.getPositionIncrement();
                }
                //System.out.println("Modulo--------");
                posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
                while (sink.incrementToken()) {
                    sinkPos += posIncrAtt.getPositionIncrement();
                }
            }
            finish = System.currentTimeMillis();
            System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
        }
        System.out.println("- End Tokens: " + tokCount[k] + "-----");
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
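Stripped of the timing scaffolding, the tee/sink pattern the benchmark exercises reduces to a few lines; the tokenizer and text below are placeholders, and the ordering (fully consume the tee before reading the cached sink) is the part that matters.

// Sketch only: one analysis pass feeding two consumers.
Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("the quick brown fox"));
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tok);
TokenStream sink = tee.newSinkTokenStream();  // will replay the tee's tokens
tee.reset();
tee.consumeAllTokens();  // caches every token state into the sink
tee.end();
tee.close();
CharTermAttribute term = sink.addAttribute(CharTermAttribute.class);
sink.reset();
while (sink.incrementToken()) {
    System.out.println(term);
}
sink.end();
sink.close();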

Aggregations

PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 50
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 33
TokenStream (org.apache.lucene.analysis.TokenStream): 28
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 25
IOException (java.io.IOException): 14
ArrayList (java.util.ArrayList): 14
BytesRef (org.apache.lucene.util.BytesRef): 14
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 11
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 11
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 11
StringReader (java.io.StringReader): 8
Term (org.apache.lucene.index.Term): 8
Token (org.apache.lucene.analysis.Token): 7
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 7
List (java.util.List): 6
LinkedList (java.util.LinkedList): 4
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 4
Document (org.apache.lucene.document.Document): 4
Iterator (java.util.Iterator): 3