Search in sources :

Example 51 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class TokenStreamToAutomaton, the method toAutomaton:

/** Pulls the graph (including {@link
   *  PositionLengthAttribute}) from the provided {@link
   *  TokenStream}, and creates the corresponding
   *  automaton where arcs are bytes (or Unicode code points 
   *  if unicodeArcs = true) from each term. */
public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton.Builder builder = new Automaton.Builder();
    // State 0 is the start state of the automaton.
    builder.createState();
    // Attributes pulled from the stream for each token:
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
    in.reset();
    // Only temporarily holds states ahead of our current
    // position:
    final RollingBuffer<Position> positions = new Positions();
    // Current token position; -1 until the first token arrives.
    int pos = -1;
    // Lowest position not yet released from the rolling buffer.
    int freedPos = 0;
    // Bookkeeping (arriving/leaving automaton states) for the current position.
    Position posData = null;
    // Largest end offset seen so far; used below to detect a trailing offset gap.
    int maxOffset = 0;
    while (in.incrementToken()) {
        int posInc = posIncAtt.getPositionIncrement();
        // Optionally collapse holes (posInc > 1) into a single increment.
        if (preservePositionIncrements == false && posInc > 1) {
            posInc = 1;
        }
        // The very first token must advance the position past -1.
        assert pos > -1 || posInc > 0;
        if (posInc > 0) {
            // New node:
            pos += posInc;
            posData = positions.get(pos);
            assert posData.leaving == -1;
            if (posData.arriving == -1) {
                // No token ever arrived to this position
                if (pos == 0) {
                    // OK: this is the first token
                    posData.leaving = 0;
                } else {
                    // This means there's a hole (eg, StopFilter
                    // does this):
                    posData.leaving = builder.createState();
                    addHoles(builder, positions, pos);
                }
            } else {
                // Tokens already end at this position: join them to a fresh
                // leaving state through a position-separator arc.
                posData.leaving = builder.createState();
                builder.addTransition(posData.arriving, posData.leaving, POS_SEP);
                if (posInc > 1) {
                    // A token spanned over a hole; add holes
                    // "under" it:
                    addHoles(builder, positions, pos);
                }
            }
            // Release positions whose arriving and leaving states are both
            // assigned, so the rolling buffer can recycle them.
            while (freedPos <= pos) {
                Position freePosData = positions.get(freedPos);
                // don't free this position yet if we may still need to fill holes over it:
                if (freePosData.arriving == -1 || freePosData.leaving == -1) {
                    break;
                }
                positions.freeBefore(freedPos);
                freedPos++;
            }
        }
        // Position this token ends at (position length is 1 for ordinary tokens).
        final int endPos = pos + posLengthAtt.getPositionLength();
        // NOTE(review): changeToken presumably lets subclasses rewrite the term
        // bytes before they become arcs — confirm against the subclass hooks.
        final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());
        int[] termUnicode = null;
        final Position endPosData = positions.get(endPos);
        if (endPosData.arriving == -1) {
            endPosData.arriving = builder.createState();
        }
        int termLen;
        if (unicodeArcs) {
            // Arcs are whole Unicode code points: decode the UTF-8 term first.
            final String utf16 = termUTF8.utf8ToString();
            termUnicode = new int[utf16.codePointCount(0, utf16.length())];
            termLen = termUnicode.length;
            for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
                termUnicode[j++] = cp = utf16.codePointAt(i);
            }
        } else {
            // Arcs are raw UTF-8 bytes.
            termLen = termUTF8.length;
        }
        // Add one arc per byte/code point, chaining fresh intermediate states
        // from posData.leaving to endPosData.arriving.
        int state = posData.leaving;
        for (int byteIDX = 0; byteIDX < termLen; byteIDX++) {
            final int nextState = byteIDX == termLen - 1 ? endPosData.arriving : builder.createState();
            int c;
            if (unicodeArcs) {
                c = termUnicode[byteIDX];
            } else {
                // Mask so the byte acts as an unsigned arc label in 0..255.
                c = termUTF8.bytes[termUTF8.offset + byteIDX] & 0xff;
            }
            builder.addTransition(state, nextState, c);
            state = nextState;
        }
        maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
    }
    in.end();
    int endState = -1;
    // After end(), the position increment reflects trailing holes; a final
    // offset gap can also be treated as a hole when finalOffsetGapAsHole is set.
    int endPosInc = posIncAtt.getPositionIncrement();
    if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
        endPosInc = 1;
    }
    if (endPosInc > 0) {
        // there were hole(s) after the last token
        endState = builder.createState();
        // add trailing holes now:
        int lastState = endState;
        while (true) {
            int state1 = builder.createState();
            builder.addTransition(lastState, state1, HOLE);
            endPosInc--;
            if (endPosInc == 0) {
                builder.setAccept(state1, true);
                break;
            }
            int state2 = builder.createState();
            builder.addTransition(state1, state2, POS_SEP);
            lastState = state2;
        }
    } else {
        endState = -1;
    }
    // Wire every still-pending arriving state to the end of the automaton:
    // either into the trailing-hole chain, or directly as an accept state.
    pos++;
    while (pos <= positions.getMaxPos()) {
        posData = positions.get(pos);
        if (posData.arriving != -1) {
            if (endState != -1) {
                builder.addTransition(posData.arriving, endState, POS_SEP);
            } else {
                builder.setAccept(posData.arriving, true);
            }
        }
        pos++;
    }
    return builder.finish();
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) Automaton(org.apache.lucene.util.automaton.Automaton) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Example 52 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class SynonymTokenizer, the method getTS2:

protected TokenStream getTS2() {
    // String s = "Hi-Speed10 foo";
    return new TokenStream() {

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        // Canned tokens emitted by this stream, in order.
        List<Token> lst;

        // Cursor over lst; re-created by reset().
        Iterator<Token> iter;

        {
            lst = new ArrayList<>();
            // Each row: term, startOffset, endOffset, positionIncrement.
            // "speed" has increment 0, i.e. it stacks on the same position as "hispeed".
            Object[][] specs = {
                { "hi", 0, 2, 1 },
                { "hispeed", 0, 8, 1 },
                { "speed", 3, 8, 0 },
                { "10", 8, 10, 1 },
                { "foo", 11, 14, 1 },
            };
            for (Object[] spec : specs) {
                Token t = createToken((String) spec[0], (Integer) spec[1], (Integer) spec[2]);
                t.setPositionIncrement((Integer) spec[3]);
                lst.add(t);
            }
            iter = lst.iterator();
        }

        @Override
        public boolean incrementToken() {
            // Copy the next canned token into the attributes, if any remain.
            if (!iter.hasNext()) {
                return false;
            }
            Token token = iter.next();
            clearAttributes();
            termAtt.setEmpty().append(token);
            posIncrAtt.setPositionIncrement(token.getPositionIncrement());
            offsetAtt.setOffset(token.startOffset(), token.endOffset());
            return true;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            // Rewind to the first canned token so the stream is replayable.
            iter = lst.iterator();
        }
    };
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Iterator(java.util.Iterator) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) List(java.util.List) ArrayList(java.util.ArrayList) NodeList(org.w3c.dom.NodeList) Token(org.apache.lucene.analysis.Token) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 53 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class SynonymTokenizer, the method getTS2a:

// same token-stream as above, but the bigger token comes first this time
// same token-stream as above, but the bigger token comes first this time
protected TokenStream getTS2a() {
    // String s = "Hi-Speed10 foo";
    return new TokenStream() {

        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        // Canned tokens emitted by this stream, in order.
        List<Token> lst;

        // Cursor over lst; re-created by reset().
        Iterator<Token> iter;

        {
            lst = new ArrayList<>();
            // Each row: term, startOffset, endOffset, positionIncrement.
            // "hispeed" leads here; "hi" stacks on its position (increment 0).
            Object[][] specs = {
                { "hispeed", 0, 8, 1 },
                { "hi", 0, 2, 0 },
                { "speed", 3, 8, 1 },
                { "10", 8, 10, 1 },
                { "foo", 11, 14, 1 },
            };
            for (Object[] spec : specs) {
                Token t = createToken((String) spec[0], (Integer) spec[1], (Integer) spec[2]);
                t.setPositionIncrement((Integer) spec[3]);
                lst.add(t);
            }
            iter = lst.iterator();
        }

        @Override
        public boolean incrementToken() {
            // Copy the next canned token into the attributes, if any remain.
            if (!iter.hasNext()) {
                return false;
            }
            Token token = iter.next();
            clearAttributes();
            termAtt.setEmpty().append(token);
            posIncrAtt.setPositionIncrement(token.getPositionIncrement());
            offsetAtt.setOffset(token.startOffset(), token.endOffset());
            return true;
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            // Rewind to the first canned token so the stream is replayable.
            iter = lst.iterator();
        }
    };
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Iterator(java.util.Iterator) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) List(java.util.List) ArrayList(java.util.ArrayList) NodeList(org.w3c.dom.NodeList) Token(org.apache.lucene.analysis.Token) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 54 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class NGramTokenizerTest, the method testNGrams:

static void testNGrams(int minGram, int maxGram, String s, final String nonTokenChars, boolean edgesOnly) throws IOException {
    // Expand the input into code points, and record for each code point the
    // char index at which it starts; offsets[codePoints.length] == s.length().
    final int[] codePoints = toCodePoints(s);
    final int[] offsets = new int[codePoints.length + 1];
    for (int i = 0; i < codePoints.length; ++i) {
        offsets[i + 1] = offsets[i] + Character.charCount(codePoints[i]);
    }
    // Tokenizer under test: chars listed in nonTokenChars are separators.
    final Tokenizer grams = new NGramTokenizer(minGram, maxGram, edgesOnly) {

        @Override
        protected boolean isTokenChar(int chr) {
            return nonTokenChars.indexOf(chr) < 0;
        }
    };
    grams.setReader(new StringReader(s));
    final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
    grams.reset();
    // Enumerate every [gramStart, gramEnd) window the tokenizer should emit,
    // and check each emitted token against the expected gram.
    for (int gramStart = 0; gramStart < codePoints.length; ++gramStart) {
        for (int gramEnd = gramStart + minGram; gramEnd <= gramStart + maxGram && gramEnd <= codePoints.length; ++gramEnd) {
            // In edges-only mode a gram is only emitted when it starts at the
            // beginning of a run, i.e. the previous char is a non-token char.
            boolean expectToken = !(edgesOnly && gramStart > 0 && isTokenChar(nonTokenChars, codePoints[gramStart - 1]));
            // The gram itself must consist entirely of token chars.
            for (int j = gramStart; expectToken && j < gramEnd; ++j) {
                expectToken = isTokenChar(nonTokenChars, codePoints[j]);
            }
            if (!expectToken) {
                continue;
            }
            assertTrue(grams.incrementToken());
            assertArrayEquals(Arrays.copyOfRange(codePoints, gramStart, gramEnd), toCodePoints(termAtt));
            assertEquals(1, posIncAtt.getPositionIncrement());
            assertEquals(1, posLenAtt.getPositionLength());
            assertEquals(offsets[gramStart], offsetAtt.startOffset());
            assertEquals(offsets[gramEnd], offsetAtt.endOffset());
        }
    }
    // No tokens beyond the enumerated grams; final offsets point at the end.
    assertFalse(grams.incrementToken());
    grams.end();
    assertEquals(s.length(), offsetAtt.startOffset());
    assertEquals(s.length(), offsetAtt.endOffset());
}
Also used : PositionLengthAttribute(org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Tokenizer(org.apache.lucene.analysis.Tokenizer) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 55 with OffsetAttribute

use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in project lucene-solr by apache.

From the class TestSimplePatternTokenizer, the method testEndOffset:

public void testEndOffset() throws Exception {
    // Tokenizer that emits maximal runs of 'a'.
    final Tokenizer tokenizer = new SimplePatternTokenizer("a+");
    final CharTermAttribute termAtt = tokenizer.getAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tokenizer.getAttribute(OffsetAttribute.class);
    final String input = "aaabbb";
    tokenizer.setReader(new StringReader(input));
    tokenizer.reset();
    // Only one token matches: the leading run of a's.
    assertTrue(tokenizer.incrementToken());
    assertEquals("aaa", termAtt.toString());
    assertFalse(tokenizer.incrementToken());
    // After end(), endOffset must cover the whole input, even though the
    // trailing "bbb" produced no token.
    tokenizer.end();
    assertEquals(input.length(), offsetAtt.endOffset());
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)82 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)59 TokenStream (org.apache.lucene.analysis.TokenStream)47 StringReader (java.io.StringReader)36 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)33 IOException (java.io.IOException)25 ArrayList (java.util.ArrayList)23 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)17 BytesRef (org.apache.lucene.util.BytesRef)14 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)12 Tokenizer (org.apache.lucene.analysis.Tokenizer)10 Reader (java.io.Reader)9 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)8 Analyzer (org.apache.lucene.analysis.Analyzer)7 Token (org.apache.lucene.analysis.Token)7 TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)7 List (java.util.List)6 PackedTokenAttributeImpl (org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl)5 PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute)5 IndexReader (org.apache.lucene.index.IndexReader)5