Example 56 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the project lucene-solr by Apache.

From the class TestRemoveDuplicatesTokenFilter, method testDups.

public void testDups(final String expected, final Token... tokens) throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    // Wrap an anonymous TokenStream that replays the supplied tokens one by one.
    final TokenStream ts = new RemoveDuplicatesTokenFilter(new TokenStream() {

        CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

        @Override
        public boolean incrementToken() {
            if (toks.hasNext()) {
                clearAttributes();
                Token tok = toks.next();
                termAtt.setEmpty().append(tok);
                offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
                posIncAtt.setPositionIncrement(tok.getPositionIncrement());
                return true;
            } else {
                return false;
            }
        }
    });
    assertTokenStreamContents(ts, expected.split("\\s"));
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Token (org.apache.lucene.analysis.Token), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
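As context, a hedged sketch of how this helper might be invoked: the real test class defines a similar tok(...) utility, but the version below and the token values are illustrative, assuming Token's (CharSequence, int, int) constructor.

static Token tok(int posInc, String text, int start, int end) {
    // Build a Token with explicit offsets and position increment.
    Token t = new Token(text, start, end);
    t.setPositionIncrement(posInc);
    return t;
}

public void testSimpleDup() throws Exception {
    // "b" repeated at the same position (increment 0) is the duplicate the filter drops.
    testDups("a b c",
        tok(1, "a", 0, 1),
        tok(1, "b", 2, 3),
        tok(0, "b", 2, 3),
        tok(1, "c", 4, 5));
}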

Example 57 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the project lucene-solr by Apache.

From the class TokenStreamFromTermVector, method init.

// We delay initialization so that we can see which attributes the consumer wants, particularly payloads.
private void init() throws IOException {
    assert !initialized;
    short dpEnumFlags = PostingsEnum.POSITIONS;
    if (vector.hasOffsets()) {
        dpEnumFlags |= PostingsEnum.OFFSETS;
        offsetAttribute = addAttribute(OffsetAttribute.class);
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
        //must ask for offsets too
        dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        payloadAttribute = getAttribute(PayloadAttribute.class);
        payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
        spareBytesRefBuilder = new BytesRefBuilder();
    }
    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
    // 7 is an over-estimate of the average term length
    termCharsBuilder.grow((int) (vector.size() * 7));
    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
    TokenLL[] positionedTokens = initTokensArray();
    int lastPosition = -1;
    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
    //only for UTF8->UTF16 call
    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
    //int sumFreq = 0;
    while ((termBytesRef = termsEnum.next()) != null) {
        //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
        // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
        tempCharsRefBuilder.grow(termBytesRef.length);
        final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
        final int termCharsOff = termCharsBuilder.length();
        termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
        // presumably checked by TokenSources.hasPositions earlier
        assert dpEnum != null;
        dpEnum.nextDoc();
        final int freq = dpEnum.freq();
        //sumFreq += freq;
        for (int j = 0; j < freq; j++) {
            int pos = dpEnum.nextPosition();
            TokenLL token = new TokenLL();
            token.termCharsOff = termCharsOff;
            token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
            if (offsetAttribute != null) {
                token.startOffset = dpEnum.startOffset();
                if (token.startOffset > maxStartOffset) {
                    //filter this token out; exceeds threshold
                    continue;
                }
                token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
                if (pos == -1) {
                    // no positions in the term vector; synthesize one from the start offset (divide by 8)
                    pos = token.startOffset >> 3;
                }
            }
            if (payloadAttribute != null) {
                final BytesRef payload = dpEnum.getPayload();
                token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
            }
            //Add token to an array indexed by position
            if (positionedTokens.length <= pos) {
                //grow, but not 2x since we think our original length estimate is close
                TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
                System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
                positionedTokens = newPositionedTokens;
            }
            positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
            lastPosition = Math.max(lastPosition, pos);
        }
    }
    //    System.out.println(String.format(
    //        "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
    //        sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
    //        (originalPositionEstimate/(lastPosition + 1.0f))));
    // Step 2:  Link all Tokens into a linked-list and set position increments as we go
    int prevTokenPos = -1;
    TokenLL prevToken = null;
    for (int pos = 0; pos <= lastPosition; pos++) {
        TokenLL token = positionedTokens[pos];
        if (token == null) {
            continue;
        }
        //link
        if (prevToken != null) {
            assert prevToken.next == null;
            //concatenate linked-list
            prevToken.next = token;
        } else {
            assert firstToken == null;
            firstToken = token;
        }
        //set increments
        if (vector.hasPositions()) {
            token.positionIncrement = pos - prevTokenPos;
            while (token.next != null) {
                token = token.next;
                token.positionIncrement = 0;
            }
        } else {
            token.positionIncrement = 1;
            while (token.next != null) {
                prevToken = token;
                token = token.next;
                if (prevToken.startOffset == token.startOffset) {
                    token.positionIncrement = 0;
                } else {
                    token.positionIncrement = 1;
                }
            }
        }
        prevTokenPos = pos;
        prevToken = token;
    }
    initialized = true;
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), BytesRefArray (org.apache.lucene.util.BytesRefArray), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
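To place init() in context, a minimal consumption sketch follows. It assumes an open IndexReader named reader, a docId whose "body" field was indexed with term vectors and positions, and the class's (Terms, int maxStartOffset) constructor; all of these names are assumptions for illustration.

Terms vector = reader.getTermVector(docId, "body");
TokenStream ts = new TokenStreamFromTermVector(vector, -1); // -1: no maxStartOffset cap
// Attributes are added before iterating so the lazy init() can see which
// ones (e.g. payloads) the consumer actually wants.
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
ts.reset();
while (ts.incrementToken()) {  // init() runs lazily once iteration starts
    System.out.println(termAtt + " [" + offsetAtt.startOffset() + ".." + offsetAtt.endOffset() + ")");
}
ts.end();
ts.close();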

Example 58 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the project lucene-solr by Apache.

From the class JsonPreAnalyzedParser, method toFormattedString.

@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Attribute (org.apache.lucene.util.Attribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), TreeMap (java.util.TreeMap), LinkedList (java.util.LinkedList), LinkedHashMap (java.util.LinkedHashMap), Map (java.util.Map), BytesRef (org.apache.lucene.util.BytesRef)
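For orientation, the serialized output of this method looks roughly like the sketch below, using the short key constants seen in the code ("v" for VERSION_KEY, "str" for STRING_KEY, "t" for TOKEN_KEY, "s"/"e" for the offset keys, "i" for POSINCR_KEY, "y" for TYPE_KEY), consistent with Solr's documented PreAnalyzed JSON format; the field value and token details are illustrative.

{"v":"1","str":"Hello World","tokens":[
  {"t":"hello","s":0,"e":5,"i":1,"y":"<ALPHANUM>"},
  {"t":"world","s":6,"e":11,"i":1,"y":"<ALPHANUM>"}]}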

Example 59 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the project lucene-solr by Apache.

From the class SimplePreAnalyzedParser, method createState.

private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
    a.clearAttributes();
    CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class);
    char[] tokChars = state.token.toString().toCharArray();
    termAtt.copyBuffer(tokChars, 0, tokChars.length);
    int tokenStart = tokenEnd - state.token.length();
    for (Entry<String, String> e : state.attr.entrySet()) {
        String k = e.getKey();
        if (k.equals("i")) {
            // position increment
            int incr = Integer.parseInt(e.getValue());
            PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class);
            posIncr.setPositionIncrement(incr);
        } else if (k.equals("s")) {
            tokenStart = Integer.parseInt(e.getValue());
        } else if (k.equals("e")) {
            tokenEnd = Integer.parseInt(e.getValue());
        } else if (k.equals("y")) {
            TypeAttribute type = a.addAttribute(TypeAttribute.class);
            type.setType(e.getValue());
        } else if (k.equals("f")) {
            FlagsAttribute flags = a.addAttribute(FlagsAttribute.class);
            int f = Integer.parseInt(e.getValue(), 16);
            flags.setFlags(f);
        } else if (k.equals("p")) {
            PayloadAttribute p = a.addAttribute(PayloadAttribute.class);
            byte[] data = hexToBytes(e.getValue());
            if (data != null && data.length > 0) {
                p.setPayload(new BytesRef(data));
            }
        } else {
        // unknown attribute
        }
    }
    // handle offset attr
    OffsetAttribute offset = a.addAttribute(OffsetAttribute.class);
    offset.setOffset(tokenStart, tokenEnd);
    State resState = a.captureState();
    a.clearAttributes();
    return resState;
}
Also used: FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), State (org.apache.lucene.util.AttributeSource.State), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), BytesRef (org.apache.lucene.util.BytesRef), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
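The attribute keys handled above follow Solr's simple pre-analyzed text format: "i" is the position increment, "s" and "e" override the start/end offsets, "y" is the token type, "f" is flags in hex, and "p" is a hex-encoded payload. A sketch of an input this parser would decode (the leading "1" is the format version; token text and values are illustrative):

1 one,i=1,s=0,e=3,y=word two,i=1,s=4,e=7,f=2,p=0a1b three,i=1,s=8,e=13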

Example 60 with OffsetAttribute

Use of org.apache.lucene.analysis.tokenattributes.OffsetAttribute in the project lucene-solr by Apache.

From the class TestDuelingAnalyzers, method assertEquals.

// we only check a few core attributes here.
// TODO: test other things
public void assertEquals(String s, TokenStream left, TokenStream right) throws Exception {
    left.reset();
    right.reset();
    CharTermAttribute leftTerm = left.addAttribute(CharTermAttribute.class);
    CharTermAttribute rightTerm = right.addAttribute(CharTermAttribute.class);
    OffsetAttribute leftOffset = left.addAttribute(OffsetAttribute.class);
    OffsetAttribute rightOffset = right.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute leftPos = left.addAttribute(PositionIncrementAttribute.class);
    PositionIncrementAttribute rightPos = right.addAttribute(PositionIncrementAttribute.class);
    while (left.incrementToken()) {
        assertTrue("wrong number of tokens for input: " + s, right.incrementToken());
        assertEquals("wrong term text for input: " + s, leftTerm.toString(), rightTerm.toString());
        assertEquals("wrong position for input: " + s, leftPos.getPositionIncrement(), rightPos.getPositionIncrement());
        assertEquals("wrong start offset for input: " + s, leftOffset.startOffset(), rightOffset.startOffset());
        assertEquals("wrong end offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    }
    assertFalse("wrong number of tokens for input: " + s, right.incrementToken());
    left.end();
    right.end();
    assertEquals("wrong final offset for input: " + s, leftOffset.endOffset(), rightOffset.endOffset());
    left.close();
    right.close();
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
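A hedged sketch of how such a duel might be driven; the WhitespaceAnalyzer pair below is a stand-in for the test's actual fixtures, chosen only so the two streams trivially agree.

String s = "The quick brown fox";
try (Analyzer left = new WhitespaceAnalyzer();
     Analyzer right = new WhitespaceAnalyzer()) {
    // Both sides must agree token-for-token on term text, position
    // increments, and offsets.
    assertEquals(s, left.tokenStream("field", s), right.tokenStream("field", s));
}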

Aggregations

OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 82 usages
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 59 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 47 usages
StringReader (java.io.StringReader): 36 usages
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33 usages
IOException (java.io.IOException): 25 usages
ArrayList (java.util.ArrayList): 23 usages
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 17 usages
BytesRef (org.apache.lucene.util.BytesRef): 14 usages
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 12 usages
Tokenizer (org.apache.lucene.analysis.Tokenizer): 10 usages
Reader (java.io.Reader): 9 usages
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 8 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 7 usages
Token (org.apache.lucene.analysis.Token): 7 usages
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 7 usages
List (java.util.List): 6 usages
PackedTokenAttributeImpl (org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl): 5 usages
PositionLengthAttribute (org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute): 5 usages
IndexReader (org.apache.lucene.index.IndexReader): 5 usages