Example 21 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From the class DelimitedPayloadTokenFilterTest, method testPayloads.

public void testPayloads() throws Exception {
    String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(whitespaceMockTokenizer(test), DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
    filter.reset();
    assertTermEquals("The", filter, termAtt, payAtt, null);
    assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
    assertTermEquals("red", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
    assertTermEquals("fox", filter, termAtt, payAtt, "NN".getBytes(StandardCharsets.UTF_8));
    assertTermEquals("jumped", filter, termAtt, payAtt, "VB".getBytes(StandardCharsets.UTF_8));
    assertTermEquals("over", filter, termAtt, payAtt, null);
    assertTermEquals("the", filter, termAtt, payAtt, null);
    assertTermEquals("lazy", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
    assertTermEquals("brown", filter, termAtt, payAtt, "JJ".getBytes(StandardCharsets.UTF_8));
    assertTermEquals("dogs", filter, termAtt, payAtt, "NN".getBytes(StandardCharsets.UTF_8));
    assertFalse(filter.incrementToken());
    filter.end();
    filter.close();
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
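In normal use this filter is wired into an Analyzer rather than driven by hand as the test does. A minimal sketch of that wiring, assuming the standard Analyzer/TokenStreamComponents API; the PayloadAnalyzer class name and the WhitespaceTokenizer choice are illustrative, not taken from the test above:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.IdentityEncoder;

public class PayloadAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        // Split on whitespace, then peel "term|payload" suffixes off into payloads.
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream sink = new DelimitedPayloadTokenFilter(source,
                DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
        return new TokenStreamComponents(source, sink);
    }
}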

Example 22 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From the class DelimitedPayloadTokenFilterTest, method assertTermEquals.

void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class);
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    BytesRef payload = payloadAtt.getPayload();
    if (payload != null) {
        assertTrue(payload.length + " does not equal: " + expectPay.length, payload.length == expectPay.length);
        for (int i = 0; i < expectPay.length; i++) {
            assertTrue(expectPay[i] + " does not equal: " + payload.bytes[i + payload.offset], expectPay[i] == payload.bytes[i + payload.offset]);
        }
    } else {
        assertTrue("expectPay is not null and it should be", expectPay == null);
    }
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), BytesRef (org.apache.lucene.util.BytesRef)
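Examples 21 and 23 call a five-argument overload of assertTermEquals that reuses attributes fetched once up front instead of looking them up on every call. That overload is not shown on this page; a sketch of what it presumably looks like, mirroring the three-argument version above:

void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
    // Same checks as above, but against the caller's cached attribute references.
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    BytesRef payload = payAtt.getPayload();
    if (payload != null) {
        assertNotNull("got a payload but expected none", expectPay);
        assertEquals(expectPay.length, payload.length);
        for (int i = 0; i < expectPay.length; i++) {
            assertEquals(expectPay[i], payload.bytes[i + payload.offset]);
        }
    } else {
        assertNull("expected a payload but the token carried none", expectPay);
    }
}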

Example 23 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From the class DelimitedPayloadTokenFilterTest, method testFloatEncoding.

public void testFloatEncoding() throws Exception {
    String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(whitespaceMockTokenizer(test), '|', new FloatEncoder());
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
    filter.reset();
    assertTermEquals("The", filter, termAtt, payAtt, null);
    assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeFloat(1.0f));
    assertTermEquals("red", filter, termAtt, payAtt, PayloadHelper.encodeFloat(2.0f));
    assertTermEquals("fox", filter, termAtt, payAtt, PayloadHelper.encodeFloat(3.5f));
    assertTermEquals("jumped", filter, termAtt, payAtt, PayloadHelper.encodeFloat(0.5f));
    assertTermEquals("over", filter, termAtt, payAtt, null);
    assertTermEquals("the", filter, termAtt, payAtt, null);
    assertTermEquals("lazy", filter, termAtt, payAtt, PayloadHelper.encodeFloat(5.0f));
    assertTermEquals("brown", filter, termAtt, payAtt, PayloadHelper.encodeFloat(99.3f));
    assertTermEquals("dogs", filter, termAtt, payAtt, PayloadHelper.encodeFloat(83.7f));
    assertFalse(filter.incrementToken());
    filter.end();
    filter.close();
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
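On the consuming side, the four float bytes written by FloatEncoder can be turned back into a float with PayloadHelper.decodeFloat from the same org.apache.lucene.analysis.payloads package. An illustrative fragment, assuming a freshly reset stream with the same attributes as in the test above:

while (filter.incrementToken()) {
    BytesRef payload = payAtt.getPayload();
    if (payload != null) {
        // decodeFloat reads four bytes starting at the given offset
        float weight = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
        System.out.println(termAtt + " -> " + weight); // e.g. quick -> 1.0
    }
}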

Example 24 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From the class TokenStreamFromTermVector, method init.

//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
    assert !initialized;
    short dpEnumFlags = PostingsEnum.POSITIONS;
    if (vector.hasOffsets()) {
        dpEnumFlags |= PostingsEnum.OFFSETS;
        offsetAttribute = addAttribute(OffsetAttribute.class);
    }
    if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
        //must ask for offsets too
        dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
        payloadAttribute = getAttribute(PayloadAttribute.class);
        payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
        spareBytesRefBuilder = new BytesRefBuilder();
    }
    // We put term data here
    termCharsBuilder = new CharsRefBuilder();
    //7 is over-estimate of average term len
    termCharsBuilder.grow((int) (vector.size() * 7));
    // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
    TokenLL[] positionedTokens = initTokensArray();
    int lastPosition = -1;
    final TermsEnum termsEnum = vector.iterator();
    BytesRef termBytesRef;
    PostingsEnum dpEnum = null;
    //only for UTF8->UTF16 call
    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();
    //int sumFreq = 0;
    while ((termBytesRef = termsEnum.next()) != null) {
        //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
        // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
        tempCharsRefBuilder.grow(termBytesRef.length);
        final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
        final int termCharsOff = termCharsBuilder.length();
        termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
        // presumably checked by TokenSources.hasPositions earlier
        assert dpEnum != null;
        dpEnum.nextDoc();
        final int freq = dpEnum.freq();
        //sumFreq += freq;
        for (int j = 0; j < freq; j++) {
            int pos = dpEnum.nextPosition();
            TokenLL token = new TokenLL();
            token.termCharsOff = termCharsOff;
            token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
            if (offsetAttribute != null) {
                token.startOffset = dpEnum.startOffset();
                if (token.startOffset > maxStartOffset) {
                    //filter this token out; exceeds threshold
                    continue;
                }
                token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
                if (pos == -1) {
                    // no position data in the term vector; approximate a position from the start offset (divide by 8)
                    pos = token.startOffset >> 3;
                }
            }
            if (payloadAttribute != null) {
                final BytesRef payload = dpEnum.getPayload();
                token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
            }
            //Add token to an array indexed by position
            if (positionedTokens.length <= pos) {
                //grow, but not 2x since we think our original length estimate is close
                TokenLL[] newPositionedTokens = new TokenLL[(int) ((pos + 1) * 1.5f)];
                System.arraycopy(positionedTokens, 0, newPositionedTokens, 0, lastPosition + 1);
                positionedTokens = newPositionedTokens;
            }
            positionedTokens[pos] = token.insertIntoSortedLinkedList(positionedTokens[pos]);
            lastPosition = Math.max(lastPosition, pos);
        }
    }
    //    System.out.println(String.format(
    //        "SumFreq: %5d Size: %4d SumFreq/size: %3.3f MaxPos: %4d MaxPos/SumFreq: %3.3f WastePct: %3.3f",
    //        sumFreq, vector.size(), (sumFreq / (float)vector.size()), lastPosition, ((float)lastPosition)/sumFreq,
    //        (originalPositionEstimate/(lastPosition + 1.0f))));
    // Step 2:  Link all Tokens into a linked-list and set position increments as we go
    int prevTokenPos = -1;
    TokenLL prevToken = null;
    for (int pos = 0; pos <= lastPosition; pos++) {
        TokenLL token = positionedTokens[pos];
        if (token == null) {
            continue;
        }
        //link
        if (prevToken != null) {
            assert prevToken.next == null;
            //concatenate linked-list
            prevToken.next = token;
        } else {
            assert firstToken == null;
            firstToken = token;
        }
        //set increments
        if (vector.hasPositions()) {
            token.positionIncrement = pos - prevTokenPos;
            while (token.next != null) {
                token = token.next;
                token.positionIncrement = 0;
            }
        } else {
            token.positionIncrement = 1;
            while (token.next != null) {
                prevToken = token;
                token = token.next;
                if (prevToken.startOffset == token.startOffset) {
                    token.positionIncrement = 0;
                } else {
                    token.positionIncrement = 1;
                }
            }
        }
        prevTokenPos = pos;
        prevToken = token;
    }
    initialized = true;
}
Also used: PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder), BytesRefArray (org.apache.lucene.util.BytesRefArray), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder), PostingsEnum (org.apache.lucene.index.PostingsEnum), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
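Note the hasAttribute(PayloadAttribute.class) check above: init() only gathers payloads if the consumer registered interest before the stream is first advanced. A hedged consumer sketch; the TokenStreamFromTermVector constructor arguments shown here are assumed, not taken from this page:

TokenStream ts = new TokenStreamFromTermVector(vector, maxStartOffset); // constructor signature assumed
// Register the payload attribute BEFORE consuming, or init() will skip payloads.
PayloadAttribute payAtt = ts.addAttribute(PayloadAttribute.class);
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    BytesRef payload = payAtt.getPayload(); // null when the term vector stored none for this token
}
ts.end();
ts.close();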

Example 25 with PayloadAttribute

Use of org.apache.lucene.analysis.tokenattributes.PayloadAttribute in project lucene-solr by apache.

From the class JsonPreAnalyzedParser, method toFormattedString.

@Override
public String toFormattedString(Field f) throws IOException {
    Map<String, Object> map = new LinkedHashMap<>();
    map.put(VERSION_KEY, VERSION);
    if (f.fieldType().stored()) {
        String stringValue = f.stringValue();
        if (stringValue != null) {
            map.put(STRING_KEY, stringValue);
        }
        BytesRef binaryValue = f.binaryValue();
        if (binaryValue != null) {
            map.put(BINARY_KEY, Base64.byteArrayToBase64(binaryValue.bytes, binaryValue.offset, binaryValue.length));
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        List<Map<String, Object>> tokens = new LinkedList<>();
        while (ts.incrementToken()) {
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            Map<String, Object> tok = new TreeMap<>();
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = new String(catt.buffer(), 0, catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    tTerm = tatt.getBytesRef().utf8ToString();
                } else {
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.put(FLAGS_KEY, Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.put(OFFSET_START_KEY, ((OffsetAttribute) att).startOffset());
                        tok.put(OFFSET_END_KEY, ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.put(PAYLOAD_KEY, Base64.byteArrayToBase64(p.bytes, p.offset, p.length));
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.put(POSINCR_KEY, ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.put(TYPE_KEY, ((TypeAttribute) att).type());
                    } else {
                        tok.put(cl.getName(), att.toString());
                    }
                }
            }
            String term = null;
            if (cTerm != null) {
                term = cTerm;
            } else {
                term = tTerm;
            }
            if (term != null && term.length() > 0) {
                tok.put(TOKEN_KEY, term);
            }
            tokens.add(tok);
        }
        map.put(TOKENS_KEY, tokens);
    }
    return JSONUtil.toJSON(map, -1);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Attribute (org.apache.lucene.util.Attribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), TreeMap (java.util.TreeMap), LinkedList (java.util.LinkedList), LinkedHashMap (java.util.LinkedHashMap), Map (java.util.Map), BytesRef (org.apache.lucene.util.BytesRef)
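For reference, the serialized output for a short two-token field might look like the following. The sample is hand-written for illustration, not generated output; the short key names correspond to the parser's constants (TOKEN_KEY, OFFSET_START_KEY, OFFSET_END_KEY, POSINCR_KEY, TYPE_KEY, PAYLOAD_KEY), assuming Solr's documented PreAnalyzedField JSON format:

{"v":"1","str":"quick brown","tokens":[
  {"t":"quick","s":0,"e":5,"i":1,"y":"word"},
  {"t":"brown","s":6,"e":11,"i":1,"y":"word","p":"Sko="}
]}

Here "p" carries the Base64-encoded payload bytes ("Sko=" decodes to the two bytes of "JJ") and is written only when the payload is non-empty, matching the length > 0 check in the code above.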

Aggregations

PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 27 usages
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 16 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 14 usages
BytesRef (org.apache.lucene.util.BytesRef): 13 usages
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 12 usages
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 11 usages
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 10 usages
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 7 usages
StringReader (java.io.StringReader): 6 usages
IOException (java.io.IOException): 5 usages
Document (org.apache.lucene.document.Document): 5 usages
Reader (java.io.Reader): 4 usages
Token (org.apache.lucene.analysis.Token): 4 usages
Field (org.apache.lucene.document.Field): 4 usages
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 3 usages
TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute): 3 usages
LinkedHashMap (java.util.LinkedHashMap): 2 usages
LinkedList (java.util.LinkedList): 2 usages
Map (java.util.Map): 2 usages
TreeMap (java.util.TreeMap): 2 usages