Search in sources :

Example 1 with State

Use of org.apache.lucene.util.AttributeSource.State in the lucene-solr project by Apache.

From the class JsonPreAnalyzedParser, method parse.

/**
 * Parses a JSON-serialized pre-analyzed field value into a {@link ParseResult}.
 *
 * <p>The reader is drained completely into memory. An empty input yields an empty result.
 * Otherwise the content must parse to a JSON object (a {@code Map}) whose {@code VERSION_KEY}
 * entry equals {@code VERSION}. {@code STRING_KEY} and {@code BINARY_KEY} must not both be
 * present; the binary value is Base64-decoded. Each element of the {@code TOKENS_KEY} list is
 * applied to {@code parent} attribute by attribute, and the resulting state is captured and
 * appended to the result. Unknown or unparsable token attributes are logged and skipped.
 *
 * @param reader source of the JSON text; read until exhausted (it is not closed here)
 * @param parent attribute source used as scratch space; its attributes are cleared before and
 *               after every token
 * @return the parsed stored value (string or binary) and the captured token states
 * @throws IOException if reading fails, the content is not a JSON object, the version is
 *                     missing or unknown, or both string and binary values are present
 */
@SuppressWarnings("unchecked")
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    while ((cnt = reader.read(buf)) > 0) {
        sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
        return res;
    }
    Object o = ObjectBuilder.fromJSON(val);
    if (!(o instanceof Map)) {
        // o can be null here (presumably for a bare JSON null document), so it must not be
        // dereferenced unguarded; report it as an invalid type instead of failing with an NPE.
        String actualType = (o == null) ? "null" : o.getClass().getName();
        throw new IOException("Invalid JSON type " + actualType + ", expected Map");
    }
    Map<String, Object> map = (Map<String, Object>) o;
    // check version
    String version = (String) map.get(VERSION_KEY);
    if (version == null) {
        throw new IOException("Missing VERSION key");
    }
    if (!VERSION.equals(version)) {
        throw new IOException("Unknown VERSION '" + version + "', expected " + VERSION);
    }
    if (map.containsKey(STRING_KEY) && map.containsKey(BINARY_KEY)) {
        throw new IOException("Field cannot have both stringValue and binaryValue");
    }
    res.str = (String) map.get(STRING_KEY);
    String bin = (String) map.get(BINARY_KEY);
    if (bin != null) {
        byte[] data = Base64.base64ToByteArray(bin);
        res.bin = data;
    }
    List<Object> tokens = (List<Object>) map.get(TOKENS_KEY);
    if (tokens == null) {
        return res;
    }
    // end offset of the previous token; carried across iterations to derive default offsets
    int tokenEnd = 0;
    parent.clearAttributes();
    for (Object ot : tokens) {
        // default start: previous end plus an automatic increment by 1 separator
        int tokenStart = tokenEnd + 1;
        Map<String, Object> tok = (Map<String, Object>) ot;
        boolean hasOffsetEnd = false;
        // length of the token text, or -1 when the token carries no text
        int len = -1;
        for (Entry<String, Object> e : tok.entrySet()) {
            String key = e.getKey();
            if (key.equals(TOKEN_KEY)) {
                CharTermAttribute catt = parent.addAttribute(CharTermAttribute.class);
                String str = String.valueOf(e.getValue());
                catt.append(str);
                len = str.length();
            } else if (key.equals(OFFSET_START_KEY)) {
                Object obj = e.getValue();
                if (obj instanceof Number) {
                    tokenStart = ((Number) obj).intValue();
                } else {
                    try {
                        tokenStart = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        // keep the default start computed above
                        LOG.warn("Invalid " + OFFSET_START_KEY + " attribute, skipped: '" + obj + "'");
                    }
                }
            } else if (key.equals(OFFSET_END_KEY)) {
                hasOffsetEnd = true;
                Object obj = e.getValue();
                if (obj instanceof Number) {
                    tokenEnd = ((Number) obj).intValue();
                } else {
                    try {
                        tokenEnd = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + OFFSET_END_KEY + " attribute, skipped: '" + obj + "'");
                        // fall back to deriving the end offset from the token length
                        hasOffsetEnd = false;
                    }
                }
            } else if (key.equals(POSINCR_KEY)) {
                Object obj = e.getValue();
                // an unparsable value falls back to an increment of 1
                int posIncr = 1;
                if (obj instanceof Number) {
                    posIncr = ((Number) obj).intValue();
                } else {
                    try {
                        posIncr = Integer.parseInt(String.valueOf(obj));
                    } catch (NumberFormatException nfe) {
                        LOG.warn("Invalid " + POSINCR_KEY + " attribute, skipped: '" + obj + "'");
                    }
                }
                PositionIncrementAttribute patt = parent.addAttribute(PositionIncrementAttribute.class);
                patt.setPositionIncrement(posIncr);
            } else if (key.equals(PAYLOAD_KEY)) {
                String str = String.valueOf(e.getValue());
                if (str.length() > 0) {
                    byte[] data = Base64.base64ToByteArray(str);
                    PayloadAttribute p = parent.addAttribute(PayloadAttribute.class);
                    if (data != null && data.length > 0) {
                        p.setPayload(new BytesRef(data));
                    }
                }
            } else if (key.equals(FLAGS_KEY)) {
                try {
                    // flags are serialized as a hexadecimal string
                    int f = Integer.parseInt(String.valueOf(e.getValue()), 16);
                    FlagsAttribute flags = parent.addAttribute(FlagsAttribute.class);
                    flags.setFlags(f);
                } catch (NumberFormatException nfe) {
                    LOG.warn("Invalid " + FLAGS_KEY + " attribute, skipped: '" + e.getValue() + "'");
                }
            } else if (key.equals(TYPE_KEY)) {
                TypeAttribute tattr = parent.addAttribute(TypeAttribute.class);
                tattr.setType(String.valueOf(e.getValue()));
            } else {
                LOG.warn("Unknown attribute, skipped: " + e.getKey() + "=" + e.getValue());
            }
        }
        // handle offset attr
        OffsetAttribute offset = parent.addAttribute(OffsetAttribute.class);
        if (!hasOffsetEnd && len > -1) {
            tokenEnd = tokenStart + len;
        }
        offset.setOffset(tokenStart, tokenEnd);
        // capture state and add to result
        State state = parent.captureState();
        res.states.add(state.clone());
        // reset for reuse
        parent.clearAttributes();
    }
    return res;
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) ParseResult(org.apache.solr.schema.PreAnalyzedField.ParseResult) IOException(java.io.IOException) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) State(org.apache.lucene.util.AttributeSource.State) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) LinkedList(java.util.LinkedList) List(java.util.List) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) TreeMap(java.util.TreeMap) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with State

Use of org.apache.lucene.util.AttributeSource.State in the commons project by Twitter.

From the class TokenGroupAttributeImpl, method getTokenGroupStream.

/**
 * Returns the token group stream backed by this attribute's captured states.
 *
 * <p>When a character sequence is set but no attribute classes or states are available yet,
 * the sequence is tokenized on demand and the state of every token is captured first. The
 * returned stream instance is created once and reused; its states are refreshed on every call.
 *
 * @return the (lazily created) token group stream, loaded with the current states
 */
@Override
public TokenGroupStream getTokenGroupStream() {
    // Materialize the states only on first use, i.e. when this method is actually called.
    boolean statesMissing = attributeClasses == null || states.isEmpty();
    if (statesMissing && seq != null) {
        TokenizedCharSequenceStream tokenizer = new TokenizedCharSequenceStream();
        tokenizer.reset(seq);
        //TODO(alewis) This could probably be lazier. Make a new extension of TokenGroupStream?
        ImmutableList.Builder<State> captured = ImmutableList.builder();
        while (tokenizer.incrementToken()) {
            State snapshot = tokenizer.captureState();
            captured.add(snapshot);
        }
        setAttributeSource(tokenizer);
        setStates(captured.build());
    }
    // Create the stream wrapper once, then only refresh the states it exposes.
    if (tokenGroupStream == null) {
        tokenGroupStream = new TokenGroupStream(attributeClasses);
    }
    tokenGroupStream.setStates(states);
    return tokenGroupStream;
}
Also used : TokenizedCharSequenceStream(com.twitter.common.text.token.TokenizedCharSequenceStream) TokenGroupStream(com.twitter.common.text.token.TokenGroupStream) ImmutableList(com.google.common.collect.ImmutableList) State(org.apache.lucene.util.AttributeSource.State)

Example 3 with State

Use of org.apache.lucene.util.AttributeSource.State in the commons project by Twitter.

From the class TokenGroupAttributeImpl, method clone.

/**
 * Creates a copy of this attribute.
 *
 * <p>The attribute classes and the character sequence are shared with the copy, every state
 * is cloned individually, and the copy starts without a token group stream.
 *
 * @return a new {@code TokenGroupAttributeImpl} carrying cloned states
 */
@Override
public AttributeImpl clone() {
    TokenGroupAttributeImpl copy = new TokenGroupAttributeImpl();
    // The copy must not reuse this instance's stream.
    copy.tokenGroupStream = null;
    // TokenizedCharSequence is an immutable object, so sharing the reference is safe.
    copy.seq = seq;
    // attributeClasses is immutable as well; no copy needed.
    copy.attributeClasses = attributeClasses;
    // States are cloned one by one so the copy does not share them with this instance.
    ImmutableList.Builder<State> clonedStates = ImmutableList.builder();
    for (State original : states) {
        State duplicate = original.clone();
        clonedStates.add(duplicate);
    }
    copy.states = clonedStates.build();
    return copy;
}
Also used : ImmutableList(com.google.common.collect.ImmutableList) State(org.apache.lucene.util.AttributeSource.State)

Example 4 with State

Use of org.apache.lucene.util.AttributeSource.State in the lucene-solr project by Apache.

From the class SimplePreAnalyzedParser, method createState.

/**
 * Loads one parsed token into {@code a} and returns the captured attribute state.
 *
 * <p>The term text comes from {@code state.token}. Recognized attribute keys are {@code i}
 * (position increment), {@code s} / {@code e} (start / end offset), {@code y} (type),
 * {@code f} (flags, hexadecimal) and {@code p} (payload, decoded with {@code hexToBytes});
 * any other key is ignored. Without an explicit start offset, the start is
 * {@code tokenEnd} minus the token length.
 *
 * @param a        attribute source used as scratch space; cleared on entry and before returning
 * @param state    the parsed token text and its raw attribute strings
 * @param tokenEnd default end offset of the token
 * @return the state captured from {@code a} after all attributes were applied
 * @throws NumberFormatException if a numeric attribute value cannot be parsed
 */
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) {
    a.clearAttributes();
    CharTermAttribute term = a.addAttribute(CharTermAttribute.class);
    char[] termChars = state.token.toString().toCharArray();
    term.copyBuffer(termChars, 0, termChars.length);
    // Default offsets; explicit "s" / "e" attributes override them below.
    int startOffset = tokenEnd - termChars.length;
    int endOffset = tokenEnd;
    for (Entry<String, String> attribute : state.attr.entrySet()) {
        String rawValue = attribute.getValue();
        switch (attribute.getKey()) {
            case "i": {
                // position increment
                int increment = Integer.parseInt(rawValue);
                a.addAttribute(PositionIncrementAttribute.class).setPositionIncrement(increment);
                break;
            }
            case "s":
                startOffset = Integer.parseInt(rawValue);
                break;
            case "e":
                endOffset = Integer.parseInt(rawValue);
                break;
            case "y":
                a.addAttribute(TypeAttribute.class).setType(rawValue);
                break;
            case "f": {
                // flags are written in hexadecimal
                FlagsAttribute flagsAtt = a.addAttribute(FlagsAttribute.class);
                flagsAtt.setFlags(Integer.parseInt(rawValue, 16));
                break;
            }
            case "p": {
                PayloadAttribute payloadAtt = a.addAttribute(PayloadAttribute.class);
                byte[] payloadBytes = hexToBytes(rawValue);
                if (payloadBytes != null && payloadBytes.length > 0) {
                    payloadAtt.setPayload(new BytesRef(payloadBytes));
                }
                break;
            }
            default:
                // unknown attribute
                break;
        }
    }
    // handle offset attr
    a.addAttribute(OffsetAttribute.class).setOffset(startOffset, endOffset);
    State captured = a.captureState();
    a.clearAttributes();
    return captured;
}
Also used : FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) State(org.apache.lucene.util.AttributeSource.State) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 5 with State

Use of org.apache.lucene.util.AttributeSource.State in the lucene-solr project by Apache.

From the class SimplePreAnalyzedParser, method parse.

/**
 * Parses the simple text serialization of a pre-analyzed field into a {@link ParseResult}.
 *
 * <p>Layout handled by this method, in order:
 * <ol>
 *   <li>a version token terminated by the first space; it must equal {@code VERSION};</li>
 *   <li>an optional stored part enclosed in {@code =...=}, where {@code \=} stands for a
 *       literal {@code =};</li>
 *   <li>space-separated tokens, each optionally followed by {@code ,name=value} attributes.
 *       Within tokens, names and values a backslash escapes {@code \\}, {@code =},
 *       {@code ,} and space, and {@code \n}, {@code \r}, {@code \t} produce the matching
 *       control characters.</li>
 * </ol>
 * An entirely empty input is accepted and produces an empty result.
 *
 * @param reader source of the serialized text; read until exhausted (it is not closed here)
 * @param parent attribute source handed to {@code createState} for building token states
 * @return the stored value (if a stored part was present) and the captured token states
 * @throws IOException if reading fails, the version is missing or unknown, the stored part is
 *                     not terminated, or an attribute is malformed
 */
@Override
public ParseResult parse(Reader reader, AttributeSource parent) throws IOException {
    ParseResult res = new ParseResult();
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    int cnt;
    while ((cnt = reader.read(buf)) > 0) {
        sb.append(buf, 0, cnt);
    }
    String val = sb.toString();
    // empty string - accept even without version number
    if (val.length() == 0) {
        return res;
    }
    // first consume the version
    int idx = val.indexOf(' ');
    if (idx == -1) {
        throw new IOException("Missing VERSION token");
    }
    String version = val.substring(0, idx);
    if (!VERSION.equals(version)) {
        throw new IOException("Unknown VERSION " + version);
    }
    val = val.substring(idx + 1);
    // then consume the optional stored part
    int tsStart = 0;
    boolean hasStored = false;
    StringBuilder storedBuf = new StringBuilder();
    // val is empty when nothing follows the version token, so check the length before
    // looking at the first character (charAt(0) would otherwise throw).
    if (val.length() > 0 && val.charAt(0) == '=') {
        hasStored = true;
        if (val.length() > 1) {
            for (int i = 1; i < val.length(); i++) {
                char c = val.charAt(i);
                if (c == '\\') {
                    if (i < val.length() - 1) {
                        c = val.charAt(++i);
                        if (c == '=') {
                            // we recognize only \= escape in the stored part
                            storedBuf.append('=');
                        } else {
                            // any other escape is kept verbatim, backslash included
                            storedBuf.append('\\');
                            storedBuf.append(c);
                            continue;
                        }
                    } else {
                        // trailing backslash: keep it as-is
                        storedBuf.append(c);
                        continue;
                    }
                } else if (c == '=') {
                    // end of stored text
                    tsStart = i + 1;
                    break;
                } else {
                    storedBuf.append(c);
                }
            }
            if (tsStart == 0) {
                // missing end-of-stored marker
                throw new IOException("Missing end marker of stored part");
            }
        } else {
            throw new IOException("Unexpected end of stored field");
        }
    }
    if (hasStored) {
        res.str = storedBuf.toString();
    }
    Tok tok = new Tok();
    StringBuilder attName = new StringBuilder();
    StringBuilder attVal = new StringBuilder();
    // parser state
    S s = S.UNDEF;
    // running count of token characters consumed; passed to createState as the token end
    int lastPos = 0;
    for (int i = tsStart; i < val.length(); i++) {
        char c = val.charAt(i);
        if (c == ' ') {
            // an unescaped space ends the current token: collect leftovers
            switch (s) {
                case VALUE:
                    if (attVal.length() == 0) {
                        throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
                    }
                    if (attName.length() > 0) {
                        tok.attr.put(attName.toString(), attVal.toString());
                    }
                    break;
                case NAME:
                    // attr name without a value ?
                    if (attName.length() > 0) {
                        throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
                    }
                    // otherwise accept missing att name and value
                    break;
                case TOKEN:
                case UNDEF:
                    // nothing pending
                    break;
            }
            attName.setLength(0);
            attVal.setLength(0);
            if (!tok.isEmpty() || s == S.NAME) {
                AttributeSource.State state = createState(parent, tok, lastPos);
                if (state != null) {
                    res.states.add(state.clone());
                }
            }
            // reset tok
            s = S.UNDEF;
            tok.reset();
            // skip
            lastPos++;
            continue;
        }
        // pick the buffer the current character belongs to
        StringBuilder tgt = null;
        switch (s) {
            case TOKEN:
                tgt = tok.token;
                break;
            case NAME:
                tgt = attName;
                break;
            case VALUE:
                tgt = attVal;
                break;
            case UNDEF:
                // first character of a new token
                tgt = tok.token;
                s = S.TOKEN;
                break;
        }
        if (c == '\\') {
            if (s == S.TOKEN) {
                lastPos++;
            }
            if (i >= val.length() - 1) {
                // end
                tgt.append(c);
                continue;
            } else {
                c = val.charAt(++i);
                switch (c) {
                    case '\\':
                    case '=':
                    case ',':
                    case ' ':
                        tgt.append(c);
                        break;
                    case 'n':
                        tgt.append('\n');
                        break;
                    case 'r':
                        tgt.append('\r');
                        break;
                    case 't':
                        tgt.append('\t');
                        break;
                    default:
                        // unrecognized escape: keep both characters
                        tgt.append('\\');
                        tgt.append(c);
                        lastPos++;
                }
            }
        } else {
            // state switch
            if (c == ',') {
                if (s == S.TOKEN) {
                    s = S.NAME;
                } else if (s == S.VALUE) {
                    // end of value, start of next attr
                    if (attVal.length() == 0) {
                        throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
                    }
                    if (attName.length() > 0 && attVal.length() > 0) {
                        tok.attr.put(attName.toString(), attVal.toString());
                    }
                    // reset
                    attName.setLength(0);
                    attVal.setLength(0);
                    s = S.NAME;
                } else {
                    throw new IOException("Unexpected character '" + c + "' at position " + i + " - missing attribute value.");
                }
            } else if (c == '=') {
                if (s == S.NAME) {
                    s = S.VALUE;
                } else {
                    throw new IOException("Unexpected character '" + c + "' at position " + i + " - empty value of attribute.");
                }
            } else {
                tgt.append(c);
                if (s == S.TOKEN) {
                    lastPos++;
                }
            }
        }
    }
    // collect leftovers
    if (!tok.isEmpty() || s == S.NAME || s == S.VALUE) {
        // remaining attrib?
        if (s == S.VALUE) {
            if (attName.length() > 0 && attVal.length() > 0) {
                tok.attr.put(attName.toString(), attVal.toString());
            }
        }
        AttributeSource.State state = createState(parent, tok, lastPos);
        if (state != null) {
            res.states.add(state.clone());
        }
    }
    return res;
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) ParseResult(org.apache.solr.schema.PreAnalyzedField.ParseResult) State(org.apache.lucene.util.AttributeSource.State) IOException(java.io.IOException)

Aggregations

State (org.apache.lucene.util.AttributeSource.State)5 ImmutableList (com.google.common.collect.ImmutableList)2 IOException (java.io.IOException)2 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)2 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)2 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)2 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)2 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)2 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)2 BytesRef (org.apache.lucene.util.BytesRef)2 ParseResult (org.apache.solr.schema.PreAnalyzedField.ParseResult)2 TokenGroupStream (com.twitter.common.text.token.TokenGroupStream)1 TokenizedCharSequenceStream (com.twitter.common.text.token.TokenizedCharSequenceStream)1 LinkedHashMap (java.util.LinkedHashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 TreeMap (java.util.TreeMap)1 AttributeSource (org.apache.lucene.util.AttributeSource)1