Search in sources :

Example 1 with Attribute

use of org.apache.lucene.util.Attribute in project lucene-solr by apache.

the class SimplePreAnalyzedParser method toFormattedString.

/**
 * Serializes a field into the simple pre-analyzed text form assembled here:
 * the {@code VERSION} constant followed by a space, an optional stored string
 * value wrapped in {@code =...=}, and then one space-separated entry per token
 * of the field's token stream.
 *
 * <p>Each token entry consists of the term (when one is present) followed by
 * comma-separated {@code key=value} pairs: {@code f} (flags, hexadecimal),
 * {@code s} and {@code e} (start and end offset), {@code p} (payload, rendered
 * by {@code bytesToHex}), {@code i} (position increment) and {@code y} (type).
 * Any other attribute is written as {@code <class name>=<escaped toString()>}.
 *
 * @param f the field to serialize
 * @return the formatted representation of the field
 * @throws IOException declared because {@code incrementToken()} is called on the stream
 */
@Override
public String toFormattedString(Field f) throws IOException {
    StringBuilder sb = new StringBuilder();
    sb.append(VERSION + " ");
    if (f.fieldType().stored()) {
        String s = f.stringValue();
        if (s != null) {
            // Encode the equals sign as "\=". String.replace is literal; the previous
            // replaceAll("=", "\\=") treated the backslash in the replacement as an
            // escape character and therefore emitted a plain "=" (no escaping at all),
            // which let a '=' inside the value terminate the stored section early.
            s = s.replace("=", "\\=");
            sb.append('=');
            sb.append(s);
            sb.append('=');
        }
    }
    TokenStream ts = f.tokenStreamValue();
    if (ts != null) {
        // Scratch buffer for the attribute list of the current token; reused per token.
        StringBuilder tok = new StringBuilder();
        boolean next = false;
        while (ts.incrementToken()) {
            // Tokens are separated by a single space (none before the first one).
            if (next) {
                sb.append(' ');
            } else {
                next = true;
            }
            tok.setLength(0);
            Iterator<Class<? extends Attribute>> it = ts.getAttributeClassesIterator();
            String cTerm = null;
            String tTerm = null;
            while (it.hasNext()) {
                Class<? extends Attribute> cl = it.next();
                Attribute att = ts.getAttribute(cl);
                if (att == null) {
                    continue;
                }
                if (cl.isAssignableFrom(CharTermAttribute.class)) {
                    CharTermAttribute catt = (CharTermAttribute) att;
                    cTerm = escape(catt.buffer(), catt.length());
                } else if (cl.isAssignableFrom(TermToBytesRefAttribute.class)) {
                    TermToBytesRefAttribute tatt = (TermToBytesRefAttribute) att;
                    char[] tTermChars = tatt.getBytesRef().utf8ToString().toCharArray();
                    tTerm = escape(tTermChars, tTermChars.length);
                } else {
                    // Every non-term attribute is a comma-separated key=value pair.
                    if (tok.length() > 0) {
                        tok.append(',');
                    }
                    if (cl.isAssignableFrom(FlagsAttribute.class)) {
                        tok.append("f=" + Integer.toHexString(((FlagsAttribute) att).getFlags()));
                    } else if (cl.isAssignableFrom(OffsetAttribute.class)) {
                        tok.append("s=" + ((OffsetAttribute) att).startOffset() + ",e=" + ((OffsetAttribute) att).endOffset());
                    } else if (cl.isAssignableFrom(PayloadAttribute.class)) {
                        BytesRef p = ((PayloadAttribute) att).getPayload();
                        if (p != null && p.length > 0) {
                            tok.append("p=" + bytesToHex(p.bytes, p.offset, p.length));
                        } else if (tok.length() > 0) {
                            // No payload to write: remove the separator comma appended above.
                            tok.setLength(tok.length() - 1);
                        }
                    } else if (cl.isAssignableFrom(PositionIncrementAttribute.class)) {
                        tok.append("i=" + ((PositionIncrementAttribute) att).getPositionIncrement());
                    } else if (cl.isAssignableFrom(TypeAttribute.class)) {
                        tok.append("y=" + escape(((TypeAttribute) att).type()));
                    } else {
                        tok.append(cl.getName() + "=" + escape(att.toString()));
                    }
                }
            }
            // The char-based term takes precedence over the bytes-based term.
            String term = (cTerm != null) ? cTerm : tTerm;
            if (term != null && term.length() > 0) {
                // The term always comes first, ahead of any attribute pairs.
                if (tok.length() > 0) {
                    tok.insert(0, term + ",");
                } else {
                    tok.insert(0, term);
                }
            }
            sb.append(tok);
        }
    }
    return sb.toString();
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Attribute(org.apache.lucene.util.Attribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with Attribute

use of org.apache.lucene.util.Attribute in project commons by twitter.

the class TokenizerUsageExample method main.

/**
 * Demonstrates two ways of consuming tokens from a {@code DefaultTextTokenizer}:
 * pulling them one at a time from a {@code TwitterTokenStream}, and reading them
 * from a {@code TokenizedCharSequence}. For every entry of {@code famousTweets}
 * both listings are printed to standard output in the same format.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // This is the canonical way to create a token stream.
    DefaultTextTokenizer tokenizer = new DefaultTextTokenizer.Builder().setKeepPunctuation(true).build();
    TwitterTokenStream stream = tokenizer.getDefaultTokenStream();
    // We're going to ask the token stream what type of attributes it makes available. "Attributes"
    // can be understood as "annotations" on the original text.
    System.out.println("Attributes available:");
    Iterator<Class<? extends Attribute>> iter = stream.getAttributeClassesIterator();
    while (iter.hasNext()) {
        Class<? extends Attribute> c = iter.next();
        System.out.println(" - " + c.getCanonicalName());
    }
    System.out.println("");
    // We're now going to iterate through a few tweets and tokenize each in turn.
    for (String tweet : famousTweets) {
        // We're first going to demonstrate the "token-by-token" method of consuming tweets.
        System.out.println("Processing: " + tweet);
        // Reset the token stream to process new input.
        stream.reset(tweet);
        // Now we're going to consume tokens from the stream.
        int tokenCnt = 0;
        while (stream.incrementToken()) {
            // CharSequenceTermAttribute holds the actual token text. This is preferred over
            // TermAttribute because it avoids creating new String objects.
            CharSequenceTermAttribute termAttribute = stream.getAttribute(CharSequenceTermAttribute.class);
            // TokenTypeAttribute holds, as you'd expect, the type of the token.
            TokenTypeAttribute typeAttribute = stream.getAttribute(TokenTypeAttribute.class);
            // Print (start, end) where end = offset + length, matching the
            // TokenizedCharSequence listing below. The earlier "length - offset"
            // expression disagreed with that listing for any non-zero offset.
            System.out.println(String.format("token %2d (%3d, %3d) type: %12s, token: '%s'", tokenCnt, termAttribute.getOffset(), termAttribute.getOffset() + termAttribute.getLength(), typeAttribute.getType().name, termAttribute.getTermCharSequence()));
            tokenCnt++;
        }
        System.out.println("");
        // We're now going to demonstrate the TokenizedCharSequence API.
        // This should produce exactly the same result as above.
        tokenCnt = 0;
        System.out.println("Processing: " + tweet);
        TokenizedCharSequence tokSeq = tokenizer.tokenize(tweet);
        for (Token tok : tokSeq.getTokens()) {
            System.out.println(String.format("token %2d (%3d, %3d) type: %12s, token: '%s'", tokenCnt, tok.getOffset(), tok.getOffset() + tok.getLength(), tok.getType().name, tok.getTerm()));
            tokenCnt++;
        }
        System.out.println("");
    }
}
Also used : CharSequenceTermAttribute(com.twitter.common.text.token.attribute.CharSequenceTermAttribute) TokenTypeAttribute(com.twitter.common.text.token.attribute.TokenTypeAttribute) Attribute(org.apache.lucene.util.Attribute) Token(com.twitter.common.text.token.TokenizedCharSequence.Token) DefaultTextTokenizer(com.twitter.common.text.DefaultTextTokenizer) CharSequenceTermAttribute(com.twitter.common.text.token.attribute.CharSequenceTermAttribute) TokenizedCharSequence(com.twitter.common.text.token.TokenizedCharSequence) TokenTypeAttribute(com.twitter.common.text.token.attribute.TokenTypeAttribute) TwitterTokenStream(com.twitter.common.text.token.TwitterTokenStream)

Example 3 with Attribute

use of org.apache.lucene.util.Attribute in project lucene-solr by apache.

the class JsonPreAnalyzedParser method toFormattedString.

/**
 * Renders a field as JSON. The top-level object holds the version, the stored
 * string and/or Base64-encoded binary value (only when the field type is
 * stored), and, if the field has a token stream, a list with one map per token.
 *
 * <p>Each token map carries the term plus one entry per recognised attribute
 * (flags as hex, start/end offsets, Base64 payload, position increment, type).
 * Unrecognised attributes are stored under their class name with their
 * {@code toString()} value.
 *
 * @param f the field to serialize
 * @return the JSON text produced by {@code JSONUtil.toJSON}
 * @throws IOException declared because {@code incrementToken()} is called on the stream
 */
@Override
public String toFormattedString(Field f) throws IOException {
    final Map<String, Object> root = new LinkedHashMap<>();
    root.put(VERSION_KEY, VERSION);

    if (f.fieldType().stored()) {
        final String stored = f.stringValue();
        if (stored != null) {
            root.put(STRING_KEY, stored);
        }
        final BytesRef binary = f.binaryValue();
        if (binary != null) {
            root.put(BINARY_KEY, Base64.byteArrayToBase64(binary.bytes, binary.offset, binary.length));
        }
    }

    final TokenStream stream = f.tokenStreamValue();
    if (stream == null) {
        // No token stream: the tokens key is omitted entirely.
        return JSONUtil.toJSON(root, -1);
    }

    final List<Map<String, Object>> tokenList = new LinkedList<>();
    while (stream.incrementToken()) {
        final Map<String, Object> entry = new TreeMap<>();
        String charTerm = null;
        String bytesTerm = null;

        for (Iterator<Class<? extends Attribute>> classes = stream.getAttributeClassesIterator(); classes.hasNext();) {
            final Class<? extends Attribute> type = classes.next();
            final Attribute attribute = stream.getAttribute(type);
            if (attribute == null) {
                continue;
            }

            if (type.isAssignableFrom(CharTermAttribute.class)) {
                final CharTermAttribute chars = (CharTermAttribute) attribute;
                charTerm = new String(chars.buffer(), 0, chars.length());
            } else if (type.isAssignableFrom(TermToBytesRefAttribute.class)) {
                final TermToBytesRefAttribute bytes = (TermToBytesRefAttribute) attribute;
                bytesTerm = bytes.getBytesRef().utf8ToString();
            } else if (type.isAssignableFrom(FlagsAttribute.class)) {
                final FlagsAttribute flags = (FlagsAttribute) attribute;
                entry.put(FLAGS_KEY, Integer.toHexString(flags.getFlags()));
            } else if (type.isAssignableFrom(OffsetAttribute.class)) {
                final OffsetAttribute offsets = (OffsetAttribute) attribute;
                entry.put(OFFSET_START_KEY, offsets.startOffset());
                entry.put(OFFSET_END_KEY, offsets.endOffset());
            } else if (type.isAssignableFrom(PayloadAttribute.class)) {
                final BytesRef payload = ((PayloadAttribute) attribute).getPayload();
                // Absent or empty payloads are left out of the token map.
                if (payload != null && payload.length > 0) {
                    entry.put(PAYLOAD_KEY, Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
                }
            } else if (type.isAssignableFrom(PositionIncrementAttribute.class)) {
                final PositionIncrementAttribute posIncr = (PositionIncrementAttribute) attribute;
                entry.put(POSINCR_KEY, posIncr.getPositionIncrement());
            } else if (type.isAssignableFrom(TypeAttribute.class)) {
                entry.put(TYPE_KEY, ((TypeAttribute) attribute).type());
            } else {
                // Unknown attribute: keyed by its class name.
                entry.put(type.getName(), attribute.toString());
            }
        }

        // The char-based term wins over the bytes-based one when both exist.
        final String term = (charTerm != null) ? charTerm : bytesTerm;
        if (term != null && !term.isEmpty()) {
            entry.put(TOKEN_KEY, term);
        }
        tokenList.add(entry);
    }
    root.put(TOKENS_KEY, tokenList);
    return JSONUtil.toJSON(root, -1);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) Attribute(org.apache.lucene.util.Attribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) TreeMap(java.util.TreeMap) LinkedList(java.util.LinkedList) LinkedHashMap(java.util.LinkedHashMap) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) TermToBytesRefAttribute(org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) TreeMap(java.util.TreeMap) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

Attribute (org.apache.lucene.util.Attribute)3 TokenStream (org.apache.lucene.analysis.TokenStream)2 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)2 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)2 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)2 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)2 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)2 TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute)2 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)2 BytesRef (org.apache.lucene.util.BytesRef)2 DefaultTextTokenizer (com.twitter.common.text.DefaultTextTokenizer)1 TokenizedCharSequence (com.twitter.common.text.token.TokenizedCharSequence)1 Token (com.twitter.common.text.token.TokenizedCharSequence.Token)1 TwitterTokenStream (com.twitter.common.text.token.TwitterTokenStream)1 CharSequenceTermAttribute (com.twitter.common.text.token.attribute.CharSequenceTermAttribute)1 TokenTypeAttribute (com.twitter.common.text.token.attribute.TokenTypeAttribute)1 LinkedHashMap (java.util.LinkedHashMap)1 LinkedList (java.util.LinkedList)1 Map (java.util.Map)1 TreeMap (java.util.TreeMap)1