Search in sources :

Example 1 with TokenTypeAttribute

use of com.twitter.common.text.token.attribute.TokenTypeAttribute in project commons by twitter.

the class TokenizerUsageExample method main.

/**
 * Demonstrates two ways of tokenizing text with {@link DefaultTextTokenizer}: consuming a
 * {@link TwitterTokenStream} token by token, and building a {@link TokenizedCharSequence}
 * in a single call.
 *
 * <p>For every entry of {@code famousTweets} both passes print one line per token containing
 * the token index, its offset, its end position (offset + length), its type name and its text,
 * so the two listings can be compared line by line.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    // Both passes share one layout so that their output is directly comparable.
    final String tokenLineFormat = "token %2d (%3d, %3d) type: %12s, token: '%s'";
    // This is the canonical way to create a token stream.
    DefaultTextTokenizer tokenizer = new DefaultTextTokenizer.Builder().setKeepPunctuation(true).build();
    TwitterTokenStream stream = tokenizer.getDefaultTokenStream();
    // We're going to ask the token stream what type of attributes it makes available. "Attributes"
    // can be understood as "annotations" on the original text.
    System.out.println("Attributes available:");
    Iterator<Class<? extends Attribute>> iter = stream.getAttributeClassesIterator();
    while (iter.hasNext()) {
        Class<? extends Attribute> c = iter.next();
        System.out.println(" - " + c.getCanonicalName());
    }
    System.out.println();
    // We're now going to iterate through a few tweets and tokenize each in turn.
    for (String tweet : famousTweets) {
        // We're first going to demonstrate the "token-by-token" method of consuming tweets.
        System.out.println("Processing: " + tweet);
        // Reset the token stream to process new input.
        stream.reset(tweet);
        // Look the attributes up once per tweet rather than once per token; they are read
        // again after every successful incrementToken() call below.
        // CharSequenceTermAttribute holds the actual token text. This is preferred over
        // TermAttribute because it avoids creating new String objects.
        CharSequenceTermAttribute termAttribute = stream.getAttribute(CharSequenceTermAttribute.class);
        // TokenTypeAttribute holds, as you'd expect, the type of the token.
        TokenTypeAttribute typeAttribute = stream.getAttribute(TokenTypeAttribute.class);
        // Now we're going to consume tokens from the stream.
        int tokenCnt = 0;
        while (stream.incrementToken()) {
            // The second number is the end position, offset + length, exactly as in the
            // TokenizedCharSequence pass below (it used to be length - offset, which made the
            // two listings disagree).
            System.out.println(String.format(tokenLineFormat, tokenCnt, termAttribute.getOffset(), termAttribute.getOffset() + termAttribute.getLength(), typeAttribute.getType().name, termAttribute.getTermCharSequence()));
            tokenCnt++;
        }
        System.out.println();
        // We're now going to demonstrate the TokenizedCharSequence API.
        // This should produce exactly the same result as above.
        tokenCnt = 0;
        System.out.println("Processing: " + tweet);
        TokenizedCharSequence tokSeq = tokenizer.tokenize(tweet);
        for (Token tok : tokSeq.getTokens()) {
            System.out.println(String.format(tokenLineFormat, tokenCnt, tok.getOffset(), tok.getOffset() + tok.getLength(), tok.getType().name, tok.getTerm()));
            tokenCnt++;
        }
        System.out.println();
    }
}
Also used : CharSequenceTermAttribute(com.twitter.common.text.token.attribute.CharSequenceTermAttribute) TokenTypeAttribute(com.twitter.common.text.token.attribute.TokenTypeAttribute) Attribute(org.apache.lucene.util.Attribute) Token(com.twitter.common.text.token.TokenizedCharSequence.Token) DefaultTextTokenizer(com.twitter.common.text.DefaultTextTokenizer) TokenizedCharSequence(com.twitter.common.text.token.TokenizedCharSequence) TwitterTokenStream(com.twitter.common.text.token.TwitterTokenStream)

Example 2 with TokenTypeAttribute

use of com.twitter.common.text.token.attribute.TokenTypeAttribute in project commons by twitter.

the class TokenizedCharSequence method createFrom.

/**
 * Consumes every token of the given stream and collects them into a
 * {@link TokenizedCharSequence}.
 *
 * <p>The term and token-type attributes are always read. The part-of-speech,
 * position-increment and token-group attributes are read only when the stream has them;
 * otherwise {@code Token.DEFAULT_PART_OF_SPEECH}, a position increment of {@code 1} and a
 * {@code null} group are used. A non-empty group without a sequence of its own is converted
 * by calling this method recursively on its token group stream.
 *
 * @param tokenizer the token stream to consume
 * @return the sequence built from the stream's tokens; built over the empty string when the
 *         stream yields no token
 */
public static final TokenizedCharSequence createFrom(TwitterTokenStream tokenizer) {
    final CharSequenceTermAttribute term = tokenizer.getAttribute(CharSequenceTermAttribute.class);
    final TokenTypeAttribute type = tokenizer.getAttribute(TokenTypeAttribute.class);
    // Optional attributes stay null when the stream does not carry them.
    final PartOfSpeechAttribute partOfSpeech = tokenizer.hasAttribute(PartOfSpeechAttribute.class)
            ? tokenizer.getAttribute(PartOfSpeechAttribute.class)
            : null;
    final PositionIncrementAttribute increment = tokenizer.hasAttribute(PositionIncrementAttribute.class)
            ? tokenizer.getAttribute(PositionIncrementAttribute.class)
            : null;
    final TokenGroupAttributeImpl group = tokenizer.hasAttribute(TokenGroupAttribute.class)
            ? (TokenGroupAttributeImpl) tokenizer.getAttribute(TokenGroupAttribute.class)
            : null;

    if (!tokenizer.incrementToken()) {
        // The stream produced no token at all: build over an empty string.
        return new TokenizedCharSequence.Builder("").build();
    }

    // The term attribute's char sequence is only read after the first incrementToken() call.
    final TokenizedCharSequence.Builder result = new TokenizedCharSequence.Builder(term.getCharSequence());
    do {
        result.addToken(
                term.getOffset(),
                term.getLength(),
                type.getType(),
                partOfSpeech != null ? partOfSpeech.getPOS() : Token.DEFAULT_PART_OF_SPEECH,
                increment != null ? increment.getPositionIncrement() : 1,
                group != null && !group.isEmpty()
                        ? (group.getSequence() != null
                                ? group.getSequence()
                                : createFrom(group.getTokenGroupStream()))
                        : null);
    } while (tokenizer.incrementToken());
    return result.build();
}
Also used : CharSequenceTermAttribute(com.twitter.common.text.token.attribute.CharSequenceTermAttribute) TokenGroupAttributeImpl(com.twitter.common.text.token.attribute.TokenGroupAttributeImpl) PartOfSpeechAttribute(com.twitter.common.text.token.attribute.PartOfSpeechAttribute) TokenTypeAttribute(com.twitter.common.text.token.attribute.TokenTypeAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 3 with TokenTypeAttribute

use of com.twitter.common.text.token.attribute.TokenTypeAttribute in project commons by twitter.

the class TokenTypeAttributeSerializerTest method serialize.

/**
 * Serializes a single token type through a {@code TokenTypeAttributeSerializer}.
 *
 * @param tokenType the type to set on the attribute before serializing
 * @return the bytes written to the in-memory output buffer
 * @throws IOException declared by the signature; propagated from the serializer calls
 */
private byte[] serialize(TokenType tokenType) throws IOException {
    AttributeSource source = new AttributeSource();
    source.addAttribute(TokenTypeAttribute.class).setType(tokenType);

    TokenTypeAttributeSerializer typeSerializer = new TokenTypeAttributeSerializer();
    typeSerializer.initialize(source, TokenStreamSerializer.CURRENT_VERSION);

    // Serialize into an in-memory buffer and hand its contents back to the caller.
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    typeSerializer.serialize(new TokenStreamSerializer.AttributeOutputStream(buffer));
    return buffer.toByteArray();
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) TokenTypeAttribute(com.twitter.common.text.token.attribute.TokenTypeAttribute) ByteArrayOutputStream(java.io.ByteArrayOutputStream)

Example 4 with TokenTypeAttribute

use of com.twitter.common.text.token.attribute.TokenTypeAttribute in project commons by twitter.

the class TokenTypeAttributeSerializerTest method deserialize.

/**
 * Runs a {@code TokenTypeAttributeSerializer} over the given bytes and reports the token type
 * found on the attribute afterwards.
 *
 * @param serialized the bytes to deserialize
 * @return the type held by the token-type attribute once deserialization has run
 * @throws IOException declared by the signature; propagated from the serializer calls
 */
private TokenType deserialize(byte[] serialized) throws IOException {
    AttributeSource source = new AttributeSource();
    // Register the attribute before the serializer is initialized against the source.
    TokenTypeAttribute restored = source.addAttribute(TokenTypeAttribute.class);

    TokenTypeAttributeSerializer typeSerializer = new TokenTypeAttributeSerializer();
    typeSerializer.initialize(source, TokenStreamSerializer.CURRENT_VERSION);

    typeSerializer.deserialize(
            new TokenStreamSerializer.AttributeInputStream(new ByteArrayInputStream(serialized)),
            null);
    return restored.getType();
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) ByteArrayInputStream(java.io.ByteArrayInputStream) TokenTypeAttribute(com.twitter.common.text.token.attribute.TokenTypeAttribute)

Aggregations

TokenTypeAttribute (com.twitter.common.text.token.attribute.TokenTypeAttribute): 4, CharSequenceTermAttribute (com.twitter.common.text.token.attribute.CharSequenceTermAttribute): 2, AttributeSource (org.apache.lucene.util.AttributeSource): 2, DefaultTextTokenizer (com.twitter.common.text.DefaultTextTokenizer): 1, TokenizedCharSequence (com.twitter.common.text.token.TokenizedCharSequence): 1, Token (com.twitter.common.text.token.TokenizedCharSequence.Token): 1, TwitterTokenStream (com.twitter.common.text.token.TwitterTokenStream): 1, PartOfSpeechAttribute (com.twitter.common.text.token.attribute.PartOfSpeechAttribute): 1, TokenGroupAttributeImpl (com.twitter.common.text.token.attribute.TokenGroupAttributeImpl): 1, ByteArrayInputStream (java.io.ByteArrayInputStream): 1, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1, PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 1, Attribute (org.apache.lucene.util.Attribute): 1