Use of com.twitter.common.text.token.attribute.TokenTypeAttribute in the project commons by Twitter.
Class TokenizerUsageExample, method main.
/**
 * Demonstrates two ways of tokenizing text with {@code DefaultTextTokenizer}:
 * consuming a {@code TwitterTokenStream} token by token, and using the
 * {@code TokenizedCharSequence} API. Both passes print, for every token, its index,
 * start offset, end offset (offset + length), type name and text, so their output
 * is expected to be identical.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  // This is the canonical way to create a token stream.
  DefaultTextTokenizer tokenizer =
      new DefaultTextTokenizer.Builder().setKeepPunctuation(true).build();
  TwitterTokenStream stream = tokenizer.getDefaultTokenStream();

  // We're going to ask the token stream what type of attributes it makes available. "Attributes"
  // can be understood as "annotations" on the original text.
  System.out.println("Attributes available:");
  Iterator<Class<? extends Attribute>> iter = stream.getAttributeClassesIterator();
  while (iter.hasNext()) {
    Class<? extends Attribute> c = iter.next();
    System.out.println(" - " + c.getCanonicalName());
  }
  System.out.println();

  // We're now going to iterate through a few tweets and tokenize each in turn.
  for (String tweet : famousTweets) {
    // We're first going to demonstrate the "token-by-token" method of consuming tweets.
    System.out.println("Processing: " + tweet);

    // Reset the token stream to process new input.
    stream.reset(tweet);

    // Look the attributes up once per input rather than once per token; the same
    // pattern (fetch attributes, then call incrementToken()) is used by
    // TokenizedCharSequence.createFrom.
    // CharSequenceTermAttribute holds the actual token text. This is preferred over
    // TermAttribute because it avoids creating new String objects.
    CharSequenceTermAttribute termAttribute =
        stream.getAttribute(CharSequenceTermAttribute.class);
    // TokenTypeAttribute holds, as you'd expect, the type of the token.
    TokenTypeAttribute typeAttribute = stream.getAttribute(TokenTypeAttribute.class);

    // Now we're going to consume tokens from the stream.
    int tokenCnt = 0;
    while (stream.incrementToken()) {
      // The second position is the end offset (offset + length), matching what the
      // TokenizedCharSequence pass below prints for the same token.
      System.out.println(String.format("token %2d (%3d, %3d) type: %12s, token: '%s'",
          tokenCnt,
          termAttribute.getOffset(),
          termAttribute.getOffset() + termAttribute.getLength(),
          typeAttribute.getType().name,
          termAttribute.getTermCharSequence()));
      tokenCnt++;
    }
    System.out.println();

    // We're now going to demonstrate the TokenizedCharSequence API.
    // This should produce exactly the same result as above.
    tokenCnt = 0;
    System.out.println("Processing: " + tweet);
    TokenizedCharSequence tokSeq = tokenizer.tokenize(tweet);
    for (Token tok : tokSeq.getTokens()) {
      System.out.println(String.format("token %2d (%3d, %3d) type: %12s, token: '%s'",
          tokenCnt,
          tok.getOffset(),
          tok.getOffset() + tok.getLength(),
          tok.getType().name,
          tok.getTerm()));
      tokenCnt++;
    }
    System.out.println();
  }
}
Use of com.twitter.common.text.token.attribute.TokenTypeAttribute in the project commons by Twitter.
Class TokenizedCharSequence, method createFrom.
/**
 * Consumes the given token stream and collects every token it emits into a
 * {@code TokenizedCharSequence}.
 *
 * <p>The term and token-type attributes are always read. Part-of-speech,
 * position-increment and token-group attributes are read only when the stream
 * reports having them; otherwise {@code Token.DEFAULT_PART_OF_SPEECH}, an increment
 * of {@code 1} and a {@code null} group are used respectively.
 *
 * @param tokenizer the token stream to drain
 * @return the tokenized sequence; built over the empty string when the stream
 *     yields no tokens
 */
public static final TokenizedCharSequence createFrom(TwitterTokenStream tokenizer) {
  // Attributes that are always fetched.
  CharSequenceTermAttribute term = tokenizer.getAttribute(CharSequenceTermAttribute.class);
  TokenTypeAttribute type = tokenizer.getAttribute(TokenTypeAttribute.class);

  // Optional attributes; each stays null when the stream does not carry it.
  PartOfSpeechAttribute partOfSpeech = tokenizer.hasAttribute(PartOfSpeechAttribute.class)
      ? tokenizer.getAttribute(PartOfSpeechAttribute.class)
      : null;
  PositionIncrementAttribute increment =
      tokenizer.hasAttribute(PositionIncrementAttribute.class)
          ? tokenizer.getAttribute(PositionIncrementAttribute.class)
          : null;
  TokenGroupAttributeImpl group = tokenizer.hasAttribute(TokenGroupAttribute.class)
      ? (TokenGroupAttributeImpl) tokenizer.getAttribute(TokenGroupAttribute.class)
      : null;

  // No tokens at all: build over an empty string.
  if (!tokenizer.incrementToken()) {
    return new TokenizedCharSequence.Builder("").build();
  }

  // The term attribute's char sequence is only read after the first successful
  // incrementToken(), so the builder is created here rather than up front.
  TokenizedCharSequence.Builder result =
      new TokenizedCharSequence.Builder(term.getCharSequence());
  do {
    result.addToken(
        term.getOffset(),
        term.getLength(),
        type.getType(),
        partOfSpeech == null ? Token.DEFAULT_PART_OF_SPEECH : partOfSpeech.getPOS(),
        increment == null ? 1 : increment.getPositionIncrement(),
        // Missing or empty group -> null; otherwise use the group's sequence when it
        // has one, else tokenize the group's own stream recursively.
        group == null || group.isEmpty()
            ? null
            : (group.getSequence() == null
                ? createFrom(group.getTokenGroupStream())
                : group.getSequence()));
  } while (tokenizer.incrementToken());

  return result.build();
}
Use of com.twitter.common.text.token.attribute.TokenTypeAttribute in the project commons by Twitter.
Class TokenTypeAttributeSerializerTest, method serialize.
/**
 * Serializes a single token type with {@code TokenTypeAttributeSerializer}.
 *
 * @param tokenType the token type to set on a fresh {@code TokenTypeAttribute}
 * @return the bytes written by the serializer
 * @throws IOException declared because the serializer calls may throw it
 */
private byte[] serialize(TokenType tokenType) throws IOException {
  // Fresh attribute source carrying only the token type under test.
  AttributeSource source = new AttributeSource();
  source.addAttribute(TokenTypeAttribute.class).setType(tokenType);

  TokenTypeAttributeSerializer typeSerializer = new TokenTypeAttributeSerializer();
  typeSerializer.initialize(source, TokenStreamSerializer.CURRENT_VERSION);

  // Serialize into an in-memory buffer and hand back its contents.
  ByteArrayOutputStream bytes = new ByteArrayOutputStream();
  typeSerializer.serialize(new TokenStreamSerializer.AttributeOutputStream(bytes));
  return bytes.toByteArray();
}
Use of com.twitter.common.text.token.attribute.TokenTypeAttribute in the project commons by Twitter.
Class TokenTypeAttributeSerializerTest, method deserialize.
/**
 * Deserializes a token type from bytes using {@code TokenTypeAttributeSerializer}.
 *
 * @param serialized the bytes to feed to the serializer
 * @return the type held by the {@code TokenTypeAttribute} after deserialization
 * @throws IOException declared because the serializer calls may throw it
 */
private TokenType deserialize(byte[] serialized) throws IOException {
  // Fresh attribute source; the serializer is initialized against it and the
  // result is read back from the attribute registered here.
  AttributeSource source = new AttributeSource();
  TokenTypeAttribute typeAttribute = source.addAttribute(TokenTypeAttribute.class);

  TokenTypeAttributeSerializer typeSerializer = new TokenTypeAttributeSerializer();
  typeSerializer.initialize(source, TokenStreamSerializer.CURRENT_VERSION);

  typeSerializer.deserialize(
      new TokenStreamSerializer.AttributeInputStream(new ByteArrayInputStream(serialized)),
      null);
  return typeAttribute.getType();
}
Aggregations