use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class TokenizerUsageExample method main.
/**
 * Demonstrates two ways of tokenizing text: consuming a {@code TwitterTokenStream}
 * token by token, and using the {@code TokenizedCharSequence} API.
 *
 * <p>For every entry of {@code famousTweets} the tokens are printed twice, once per API,
 * in the form {@code (start offset, end offset) type, text}. Both passes are meant to
 * print the same lines.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
  // Shared by both passes so that their output cannot drift apart.
  final String tokenFormat = "token %2d (%3d, %3d) type: %12s, token: '%s'";

  // This is the canonical way to create a token stream.
  DefaultTextTokenizer tokenizer =
      new DefaultTextTokenizer.Builder().setKeepPunctuation(true).build();
  TwitterTokenStream stream = tokenizer.getDefaultTokenStream();

  // We're going to ask the token stream what type of attributes it makes available. "Attributes"
  // can be understood as "annotations" on the original text.
  System.out.println("Attributes available:");
  Iterator<Class<? extends Attribute>> iter = stream.getAttributeClassesIterator();
  while (iter.hasNext()) {
    Class<? extends Attribute> c = iter.next();
    System.out.println(" - " + c.getCanonicalName());
  }
  System.out.println("");

  // We're now going to iterate through a few tweets and tokenize each in turn.
  for (String tweet : famousTweets) {
    // We're first going to demonstrate the "token-by-token" method of consuming tweets.
    System.out.println("Processing: " + tweet);
    // Reset the token stream to process new input.
    stream.reset(tweet);

    // Now we're going to consume tokens from the stream.
    int tokenCnt = 0;
    while (stream.incrementToken()) {
      // CharSequenceTermAttribute holds the actual token text. This is preferred over
      // TermAttribute because it avoids creating new String objects.
      CharSequenceTermAttribute termAttribute =
          stream.getAttribute(CharSequenceTermAttribute.class);
      // TokenTypeAttribute holds, as you'd expect, the type of the token.
      TokenTypeAttribute typeAttribute = stream.getAttribute(TokenTypeAttribute.class);

      // The second number is the end offset, i.e. start offset plus length. It was previously
      // computed as (length - offset), which disagreed with the TokenizedCharSequence pass
      // below and went negative for tokens far enough into the text.
      int startOffset = termAttribute.getOffset();
      int endOffset = startOffset + termAttribute.getLength();
      System.out.println(String.format(tokenFormat,
          tokenCnt, startOffset, endOffset,
          typeAttribute.getType().name, termAttribute.getTermCharSequence()));
      tokenCnt++;
    }
    System.out.println("");

    // We're now going to demonstrate the TokenizedCharSequence API.
    // This should produce exactly the same result as above.
    tokenCnt = 0;
    System.out.println("Processing: " + tweet);
    TokenizedCharSequence tokSeq = tokenizer.tokenize(tweet);
    for (Token tok : tokSeq.getTokens()) {
      System.out.println(String.format(tokenFormat,
          tokenCnt, tok.getOffset(), tok.getOffset() + tok.getLength(),
          tok.getType().name, tok.getTerm()));
      tokenCnt++;
    }
    System.out.println("");
  }
}
use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class TextTokenizer method tokenizeToStrings.
/**
 * Splits the given text into tokens and returns them as plain strings.
 *
 * <p>The input is fed to the stream returned by {@code getDefaultTokenStream()}, and the
 * stream's tokens are collected via {@code toStringList()}.
 *
 * @param input text to be tokenized; must not be {@code null}
 * @return the tokens of {@code input} as String objects
 */
public List<String> tokenizeToStrings(CharSequence input) {
  // Reject null before touching the token stream.
  Preconditions.checkNotNull(input);

  TwitterTokenStream stream = getDefaultTokenStream();
  stream.reset(input);
  List<String> tokens = stream.toStringList();
  return tokens;
}
use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class TokenStreamSerializerTest method testIncompatibleStreams.
/**
 * Serializes a token stream with one serializer and checks that deserializing it with a
 * serializer whose attribute serializers are registered in a different order raises
 * {@code TokenStreamSerializer.VersionMismatchException}.
 */
@Test
public void testIncompatibleStreams() throws Exception {
  final String text = "Test that incompatible streams are - actually -incompatible.";

  TextTokenizer tokenizer =
      new DefaultTextTokenizer.Builder()
          .setKeepPunctuation(false)
          .build();
  TwitterTokenStream tokenStream = tokenizer.getDefaultTokenStream();
  tokenStream.reset(text);

  // Writer order: term, token type, position increment.
  TokenStreamSerializer writer =
      TokenStreamSerializer.builder()
          .add(new CharSequenceTermAttributeSerializer())
          .add(new TokenTypeAttributeSerializer())
          .add(new PositionIncrementAttributeSerializer())
          .build();
  byte[] serialized = writer.serialize(tokenStream);

  // Reader order: same serializers, but the last two are swapped.
  TokenStreamSerializer reorderedReader =
      TokenStreamSerializer.builder()
          .add(new CharSequenceTermAttributeSerializer())
          .add(new PositionIncrementAttributeSerializer())
          .add(new TokenTypeAttributeSerializer())
          .build();

  boolean mismatchDetected = false;
  try {
    reorderedReader.deserialize(serialized, text);
  } catch (TokenStreamSerializer.VersionMismatchException e) {
    mismatchDetected = true;
  }

  assertTrue("The expected exception was not thrown!", mismatchDetected);
}
use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class PunctuationExceptionCombinerTest method testPunctuationExceptions.
/**
 * Checks the token lists produced by a {@code PunctuationExceptionCombiner} for inputs
 * that mix ordinary punctuation with runs of the heart character.
 */
@Test
public void testPunctuationExceptions() {
  TwitterTokenStream combiner =
      new PunctuationExceptionCombiner.Builder(tokenized).build();

  // Ordinary punctuation: each character comes out as its own token.
  combiner.reset("I .. exceptions!! ");
  assertEquals(
      ImmutableList.of("I", ".", ".", "exceptions", "!", "!"),
      combiner.toStringList());

  // A run of hearts comes out as a single token.
  combiner.reset("I ♥♥ exceptions");
  assertEquals(
      ImmutableList.of("I", "♥♥", "exceptions"),
      combiner.toStringList());

  // Hearts surrounded by dots: the dots stay separate, the hearts stay together.
  combiner.reset("I .♥♥. exceptions");
  assertEquals(
      ImmutableList.of("I", ".", "♥♥", ".", "exceptions"),
      combiner.toStringList());
}
use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class PunctuationExceptionCombinerTest method testPunctuationFilterDoesNotRemoveExceptionChars.
/**
 * Checks that a {@code PunctuationFilter} wrapped around a
 * {@code PunctuationExceptionCombiner} drops ordinary punctuation tokens but keeps the
 * heart-character token.
 */
@Test
public void testPunctuationFilterDoesNotRemoveExceptionChars() {
  TwitterTokenStream combiner =
      new PunctuationExceptionCombiner.Builder(tokenized).build();
  TwitterTokenStream filtered = new PunctuationFilter(combiner);

  // Dots and exclamation marks are filtered out.
  filtered.reset("I .. exceptions!! ");
  assertEquals(
      ImmutableList.of("I", "exceptions"),
      filtered.toStringList());

  // The hearts survive the filter; the trailing exclamation marks do not.
  filtered.reset("I ♥♥ exceptions!!");
  assertEquals(
      ImmutableList.of("I", "♥♥", "exceptions"),
      filtered.toStringList());
}
Aggregations