Use of com.twitter.common.text.TextTokenizer in the commons project by Twitter.
From the class TokenStreamSerializerTest, method testIncompatibleStreams.
/**
 * Serializes a token stream with one attribute-serializer ordering and then
 * tries to deserialize the bytes with a serializer whose last two attribute
 * serializers are swapped. The test passes only if that attempt raises
 * {@link TokenStreamSerializer.VersionMismatchException}.
 */
@Test
public void testIncompatibleStreams() throws Exception {
  final String input = "Test that incompatible streams are - actually -incompatible.";

  TextTokenizer textTokenizer =
      new DefaultTextTokenizer.Builder()
          .setKeepPunctuation(false)
          .build();
  TwitterTokenStream tokens = textTokenizer.getDefaultTokenStream();
  tokens.reset(input);

  // Writer side: term, then token type, then position increment.
  TokenStreamSerializer writer =
      TokenStreamSerializer.builder()
          .add(new CharSequenceTermAttributeSerializer())
          .add(new TokenTypeAttributeSerializer())
          .add(new PositionIncrementAttributeSerializer())
          .build();
  byte[] serialized = writer.serialize(tokens);

  // Reader side: same attribute serializers, but the last two are swapped.
  TokenStreamSerializer reorderedReader =
      TokenStreamSerializer.builder()
          .add(new CharSequenceTermAttributeSerializer())
          .add(new PositionIncrementAttributeSerializer())
          .add(new TokenTypeAttributeSerializer())
          .build();

  boolean mismatchDetected;
  try {
    reorderedReader.deserialize(serialized, input);
    // Reaching this line means deserialization did not reject the data.
    mismatchDetected = false;
  } catch (TokenStreamSerializer.VersionMismatchException e) {
    mismatchDetected = true;
  }

  assertTrue("The expected exception was not thrown!", mismatchDetected);
}
Use of com.twitter.common.text.TextTokenizer in the commons project by Twitter.
From the class TokenStreamSerializerTest, method testSerialization.
/**
 * Round-trips a token stream through {@link TokenStreamSerializer} and checks
 * that the deserialized stream yields the same tokens as the original, both
 * on the first pass and again after each stream has been reset.
 */
@Test
public void testSerialization() throws Exception {
  final String input = "Hello, this is a test";

  TextTokenizer textTokenizer =
      new DefaultTextTokenizer.Builder()
          .setKeepPunctuation(false)
          .build();
  TwitterTokenStream original = textTokenizer.getDefaultTokenStream();
  original.reset(input);

  TokenStreamSerializer codec =
      TokenStreamSerializer.builder()
          .add(new CharSequenceTermAttributeSerializer())
          .add(new TokenTypeAttributeSerializer())
          .add(new PositionIncrementAttributeSerializer())
          .build();
  byte[] serialized = codec.serialize(original);
  TwitterTokenStream restored = codec.deserialize(serialized, input);

  // Two passes: the second one verifies that resetting both streams
  // reproduces the same token sequence.
  for (int pass = 0; pass < 2; ++pass) {
    original.reset(input);

    while (original.incrementToken()) {
      // The restored stream must have a token for every original token.
      assertTrue(restored.incrementToken());

      String expectedState = original.reflectAsString(true);
      String actualState = restored.reflectAsString(true);
      assertEquals(expectedState, actualState);
    }

    // The restored stream must be exhausted together with the original.
    assertFalse(restored.incrementToken());
    restored.reset(null);
  }
}
Aggregations