use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class PunctuationDetectorTest method testPunctuationDetector.
/**
 * Verifies that {@link PunctuationDetector} assigns {@code TokenType.PUNCTUATION} to the
 * expected punctuation tokens and some other type to every remaining token, first for an
 * English sentence and then for a Japanese one fed through the same detector.
 */
@Test
public void testPunctuationDetector() {
  // Compare with testNoPunctuationDetector(): now we add a punctuation detector, and the
  // punctuation characters have the correct types.
  TwitterTokenStream regexTokenizerStream =
      new RegexTokenizer.Builder().setDelimiterPattern(Pattern.compile(" ")).build();
  regexTokenizerStream.reset("When I was young , I liked insects .");
  PunctuationDetector stream = new PunctuationDetector.Builder(regexTokenizerStream).build();

  int cnt = 0;
  while (stream.incrementToken()) {
    String token = stream.term().toString();
    if (",".equals(token) || ".".equals(token)) {
      assertEquals(TokenType.PUNCTUATION, stream.type());
    } else {
      assertFalse(TokenType.PUNCTUATION.equals(stream.type()));
    }
    cnt++;
  }
  assertEquals(9, cnt);

  // Additional examples in jp:
  // NOTE(review): the set below contains ASCII "[" while the input opens with "「", and the
  // input as written appears to split into 12 space-separated tokens while 11 are asserted.
  // Confirm against the tokenizer's actual output before changing either value.
  // Built once up front instead of on every loop iteration.
  ImmutableSet<String> jpPunctuation = ImmutableSet.of("[", "、", "。", "」");
  stream.reset("「 今日 は いい 天気 、 明日 も いい 天気 。 」");
  cnt = 0;
  while (stream.incrementToken()) {
    if (jpPunctuation.contains(stream.term().toString())) {
      // Expected value first, matching the other assertions, so failure messages read correctly.
      assertEquals(TokenType.PUNCTUATION, stream.type());
    } else {
      assertFalse(TokenType.PUNCTUATION.equals(stream.type()));
    }
    cnt++;
  }
  assertEquals(11, cnt);
}
use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class PunctuationDetectorTest method testNewlineIsPunctuation.
/**
 * Checks that the input yields exactly one token equal to {@code "\n"} and that this token
 * is typed as {@code TokenType.PUNCTUATION}.
 */
@Test
public void testNewlineIsPunctuation() {
  TwitterTokenStream tokenizer =
      new RegexTokenizer.Builder()
          .setDelimiterPattern(Pattern.compile(" "))
          .build();
  tokenizer.reset("Newline \n as punctuation");
  PunctuationDetector detector = new PunctuationDetector.Builder(tokenizer).build();

  int newlineTokens = 0;
  while (detector.incrementToken()) {
    String term = detector.term().toString();
    // Only newline tokens are of interest; skip everything else.
    if (!"\n".equals(term)) {
      continue;
    }
    newlineTokens++;
    assertEquals(TokenType.PUNCTUATION, detector.type());
  }
  assertEquals(1, newlineTokens);
}
use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.
the class TokenStreamSerializerTest method testSerialization.
/**
 * Round-trips a token stream through {@link TokenStreamSerializer} and checks that the
 * deserialized stream produces the same tokens as the source stream, on two consecutive
 * passes separated by a reset.
 */
@Test
public void testSerialization() throws Exception {
  final String text = "Hello, this is a test";

  TextTokenizer tokenizer =
      new DefaultTextTokenizer.Builder().setKeepPunctuation(false).build();
  TwitterTokenStream source = tokenizer.getDefaultTokenStream();
  source.reset(text);

  TokenStreamSerializer serializer =
      TokenStreamSerializer.builder()
          .add(new CharSequenceTermAttributeSerializer())
          .add(new TokenTypeAttributeSerializer())
          .add(new PositionIncrementAttributeSerializer())
          .build();
  byte[] serialized = serializer.serialize(source);
  TwitterTokenStream restored = serializer.deserialize(serialized, text);

  // Two passes, so that we also see the same tokens again after resetting both streams.
  int pass = 0;
  while (pass < 2) {
    source.reset(text);
    while (source.incrementToken()) {
      // The restored stream must advance in lockstep and expose identical attributes.
      assertTrue(restored.incrementToken());
      assertEquals(source.reflectAsString(true), restored.reflectAsString(true));
    }
    // The restored stream must be exhausted at the same point as the source.
    assertFalse(restored.incrementToken());
    restored.reset(null);
    pass++;
  }
}
Aggregations