Use of com.twitter.common.text.token.TwitterTokenStream in project commons by Twitter.
Class PunctuationDetectorTest, method testAllPunctuation.
/**
 * Feeds ten space-separated middle-dot variants through a PunctuationDetector and checks that
 * each resulting token is typed as punctuation.
 */
@Test
public void testAllPunctuation() {
  TwitterTokenStream tokenizer =
      new RegexTokenizer.Builder().setDelimiterPattern(Pattern.compile(" ")).build();
  tokenizer.reset("When I was young , I liked insects .");
  PunctuationDetector detector = new PunctuationDetector.Builder(tokenizer).build();

  // Variations of middle dots.
  detector.reset("· · • ∙ ⋅ ・ ・ ● ○ ◎");

  int punctuationTokens = 0;
  for (; detector.incrementToken(); punctuationTokens++) {
    assertEquals(TokenType.PUNCTUATION, detector.type());
  }

  // One token per dot variant in the input above.
  assertEquals(10, punctuationTokens);
}
Use of com.twitter.common.text.token.TwitterTokenStream in project commons by Twitter.
Class TokenStreamSerializer, method deserialize.
/**
 * Variant of deserialize that reads from a ByteArrayInputStream.
 *
 * <p>The returned stream does no tokenization of its own: each call to
 * {@code incrementToken()} deserializes the attributes of the next stored token from
 * {@code bais}, and {@code reset()} rewinds the underlying streams and re-reads the header
 * (version/fingerprint and token count). The stream is reset once before it is returned.
 *
 * @param bais the serialized token stream; it is rewound via {@code reset()} on every stream reset
 * @param charSequence the character sequence handed to attribute deserialization
 * @return a TwitterTokenStream that replays the serialized tokens
 * @throws IOException declared by the signature; I/O failures inside the returned stream's
 *     {@code reset()} and {@code incrementToken()} surface as unchecked exceptions instead
 */
public final TwitterTokenStream deserialize(final ByteArrayInputStream bais, final CharSequence charSequence) throws IOException {
  final AttributeInputStream input = new AttributeInputStream(bais);
  TwitterTokenStream replayStream = new TwitterTokenStream() {
    CharSequence chars = charSequence;
    // Everything below is (re)populated by reset().
    int tokensRead;
    Version version;
    int tokenCount;

    @Override
    public final boolean incrementToken() {
      if (tokensRead >= tokenCount) {
        return false;
      }
      // The counter advances before deserialization, so a failed read still consumes a slot.
      tokensRead++;
      try {
        deserializeAttributes(input, chars);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return true;
    }

    @Override
    public void reset() {
      CharSequence requested = inputCharSequence();
      // Accepts either no input or the very same sequence instance this stream was built with.
      boolean acceptable = requested == null || requested == chars;
      Preconditions.checkArgument(acceptable, "this TwitterTokenStream does not do actual tokenization and only supports reset(null)");
      try {
        // Rewind both layers, then re-read the header that precedes the token data.
        input.reset();
        bais.reset();
        version = readVersionAndCheckFingerprint(input, attributeSerializersFingerprint);
        tokenCount = input.readVInt();
        for (AttributeSerializer deserializer : attributeSerializers) {
          deserializer.initialize(this, version);
        }
      } catch (IOException e) {
        throw new IllegalStateException("Unexpected exception, but...", e);
      }
      tokensRead = 0;
    }
  };
  replayStream.reset(null);
  return replayStream;
}
Use of com.twitter.common.text.token.TwitterTokenStream in project commons by Twitter.
Class PunctuationExceptionCombinerTest, method testAddingPunctuationExceptions.
/**
 * Checks the token lists produced by PunctuationExceptionCombiner for two different sets of
 * exception characters.
 */
@Test
public void testAddingPunctuationExceptions() {
  // Only '.' registered as an exception character.
  TwitterTokenStream dotOnly =
      new PunctuationExceptionCombiner.Builder(tokenized).addExceptionChars(".").build();
  dotOnly.reset("I .. exceptions!! ");
  ImmutableList<String> expectedDotOnly = ImmutableList.of("I", "..", "exceptions", "!", "!");
  assertEquals(expectedDotOnly, dotOnly.toStringList());

  // Both '.' and '!' registered as exception characters.
  TwitterTokenStream dotAndBang =
      new PunctuationExceptionCombiner.Builder(tokenized).addExceptionChars(".!").build();
  dotAndBang.reset("I ..♥♥ exceptions!! ");
  ImmutableList<String> expectedDotAndBang = ImmutableList.of("I", "..♥♥", "exceptions", "!!");
  assertEquals(expectedDotAndBang, dotAndBang.toStringList());
}
Use of com.twitter.common.text.token.TwitterTokenStream in project commons by Twitter.
Class PunctuationDetectorTest, method testCombiningMarks.
/**
 * Exercises PunctuationDetector on input that mixes the token "word" with standalone combining
 * marks, first with the builder defaults and then with useCombiningMarks(false).
 */
@Test
public void testCombiningMarks() {
  TwitterTokenStream tokenizer =
      new RegexTokenizer.Builder().setDelimiterPattern(Pattern.compile(" ")).build();
  // Three "word" tokens separated by runs of five space-delimited combining marks, written as
  // escape sequences so the literal stays readable.
  tokenizer.reset("word \u0301 \u030b \u0314 \u031b \u0327 word \u0650 \u0653 \u093e \u0948 \u0e34 word");

  // Test behavior with regard to combining marks in various languages
  PunctuationDetector detector = new PunctuationDetector.Builder(tokenizer).build();
  int tokensSeen = 0;
  while (detector.incrementToken()) {
    boolean isWord = "word".equals(detector.term().toString());
    if (!isWord) {
      assertEquals(TokenType.PUNCTUATION, detector.type());
    } else {
      assertFalse(TokenType.PUNCTUATION.equals(detector.type()));
    }
    tokensSeen++;
  }
  assertEquals(13, tokensSeen);

  // Test with combining marks not treated as punctuation
  tokenizer.reset();
  detector = new PunctuationDetector.Builder(tokenizer).useCombiningMarks(false).build();
  tokensSeen = 0;
  for (; detector.incrementToken(); tokensSeen++) {
    assertFalse(TokenType.PUNCTUATION.equals(detector.type()));
  }
  assertEquals(13, tokensSeen);
}
Use of com.twitter.common.text.token.TwitterTokenStream in project commons by Twitter.
Class PunctuationDetectorTest, method testNoPunctuationDetector.
/**
 * Baseline case: a bare RegexTokenizer, with no PunctuationDetector wrapped around it, never
 * reports a token as punctuation even when the token is "," or ".".
 */
@Test
public void testNoPunctuationDetector() {
  TwitterTokenStream tokenizer =
      new RegexTokenizer.Builder().setDelimiterPattern(Pattern.compile(" ")).build();
  tokenizer.reset("When I was young , I liked insects .");

  int tokensSeen = 0;
  for (; tokenizer.incrementToken(); tokensSeen++) {
    assertFalse(TokenType.PUNCTUATION.equals(tokenizer.type()));
  }

  // Make sure we've consumed the correct number of tokens.
  assertEquals(9, tokensSeen);
}
Aggregations