Search in sources :

Example 6 with TwitterTokenStream

use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.

the class PunctuationDetectorTest method testAllPunctuation.

/**
 * Feeds ten space-separated middle-dot style characters through a
 * PunctuationDetector and expects every resulting token to carry the
 * PUNCTUATION type.
 */
@Test
public void testAllPunctuation() {
    TwitterTokenStream tokenizer = new RegexTokenizer.Builder().setDelimiterPattern(Pattern.compile(" ")).build();
    // NOTE(review): this sentence is replaced by the detector-level reset below before any
    // token is read; the call looks redundant, but confirm before removing it.
    tokenizer.reset("When I was young , I liked insects .");
    PunctuationDetector detector = new PunctuationDetector.Builder(tokenizer).build();
    // Input under test: variations of middle dots.
    detector.reset("· · • ∙ ⋅ ・ ・ ● ○ ◎");
    int seen = 0;
    for (; detector.incrementToken(); seen++) {
        assertEquals(TokenType.PUNCTUATION, detector.type());
    }
    // One token per character in the input above.
    assertEquals(10, seen);
}
Also used : TwitterTokenStream(com.twitter.common.text.token.TwitterTokenStream) Test(org.junit.Test)

Example 7 with TwitterTokenStream

use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.

the class TokenStreamSerializer method deserialize.

/**
 * Variant of deserialize that reads from a ByteArrayInputStream.
 *
 * <p>The returned stream performs no tokenization of its own: each call to
 * {@code incrementToken()} deserializes the attributes of the next token
 * from {@code bais}. The stream is reset once (with {@code null}) before it
 * is handed back to the caller.
 *
 * @param bais         byte stream to read the serialized tokens from
 * @param charSequence character sequence passed to attribute deserialization
 * @return a token stream backed by the serialized data
 * @throws IOException declared by this method's signature
 */
public final TwitterTokenStream deserialize(final ByteArrayInputStream bais, final CharSequence charSequence) throws IOException {
    final AttributeInputStream input = new AttributeInputStream(bais);
    TwitterTokenStream result = new TwitterTokenStream() {

        CharSequence text = charSequence;

        // The remaining fields get their values in reset().

        // Number of tokens handed out since the last reset.
        int tokenIndex;

        // Version obtained from the stream header during reset().
        Version streamVersion;

        // Token count read from the stream during reset().
        int tokenCount;

        @Override
        public final boolean incrementToken() {
            if (tokenIndex >= tokenCount) {
                return false;
            }
            // The counter is advanced before deserializing, so it moves even if the read fails.
            tokenIndex++;
            try {
                deserializeAttributes(input, text);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            return true;
        }

        @Override
        public void reset() {
            final CharSequence requested = inputCharSequence();
            // Only a null input or the very same CharSequence instance is accepted.
            final boolean acceptable = requested == null || requested == text;
            Preconditions.checkArgument(acceptable, "this TwitterTokenStream does not do actual tokenization and only supports reset(null)");
            try {
                // Reset both stream layers, then re-read the header and token count.
                input.reset();
                bais.reset();
                streamVersion = readVersionAndCheckFingerprint(input, attributeSerializersFingerprint);
                tokenCount = input.readVInt();
                for (AttributeSerializer deserializer : attributeSerializers) {
                    deserializer.initialize(this, streamVersion);
                }
            } catch (IOException e) {
                throw new IllegalStateException("Unexpected exception, but...", e);
            }
            tokenIndex = 0;
        }
    };
    result.reset(null);
    return result;
}
Also used : TwitterTokenStream(com.twitter.common.text.token.TwitterTokenStream) IOException(java.io.IOException)

Example 8 with TwitterTokenStream

use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.

the class PunctuationExceptionCombinerTest method testAddingPunctuationExceptions.

/**
 * Exercises addExceptionChars on PunctuationExceptionCombiner, first with a
 * single exception character and then with two, and compares the resulting
 * token strings against the expected lists.
 */
@Test
public void testAddingPunctuationExceptions() {
    // Case 1: only "." is registered as an exception character.
    TwitterTokenStream dotOnly = new PunctuationExceptionCombiner.Builder(tokenized).addExceptionChars(".").build();
    dotOnly.reset("I .. exceptions!! ");
    assertEquals(ImmutableList.of("I", "..", "exceptions", "!", "!"), dotOnly.toStringList());

    // Case 2: both "." and "!" are registered as exception characters.
    TwitterTokenStream dotAndBang = new PunctuationExceptionCombiner.Builder(tokenized).addExceptionChars(".!").build();
    dotAndBang.reset("I ..♥♥ exceptions!! ");
    assertEquals(ImmutableList.of("I", "..♥♥", "exceptions", "!!"), dotAndBang.toStringList());
}
Also used : TwitterTokenStream(com.twitter.common.text.token.TwitterTokenStream) Test(org.junit.Test)

Example 9 with TwitterTokenStream

use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.

the class PunctuationDetectorTest method testCombiningMarks.

/**
 * Checks how PunctuationDetector types standalone combining marks, both with
 * its default configuration and with useCombiningMarks(false).
 */
@Test
public void testCombiningMarks() {
    TwitterTokenStream tokenizer = new RegexTokenizer.Builder().setDelimiterPattern(Pattern.compile(" ")).build();
    // The literal is written with Unicode escapes: three "word" tokens interleaved with
    // two runs of five standalone combining marks each (13 tokens in total).
    tokenizer.reset("word \u0301 \u030b \u0314 \u031b \u0327 word \u0650 \u0653 \u093e \u0948 \u0e34 word");

    // Pass 1: default detector, combining marks in various languages are expected to be
    // typed as punctuation while "word" is not.
    PunctuationDetector detector = new PunctuationDetector.Builder(tokenizer).build();
    int seen = 0;
    while (detector.incrementToken()) {
        final boolean isPlainWord = "word".equals(detector.term().toString());
        if (!isPlainWord) {
            assertEquals(TokenType.PUNCTUATION, detector.type());
        } else {
            assertFalse(TokenType.PUNCTUATION.equals(detector.type()));
        }
        seen++;
    }
    assertEquals(13, seen);

    // Pass 2: same input, but combining marks are not treated as punctuation, so no
    // token may be typed as punctuation.
    tokenizer.reset();
    detector = new PunctuationDetector.Builder(tokenizer).useCombiningMarks(false).build();
    seen = 0;
    while (detector.incrementToken()) {
        assertFalse(TokenType.PUNCTUATION.equals(detector.type()));
        seen++;
    }
    assertEquals(13, seen);
}
Also used : TwitterTokenStream(com.twitter.common.text.token.TwitterTokenStream) Test(org.junit.Test)

Example 10 with TwitterTokenStream

use of com.twitter.common.text.token.TwitterTokenStream in project commons by twitter.

the class PunctuationDetectorTest method testNoPunctuationDetector.

/**
 * Demonstrates that a bare RegexTokenizer, with no punctuation detector on
 * top of it, never assigns the PUNCTUATION token type, even to the "," and
 * "." tokens in the input.
 */
@Test
public void testNoPunctuationDetector() {
    TwitterTokenStream tokenizer = new RegexTokenizer.Builder().setDelimiterPattern(Pattern.compile(" ")).build();
    tokenizer.reset("When I was young , I liked insects .");
    int seen = 0;
    for (; tokenizer.incrementToken(); seen++) {
        assertFalse(TokenType.PUNCTUATION.equals(tokenizer.type()));
    }
    // All nine space-separated tokens must have been consumed.
    assertEquals(9, seen);
}
Also used : TwitterTokenStream(com.twitter.common.text.token.TwitterTokenStream) Test(org.junit.Test)

Aggregations

TwitterTokenStream (com.twitter.common.text.token.TwitterTokenStream): 13, Test (org.junit.Test): 10, DefaultTextTokenizer (com.twitter.common.text.DefaultTextTokenizer): 3, TextTokenizer (com.twitter.common.text.TextTokenizer): 2, PunctuationFilter (com.twitter.common.text.filter.PunctuationFilter): 1, TokenizedCharSequence (com.twitter.common.text.token.TokenizedCharSequence): 1, Token (com.twitter.common.text.token.TokenizedCharSequence.Token): 1, CharSequenceTermAttribute (com.twitter.common.text.token.attribute.CharSequenceTermAttribute): 1, TokenTypeAttribute (com.twitter.common.text.token.attribute.TokenTypeAttribute): 1, IOException (java.io.IOException): 1, Attribute (org.apache.lucene.util.Attribute): 1