Search in sources :

Example 86 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestCodepointCountFilter method testRandomStrings.

/**
 * Runs 10000 randomized rounds against CodepointCountFilter.
 *
 * Each round obtains a string from TestUtil.randomUnicodeString and two
 * bounds from TestUtil.nextInt, orders the bounds, and feeds the string as
 * a single keyword token through the filter. The filter is expected to emit
 * the token exactly when the string's code point count lies within the
 * inclusive bounds.
 *
 * @throws IOException propagated from the token stream calls
 */
public void testRandomStrings() throws IOException {
    for (int round = 0; round < 10000; round++) {
        // Keep the random draws in this order so a given seed produces the same sequence.
        String input = TestUtil.randomUnicodeString(random(), 100);
        int firstBound = TestUtil.nextInt(random(), 0, 100);
        int secondBound = TestUtil.nextInt(random(), 0, 100);
        int codePoints = input.codePointCount(0, input.length());

        // The two bounds are drawn independently; order them into a valid range.
        int lower = Math.min(firstBound, secondBound);
        int upper = Math.max(firstBound, secondBound);
        boolean shouldPass = !(codePoints < lower || codePoints > upper);

        Tokenizer source = new KeywordTokenizer();
        source.setReader(new StringReader(input));
        TokenStream filtered = new CodepointCountFilter(source, lower, upper);

        filtered.reset();
        assertEquals(shouldPass, filtered.incrementToken());
        filtered.end();
        filtered.close();
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StringReader (java.io.StringReader), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer)

Example 87 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestLengthFilter method testEmptyTerm.

/**
 * Builds an analyzer that chains a KeywordTokenizer into a
 * LengthFilter constructed with (0, 5), hands it to checkOneTerm together
 * with two empty strings, and then closes the analyzer.
 *
 * NOTE(review): checkOneTerm is defined outside this excerpt; presumably it
 * asserts that the empty input yields a single empty term. Confirm against
 * the test base class.
 *
 * @throws IOException propagated from checkOneTerm
 */
public void testEmptyTerm() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            LengthFilter lengthFiltered = new LengthFilter(source, 0, 5);
            return new TokenStreamComponents(source, lengthFiltered);
        }
    };
    checkOneTerm(analyzer, "", "");
    analyzer.close();
}
Also used: Analyzer (org.apache.lucene.analysis.Analyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer)

Example 88 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestSerbianNormalizationRegularFilter method testEmptyTerm.

/**
 * Builds an analyzer that chains a KeywordTokenizer into a
 * SerbianNormalizationRegularFilter, hands it to checkOneTerm together with
 * two empty strings, and then closes the analyzer.
 *
 * NOTE(review): checkOneTerm is defined outside this excerpt; presumably it
 * asserts that the empty input yields a single empty term. Confirm against
 * the test base class.
 *
 * @throws IOException propagated from checkOneTerm
 */
public void testEmptyTerm() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            SerbianNormalizationRegularFilter normalized = new SerbianNormalizationRegularFilter(source);
            return new TokenStreamComponents(source, normalized);
        }
    };
    checkOneTerm(analyzer, "", "");
    analyzer.close();
}
Also used: Analyzer (org.apache.lucene.analysis.Analyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer)

Example 89 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testEmptyTerm.

/**
 * Exercises both compound-word filters with an empty input.
 *
 * First a KeywordTokenizer is chained into a
 * DictionaryCompoundWordTokenFilter backed by a small dictionary. Then a
 * KeywordTokenizer is chained into a HyphenationCompoundWordTokenFilter
 * backed by a hyphenation tree loaded from the "da_UTF8.xml" class resource.
 * Each analyzer is passed to checkOneTerm with two empty strings and closed
 * afterwards.
 *
 * NOTE(review): checkOneTerm and makeDictionary are defined outside this
 * excerpt; presumably checkOneTerm asserts that the empty input yields a
 * single empty term. Confirm against the test base class.
 *
 * @throws Exception propagated from resource loading or from checkOneTerm
 */
public void testEmptyTerm() throws Exception {
    final CharArraySet dictionary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    Analyzer dictionaryAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            DictionaryCompoundWordTokenFilter decompounded = new DictionaryCompoundWordTokenFilter(source, dictionary);
            return new TokenStreamComponents(source, decompounded);
        }
    };
    checkOneTerm(dictionaryAnalyzer, "", "");
    dictionaryAnalyzer.close();

    // The hyphenation grammar is loaded only after the dictionary-based check has finished.
    String grammarLocation = getClass().getResource("da_UTF8.xml").toExternalForm();
    InputSource grammarSource = new InputSource(grammarLocation);
    final HyphenationTree hyphenationTree = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammarSource);
    Analyzer hyphenationAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source, new HyphenationCompoundWordTokenFilter(source, hyphenationTree));
        }
    };
    checkOneTerm(hyphenationAnalyzer, "", "");
    hyphenationAnalyzer.close();
}
Also used: CharArraySet (org.apache.lucene.analysis.CharArraySet), InputSource (org.xml.sax.InputSource), HyphenationTree (org.apache.lucene.analysis.compound.hyphenation.HyphenationTree), Analyzer (org.apache.lucene.analysis.Analyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), TokenFilter (org.apache.lucene.analysis.TokenFilter)

Example 90 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestCJKWidthFilter method testEmptyTerm.

/**
 * Builds an analyzer that chains a KeywordTokenizer into a CJKWidthFilter,
 * hands it to checkOneTerm together with two empty strings, and then closes
 * the analyzer.
 *
 * NOTE(review): checkOneTerm is defined outside this excerpt; presumably it
 * asserts that the empty input yields a single empty term. Confirm against
 * the test base class.
 *
 * @throws IOException propagated from checkOneTerm
 */
public void testEmptyTerm() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            CJKWidthFilter widthNormalized = new CJKWidthFilter(source);
            return new TokenStreamComponents(source, widthNormalized);
        }
    };
    checkOneTerm(analyzer, "", "");
    analyzer.close();
}
Also used: Analyzer (org.apache.lucene.analysis.Analyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer)

Aggregations

KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 95, Tokenizer (org.apache.lucene.analysis.Tokenizer): 86, Analyzer (org.apache.lucene.analysis.Analyzer): 75, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 64, TokenStream (org.apache.lucene.analysis.TokenStream): 14, StringReader (java.io.StringReader): 11, WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 11, LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 4, PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 4, Random (java.util.Random): 3, CharArraySet (org.apache.lucene.analysis.CharArraySet): 3, LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer): 3, StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 3, StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 3, CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 3, Transliterator (com.ibm.icu.text.Transliterator): 2, UnicodeSet (com.ibm.icu.text.UnicodeSet): 2, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2, LowerCaseTokenizer (org.apache.lucene.analysis.core.LowerCaseTokenizer): 2, RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter): 2