
Example 81 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

Class TestRussianLightStemFilter, method testKeyword:

public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(asSet("энергии"), false);
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
            return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
        }
    };
    checkOneTerm(a, "энергии", "энергии");
    a.close();
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), CharArraySet (org.apache.lucene.analysis.CharArraySet), TokenStream (org.apache.lucene.analysis.TokenStream), SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter), Analyzer (org.apache.lucene.analysis.Analyzer), Tokenizer (org.apache.lucene.analysis.Tokenizer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)
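
Outside of a test, the same exclusion pattern can be wired with a real tokenizer. A minimal sketch, assuming the standard lucene-analyzers-common classes (WhitespaceTokenizer, SetKeywordMarkerFilter, RussianLightStemFilter); the class name is hypothetical:

import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.ru.RussianLightStemFilter;

// Hypothetical analyzer: terms in the exclusion set are flagged as keywords,
// so RussianLightStemFilter leaves them unstemmed while everything else is stemmed.
public class KeywordAwareRussianAnalyzer extends Analyzer {

    private final CharArraySet exclusionSet =
            new CharArraySet(Arrays.asList("энергии"), false);

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
        return new TokenStreamComponents(source, new RussianLightStemFilter(sink));
    }
}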

Example 82 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

Class ShingleFilterTest, method testRandomHugeStrings:

/** blast some random large strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
    Random random = random();
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
        }
    };
    checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 8192);
    a.close();
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Random (java.util.Random), Analyzer (org.apache.lucene.analysis.Analyzer), WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)
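
To see what the filter actually emits, here is a minimal sketch (not from the test) that consumes the same chain directly, with WhitespaceTokenizer in place of MockTokenizer. With ShingleFilter's defaults (maximum shingle size 2, unigrams kept), it prints each word followed by the two-word shingle that starts at it:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDemo {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("please divide this sentence"));
        // Standard TokenStream consumer workflow: reset, incrementToken loop, end, close.
        try (TokenStream shingles = new ShingleFilter(tokenizer)) {
            CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
            shingles.reset();
            while (shingles.incrementToken()) {
                // e.g. "please", "please divide", "divide", "divide this", ...
                System.out.println(term.toString());
            }
            shingles.end();
        }
    }
}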

Example 83 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

Class TestDaitchMokotoffSoundexFilter, method testRandomStrings:

/** blast some random strings through the analyzer */
public void testRandomStrings() throws IOException {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false));
        }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
    Analyzer b = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, false));
        }
    };
    checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
    b.close();
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Analyzer (org.apache.lucene.analysis.Analyzer), WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)
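
The second constructor argument is the inject flag: false (as above) replaces each token with its Daitch-Mokotoff code(s), while true keeps the original token and emits the codes alongside it. A minimal sketch of the injecting variant, assuming the same chain as the test (the variable name is hypothetical):

    Analyzer injecting = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            // inject = true: keep the original term and add its soundex code(s) alongside it
            return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, true));
        }
    };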

Example 84 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

Class TestDaitchMokotoffSoundexFilterFactory, method testSettingInject:

public void testSettingInject() throws Exception {
    Map<String, String> parameters = new HashMap<>();
    parameters.put("inject", "false");
    DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(parameters);
    Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    inputStream.setReader(new StringReader("international"));
    TokenStream filteredStream = factory.create(inputStream);
    assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass());
    assertTokenStreamContents(filteredStream, new String[] { "063963" });
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), TokenStream (org.apache.lucene.analysis.TokenStream), HashMap (java.util.HashMap), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer)
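
The same factory can also be plugged into an analysis chain programmatically. A minimal sketch using CustomAnalyzer from lucene-analyzers-common (an assumption for illustration; the test above calls the factory directly):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilterFactory;

public class SoundexAnalyzerFactoryDemo {
    // Whitespace tokenization followed by Daitch-Mokotoff soundex codes;
    // inject=false means the codes replace the original terms.
    public static Analyzer build() throws IOException {
        return CustomAnalyzer.builder()
                .withTokenizer(WhitespaceTokenizerFactory.class)
                .addTokenFilter(DaitchMokotoffSoundexFilterFactory.class, "inject", "false")
                .build();
    }
}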

Example 85 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

Class TestPhoneticFilter, method assertAlgorithm:

static void assertAlgorithm(Encoder encoder, boolean inject, String input, String[] expected) throws Exception {
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader(input));
    PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
    assertTokenStreamContents(filter, expected);
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)
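
The helper accepts any org.apache.commons.codec.Encoder, so one method can exercise several phonetic algorithms. A hypothetical test method (not from the test file) using commons-codec's Soundex with inject=false, so only the encoded form is expected; "I536" is the standard American Soundex code for "international", worked out by hand here and included only for illustration:

public void testSoundexReplace() throws Exception {
    // inject = false: the phonetic code replaces the original token,
    // so the stream should contain only the Soundex code for "international".
    assertAlgorithm(new org.apache.commons.codec.language.Soundex(), false,
        "international", new String[] { "I536" });
}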

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 573
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 286
Analyzer (org.apache.lucene.analysis.Analyzer): 265
StringReader (java.io.StringReader): 249
TokenStream (org.apache.lucene.analysis.TokenStream): 227
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216
Reader (java.io.Reader): 91
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 67
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 63
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 52
StopFilter (org.apache.lucene.analysis.StopFilter): 48
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 45
CharArraySet (org.apache.lucene.analysis.CharArraySet): 43
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 36
ESTestCase (org.elasticsearch.test.ESTestCase): 30
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 26
HashMap (java.util.HashMap): 23
Random (java.util.Random): 20
TokenFilter (org.apache.lucene.analysis.TokenFilter): 19