Search in sources :

Example 86 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

the class TestPhoneticFilter method testRandomStrings.

/**
 * Blasts random strings through a {@link PhoneticFilter}-based analyzer, once
 * per supported encoder.
 *
 * <p>For every encoder two analyzers are built and each is fed
 * {@code 1000 * RANDOM_MULTIPLIER} iterations of random data. Both analyzers are
 * managed by try-with-resources so they are closed even when
 * {@code checkRandomData} fails.
 *
 * @throws IOException if {@code checkRandomData} fails with an I/O error
 */
public void testRandomStrings() throws IOException {
    Encoder[] encoders = new Encoder[] { new Metaphone(), new DoubleMetaphone(), new Soundex(), new RefinedSoundex(), new Caverphone2() };
    for (final Encoder e : encoders) {
        try (Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
            }
        }) {
            checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
        }
        // NOTE(review): this analyzer is configured exactly like the one above
        // (third PhoneticFilter argument is false in both). Presumably one of the
        // two was meant to pass true to cover the other mode -- confirm before
        // changing, since that would alter what the test exercises.
        try (Analyzer b = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false));
            }
        }) {
            checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
        }
    }
}
Also used : RefinedSoundex(org.apache.commons.codec.language.RefinedSoundex) DoubleMetaphone(org.apache.commons.codec.language.DoubleMetaphone) Metaphone(org.apache.commons.codec.language.Metaphone) Caverphone2(org.apache.commons.codec.language.Caverphone2) Analyzer(org.apache.lucene.analysis.Analyzer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) RefinedSoundex(org.apache.commons.codec.language.RefinedSoundex) Soundex(org.apache.commons.codec.language.Soundex) DoubleMetaphone(org.apache.commons.codec.language.DoubleMetaphone) Encoder(org.apache.commons.codec.Encoder) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 87 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

the class TestBeiderMorseFilter method testCustomAttribute.

/**
 * Checks that every token emitted by a {@link BeiderMorseFilter} still carries
 * the keyword flag set by an upstream {@link PatternKeywordMarkerFilter}.
 *
 * <p>The input {@code "D'Angelo"} is read as a single keyword token, marked by
 * a pattern that matches everything, and then passed through the filter; the
 * test expects 12 output tokens, each reporting {@code isKeyword() == true}.
 *
 * @throws IOException if consuming the token stream fails
 */
public void testCustomAttribute() throws IOException {
    // Typed as Tokenizer so setReader can be called without a cast.
    Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
    source.setReader(new StringReader("D'Angelo"));
    TokenStream marked = new PatternKeywordMarkerFilter(source, Pattern.compile(".*"));
    PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true);
    TokenStream stream = new BeiderMorseFilter(marked, engine);
    KeywordAttribute keyAtt = stream.addAttribute(KeywordAttribute.class);
    stream.reset();
    int tokenCount = 0;
    for (; stream.incrementToken(); tokenCount++) {
        assertTrue(keyAtt.isKeyword());
    }
    assertEquals(12, tokenCount);
    stream.end();
    stream.close();
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) PhoneticEngine(org.apache.commons.codec.language.bm.PhoneticEngine) TokenStream(org.apache.lucene.analysis.TokenStream) KeywordAttribute(org.apache.lucene.analysis.tokenattributes.KeywordAttribute) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) PatternKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.PatternKeywordMarkerFilter)

Example 88 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

the class TestDaitchMokotoffSoundexFilter method testEmptyTerm.

/**
 * Checks that an empty input term passes through
 * {@link DaitchMokotoffSoundexFilter} as a single empty term.
 *
 * <p>The filter's boolean argument is chosen at random on each
 * {@code createComponents} call. The analyzer is managed by try-with-resources
 * so it is closed even when {@code checkOneTerm} fails.
 *
 * @throws IOException if {@code checkOneTerm} fails with an I/O error
 */
public void testEmptyTerm() throws IOException {
    try (Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new DaitchMokotoffSoundexFilter(tokenizer, random().nextBoolean()));
        }
    }) {
        checkOneTerm(a, "", "");
    }
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 89 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

the class SmartChineseAnalyzer method createComponents.

/**
 * Builds the analysis chain: an {@link HMMChineseTokenizer} followed by a
 * {@link PorterStemFilter}, plus a {@link StopFilter} when the configured
 * stop-word set is not empty.
 *
 * @param fieldName name of the field being analyzed; not used by this implementation
 * @return the tokenizer together with the last stage of the filter chain
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new HMMChineseTokenizer();
    // No LowerCaseFilter here: per the original author's note, SegTokenFilter
    // already lowercases Basic Latin text.
    // The Porter stemming is strict on purpose ("not a bug, a feature").
    final TokenStream stemmed = new PorterStemFilter(source);
    // Only pay for stop-word filtering when there is something to filter.
    final TokenStream sink = stopWords.isEmpty() ? stemmed : new StopFilter(stemmed, stopWords);
    return new TokenStreamComponents(source, sink);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.StopFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 90 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

the class TestPhoneticFilterFactory method assertAlgorithm.

/**
 * Runs {@code input} through a {@link PhoneticFilterFactory} configured with
 * the given encoder and inject settings and asserts the resulting tokens.
 *
 * @param algName  value passed as the factory's {@code "encoder"} argument
 * @param inject   value passed as the factory's {@code "inject"} argument
 * @param input    text fed to a whitespace mock tokenizer
 * @param expected tokens the filtered stream is expected to produce, in order
 * @throws Exception if building the factory or consuming the stream fails
 */
static void assertAlgorithm(String algName, String inject, String input, String[] expected) throws Exception {
    // Create the tokenizer first, as the original did, so failures surface in the same order.
    Tokenizer source = whitespaceMockTokenizer(input);
    Map<String, String> factoryArgs = new HashMap<>();
    factoryArgs.put("encoder", algName);
    factoryArgs.put("inject", inject);
    PhoneticFilterFactory phoneticFactory = new PhoneticFilterFactory(factoryArgs);
    phoneticFactory.inform(new ClasspathResourceLoader(phoneticFactory.getClass()));
    assertTokenStreamContents(phoneticFactory.create(source), expected);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) ClasspathResourceLoader(org.apache.lucene.analysis.util.ClasspathResourceLoader) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 573, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 286, Analyzer (org.apache.lucene.analysis.Analyzer): 265, StringReader (java.io.StringReader): 249, TokenStream (org.apache.lucene.analysis.TokenStream): 227, KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216, Reader (java.io.Reader): 91, WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 67, StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 63, SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 52, StopFilter (org.apache.lucene.analysis.StopFilter): 48, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47, LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 45, CharArraySet (org.apache.lucene.analysis.CharArraySet): 43, StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 36, ESTestCase (org.elasticsearch.test.ESTestCase): 30, CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 26, HashMap (java.util.HashMap): 23, Random (java.util.Random): 20, TokenFilter (org.apache.lucene.analysis.TokenFilter): 19