Search in sources :

Example 1 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project elasticsearch by elastic.

the class SimpleIcuNormalizerCharFilterTests method testDefaultSetting.

/**
 * Checks an {@code icu_normalizer} char filter that is configured with nothing but its type.
 *
 * <p>The filtered text is read in small chunks and, after every chunk, compared against what
 * an ICU {@code nfkc_cf} / COMPOSE normalizer produces for the portion of the input consumed
 * so far. Once the reader is exhausted, the accumulated output must equal the normalization
 * of the whole input.
 *
 * @throws Exception if building the test analysis or reading from the char filter fails
 */
public void testDefaultSetting() throws Exception {
    Settings settings = Settings.builder().put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer").build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);
    StringBuilder output = new StringBuilder();
    // try-with-resources: the filter (and the reader it wraps) is closed even if an assertion fails.
    try (CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input))) {
        // Deliberately tiny buffer so the filter is drained over many read() calls.
        char[] tempBuff = new char[10];
        int length;
        while ((length = inputReader.read(tempBuff)) != -1) {
            output.append(tempBuff, 0, length);
            // correctOffset translates the current output length into an offset in the original input.
            String consumedInput = input.substring(0, inputReader.correctOffset(output.length()));
            // Expected value first, actual second, so a failure message reads the right way round.
            assertEquals(normalizer.normalize(consumedInput), output.toString());
        }
    }
    assertEquals(expectedOutput, output.toString());
}
Also used : Normalizer2(com.ibm.icu.text.Normalizer2) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader) Index(org.elasticsearch.index.Index) AnalysisICUPlugin(org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin) Settings(org.elasticsearch.common.settings.Settings)

Example 2 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project elasticsearch by elastic.

the class SimpleIcuNormalizerCharFilterTests method testNameAndModeSetting.

/**
 * Checks an {@code icu_normalizer} char filter configured with {@code name=nfkc} and
 * {@code mode=decompose}.
 *
 * <p>The filtered text is read in small chunks and, after every chunk, compared against what
 * an ICU {@code nfkc} / DECOMPOSE normalizer produces for the portion of the input consumed
 * so far. Once the reader is exhausted, the accumulated output must equal the normalization
 * of the whole input.
 *
 * @throws Exception if building the test analysis or reading from the char filter fails
 */
public void testNameAndModeSetting() throws Exception {
    Settings settings = Settings.builder().put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer").put("index.analysis.char_filter.myNormalizerChar.name", "nfkc").put("index.analysis.char_filter.myNormalizerChar.mode", "decompose").build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
    String expectedOutput = normalizer.normalize(input);
    StringBuilder output = new StringBuilder();
    // try-with-resources: the filter (and the reader it wraps) is closed even if an assertion fails.
    try (CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input))) {
        // Deliberately tiny buffer so the filter is drained over many read() calls.
        char[] tempBuff = new char[10];
        int length;
        while ((length = inputReader.read(tempBuff)) != -1) {
            output.append(tempBuff, 0, length);
            // correctOffset translates the current output length into an offset in the original input.
            String consumedInput = input.substring(0, inputReader.correctOffset(output.length()));
            // Expected value first, actual second, so a failure message reads the right way round.
            assertEquals(normalizer.normalize(consumedInput), output.toString());
        }
    }
    assertEquals(expectedOutput, output.toString());
}
Also used : Normalizer2(com.ibm.icu.text.Normalizer2) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader) Index(org.elasticsearch.index.Index) AnalysisICUPlugin(org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin) Settings(org.elasticsearch.common.settings.Settings)

Example 3 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestJapaneseIterationMarkCharFilter method testNone.

/**
 * Runs the iteration-mark char filter with both the kanji and the kana flag switched off
 * and expects the text to come out exactly as it went in.
 *
 * @throws IOException if reading from the char filter fails
 */
public void testNone() throws IOException {
    final String text = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
    final boolean normalizeKanji = false;
    final boolean normalizeKana = false;
    CharFilter filter = new JapaneseIterationMarkCharFilter(new StringReader(text), normalizeKanji, normalizeKana);
    // No repetition marks are handled here, so the expected output is the input itself.
    assertCharFilterEquals(filter, text);
}
Also used : CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)

Example 4 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestJapaneseIterationMarkCharFilter method testKanjiOnly.

/**
 * Runs the iteration-mark char filter with the kanji flag on and the kana flag off.
 * The expected text differs from the input only in that the leading "時々" becomes "時時";
 * the kana marks (ゝ, ゞ) are expected to be left untouched.
 *
 * @throws IOException if reading from the char filter fails
 */
public void testKanjiOnly() throws IOException {
    final String original = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
    final String expected = "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
    final boolean normalizeKanji = true;
    final boolean normalizeKana = false;
    CharFilter filter = new JapaneseIterationMarkCharFilter(new StringReader(original), normalizeKanji, normalizeKana);
    assertCharFilterEquals(filter, expected);
}
Also used : CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)

Example 5 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestJapaneseIterationMarkCharFilterFactory method testKanjiOnlyIterationMarksWithJapaneseTokenizer.

/**
 * Feeds text through a factory-built iteration-mark char filter (normalizeKanji=true,
 * normalizeKana=false) into a factory-built Japanese tokenizer and checks the emitted tokens.
 *
 * @throws IOException if the factories or the token stream fail while reading
 */
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    // Char filter: only kanji iteration marks are switched on.
    Map<String, String> charFilterArgs = new HashMap<>();
    charFilterArgs.put("normalizeKanji", "true");
    charFilterArgs.put("normalizeKana", "false");
    JapaneseIterationMarkCharFilterFactory charFilterFactory = new JapaneseIterationMarkCharFilterFactory(charFilterArgs);

    // Tokenizer: no arguments, informed with an empty mock resource loader.
    JapaneseTokenizerFactory japaneseTokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String, String>());
    japaneseTokenizerFactory.inform(new StringMockResourceLoader(""));

    CharFilter filteredReader = charFilterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    Tokenizer tokenizer = (Tokenizer) japaneseTokenizerFactory.create(newAttributeFactory());
    tokenizer.setReader(filteredReader);

    String[] expectedTokens = { "時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ" };
    assertTokenStreamContents(tokenizer, expectedTokens);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer)

Aggregations

CharFilter (org.apache.lucene.analysis.CharFilter): 41, StringReader (java.io.StringReader): 40, TokenStream (org.apache.lucene.analysis.TokenStream): 26, Tokenizer (org.apache.lucene.analysis.Tokenizer): 10, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 7, MappingCharFilter (org.apache.lucene.analysis.charfilter.MappingCharFilter): 4, Normalizer2 (com.ibm.icu.text.Normalizer2): 3, ArrayList (java.util.ArrayList): 3, NormalizeCharMap (org.apache.lucene.analysis.charfilter.NormalizeCharMap): 3, NGramTokenizer (org.apache.lucene.analysis.ngram.NGramTokenizer): 3, HashMap (java.util.HashMap): 2, Settings (org.elasticsearch.common.settings.Settings): 2, Index (org.elasticsearch.index.Index): 2, AnalysisICUPlugin (org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin): 2, IOException (java.io.IOException): 1, MockCharFilter (org.apache.lucene.analysis.MockCharFilter): 1