Example usage of org.apache.lucene.analysis.CharFilter from the elasticsearch project (by elastic).
From class SimpleIcuNormalizerCharFilterTests, method testDefaultSetting:
/**
 * Verifies that an {@code icu_normalizer} char filter configured with only its type
 * falls back to the default normalization form (nfkc_cf, compose mode), by comparing
 * the filter's output against ICU's {@link Normalizer2} applied directly.
 *
 * @throws Exception if the analysis chain cannot be built or reading fails
 */
public void testDefaultSetting() throws Exception {
    Settings settings = Settings.builder()
        .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
        .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    // Compute the expected result with ICU directly: the filter's default is nfkc_cf / COMPOSE.
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);
    // try-with-resources: the original leaked the reader when an assertion failed mid-loop.
    try (CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input))) {
        // Deliberately small buffer so the input is consumed in several chunks,
        // exercising the filter's incremental (buffered) normalization path.
        char[] tempBuff = new char[10];
        StringBuilder output = new StringBuilder();
        int length;
        while ((length = inputReader.read(tempBuff)) != -1) {
            output.append(tempBuff, 0, length);
            // After each chunk, the partial output must equal the normalization of the
            // offset-corrected prefix of the original input.
            assertEquals(output.toString(),
                normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
        }
        assertEquals(expectedOutput, output.toString());
    }
}
Example usage of org.apache.lucene.analysis.CharFilter from the elasticsearch project (by elastic).
From class SimpleIcuNormalizerCharFilterTests, method testNameAndModeSetting:
/**
 * Verifies that the {@code name} and {@code mode} settings of an {@code icu_normalizer}
 * char filter are honored (here: nfkc in decompose mode), by comparing the filter's
 * output against ICU's {@link Normalizer2} applied directly.
 *
 * @throws Exception if the analysis chain cannot be built or reading fails
 */
public void testNameAndModeSetting() throws Exception {
    Settings settings = Settings.builder()
        .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
        .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
        .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
        .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    // Expected result computed with ICU using the same name/mode the filter was configured with.
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
    String expectedOutput = normalizer.normalize(input);
    // try-with-resources: the original leaked the reader when an assertion failed mid-loop.
    try (CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input))) {
        // Deliberately small buffer so the input is consumed in several chunks,
        // exercising the filter's incremental (buffered) normalization path.
        char[] tempBuff = new char[10];
        StringBuilder output = new StringBuilder();
        int length;
        while ((length = inputReader.read(tempBuff)) != -1) {
            output.append(tempBuff, 0, length);
            // After each chunk, the partial output must equal the normalization of the
            // offset-corrected prefix of the original input.
            assertEquals(output.toString(),
                normalizer.normalize(input.substring(0, inputReader.correctOffset(output.length()))));
        }
        assertEquals(expectedOutput, output.toString());
    }
}
Example usage of org.apache.lucene.analysis.CharFilter from the lucene-solr project (by apache).
From class TestJapaneseIterationMarkCharFilter, method testNone:
/**
 * With both kanji and kana iteration-mark normalization disabled, the filter
 * must pass the text through completely unchanged.
 *
 * @throws IOException if reading from the filter fails
 */
public void testNone() throws IOException {
    String text = "時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。";
    CharFilter charFilter = new JapaneseIterationMarkCharFilter(
        new StringReader(text),
        false,  // normalizeKanji: off
        false); // normalizeKana: off
    // Output must be byte-identical to the input.
    assertCharFilterEquals(charFilter, text);
}
Example usage of org.apache.lucene.analysis.CharFilter from the lucene-solr project (by apache).
From class TestJapaneseIterationMarkCharFilter, method testKanjiOnly:
/**
 * With only kanji normalization enabled, the kanji iteration mark (々) is expanded
 * (時々 becomes 時時) while kana iteration marks (ゞゝ) are left intact.
 *
 * @throws IOException if reading from the filter fails
 */
public void testKanjiOnly() throws IOException {
    CharFilter charFilter = new JapaneseIterationMarkCharFilter(
        new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
        true,   // normalizeKanji: on
        false); // normalizeKana: off
    assertCharFilterEquals(charFilter, "時時、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。");
}
Example usage of org.apache.lucene.analysis.CharFilter from the lucene-solr project (by apache).
From class TestJapaneseIterationMarkCharFilterFactory, method testKanjiOnlyIterationMarksWithJapaneseTokenizer:
/**
 * Runs the iteration-mark char filter (kanji-only mode) in front of the Japanese
 * tokenizer and checks the resulting token stream: kanji marks are expanded
 * (時々 → 時時, 馬鹿々々しい → 馬鹿馬鹿しい) while kana marks survive as standalone tokens.
 *
 * @throws IOException if the resource loader or tokenizer setup fails
 */
public void testKanjiOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    // Tokenizer factory with no extra configuration and an empty resource loader.
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<>());
    tokenizerFactory.inform(new StringMockResourceLoader(""));

    // Char filter factory configured to normalize kanji iteration marks only.
    Map<String, String> charFilterArgs = new HashMap<>();
    charFilterArgs.put("normalizeKanji", "true");
    charFilterArgs.put("normalizeKana", "false");
    JapaneseIterationMarkCharFilterFactory charFilterFactory =
        new JapaneseIterationMarkCharFilterFactory(charFilterArgs);

    CharFilter charFilter = charFilterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    Tokenizer tokenizer = (Tokenizer) tokenizerFactory.create(newAttributeFactory());
    tokenizer.setReader(charFilter);
    assertTokenStreamContents(tokenizer,
        new String[] { "時時", "馬鹿馬鹿しい", "ところ", "ゞ", "ゝ", "ゝ", "ミス", "ヾ" });
}
Aggregations