use of com.ibm.icu.text.Normalizer2 in project elasticsearch by elastic.
the class SimpleIcuNormalizerCharFilterTests method testDefaultSetting.
/**
 * Checks the {@code icu_normalizer} char filter when only its {@code type} is configured.
 *
 * <p>The filter output is compared against a {@link Normalizer2} obtained for
 * {@code "nfkc_cf"} in {@code COMPOSE} mode. The input is drained through a
 * 10-char buffer; after every read, the text produced so far must equal the
 * normalized form of the input prefix that {@code correctOffset} maps the
 * current output length back to.
 *
 * @throws Exception if building the test analysis or reading from the filter fails
 */
public void testDefaultSetting() throws Exception {
    Settings settings = Settings.builder()
        .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
        .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");

    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    // The filter is a Reader; try-with-resources closes it even when an assertion fails.
    try (CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input))) {
        // Deliberately small buffer so the input is consumed in several partial reads.
        char[] tempBuff = new char[10];
        StringBuilder output = new StringBuilder();
        int length;
        while ((length = inputReader.read(tempBuff)) != -1) {
            output.append(tempBuff, 0, length);
            // Map the number of output chars back to an offset in the original input.
            String consumedPrefix = input.substring(0, inputReader.correctOffset(output.length()));
            // Expected value first: the reference normalizer is the oracle, the filter output is under test.
            assertEquals(normalizer.normalize(consumedPrefix), output.toString());
        }
        assertEquals(expectedOutput, output.toString());
    }
}
use of com.ibm.icu.text.Normalizer2 in project elasticsearch by elastic.
the class SimpleIcuNormalizerCharFilterTests method testNameAndModeSetting.
/**
 * Checks the {@code icu_normalizer} char filter when {@code name} is set to
 * {@code "nfkc"} and {@code mode} to {@code "decompose"}.
 *
 * <p>The filter output is compared against a {@link Normalizer2} obtained for
 * {@code "nfkc"} in {@code DECOMPOSE} mode. The input is drained through a
 * 10-char buffer; after every read, the text produced so far must equal the
 * normalized form of the input prefix that {@code correctOffset} maps the
 * current output length back to.
 *
 * @throws Exception if building the test analysis or reading from the filter fails
 */
public void testNameAndModeSetting() throws Exception {
    Settings settings = Settings.builder()
        .put("index.analysis.char_filter.myNormalizerChar.type", "icu_normalizer")
        .put("index.analysis.char_filter.myNormalizerChar.name", "nfkc")
        .put("index.analysis.char_filter.myNormalizerChar.mode", "decompose")
        .build();
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), settings, new AnalysisICUPlugin());
    CharFilterFactory charFilterFactory = analysis.charFilter.get("myNormalizerChar");

    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
    String expectedOutput = normalizer.normalize(input);

    // The filter is a Reader; try-with-resources closes it even when an assertion fails.
    try (CharFilter inputReader = (CharFilter) charFilterFactory.create(new StringReader(input))) {
        // Deliberately small buffer so the input is consumed in several partial reads.
        char[] tempBuff = new char[10];
        StringBuilder output = new StringBuilder();
        int length;
        while ((length = inputReader.read(tempBuff)) != -1) {
            output.append(tempBuff, 0, length);
            // Map the number of output chars back to an offset in the original input.
            String consumedPrefix = input.substring(0, inputReader.correctOffset(output.length()));
            // Expected value first: the reference normalizer is the oracle, the filter output is under test.
            assertEquals(normalizer.normalize(consumedPrefix), output.toString());
        }
        assertEquals(expectedOutput, output.toString());
    }
}
use of com.ibm.icu.text.Normalizer2 in project lucene-solr by apache.
the class TestICUNormalizer2CharFilter method testNormalization.
/**
 * Checks that {@code ICUNormalizer2CharFilter} yields the same text as applying
 * the wrapped {@link Normalizer2} ({@code "nfkc_cf"}, {@code COMPOSE}) directly.
 *
 * <p>The input is drained through a 10-char buffer; after every read, the text
 * produced so far must equal the normalized form of the input prefix that
 * {@code correctOffset} maps the current output length back to.
 *
 * @throws IOException if reading from or closing the filter fails
 */
public void testNormalization() throws IOException {
    String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
    Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
    String expectedOutput = normalizer.normalize(input);

    // The filter is a Reader; try-with-resources closes it even when an assertion fails.
    try (CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer)) {
        // Deliberately small buffer so the input is consumed in several partial reads.
        char[] tempBuff = new char[10];
        StringBuilder output = new StringBuilder();
        int length;
        while ((length = reader.read(tempBuff)) != -1) {
            output.append(tempBuff, 0, length);
            // Map the number of output chars back to an offset in the original input.
            String consumedPrefix = input.substring(0, reader.correctOffset(output.length()));
            // Expected value first: the reference normalizer is the oracle, the filter output is under test.
            assertEquals(normalizer.normalize(consumedPrefix), output.toString());
        }
        assertEquals(expectedOutput, output.toString());
    }
}
use of com.ibm.icu.text.Normalizer2 in project elasticsearch by elastic.
the class IcuFoldingTokenFilterFactory method create.
/**
 * Wraps {@code tokenStream} in an ICU folding filter.
 *
 * <p>When no {@code unicodeSetFilter} is configured, a plain {@code ICUFoldingFilter}
 * is returned. Otherwise the {@code utr30} normalizer is loaded from the
 * {@code utr30.nrm} resource next to {@code ICUFoldingFilter}, restricted to the
 * configured set via {@code FilteredNormalizer2}, and applied through an
 * {@code ICUNormalizer2Filter}.
 *
 * @param tokenStream the stream to fold
 * @return the folding token stream
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    if (unicodeSetFilter == null) {
        return new ICUFoldingFilter(tokenStream);
    }

    // ICUFoldingFilter lacks a constructor for adding filtering, so we implement it here.
    Normalizer2 utr30 = Normalizer2.getInstance(
        ICUFoldingFilter.class.getResourceAsStream("utr30.nrm"),
        "utr30",
        Normalizer2.Mode.COMPOSE
    );
    UnicodeSet allowed = new UnicodeSet(unicodeSetFilter);
    allowed.freeze();
    return new org.apache.lucene.analysis.icu.ICUNormalizer2Filter(tokenStream, new FilteredNormalizer2(utr30, allowed));
}
Aggregations