Example use of org.apache.lucene.analysis.CharFilter from the Apache lucene-solr project.
Source: class TestPatternReplaceCharFilter, method testReplaceByEmpty.
// 012345678
// aa bb cc
/** Replacing a pattern that matches the whole input with "" must yield an empty token stream. */
public void testReplaceByEmpty() throws IOException {
  final String input = "aa bb cc";
  CharFilter filtered =
      new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "", new StringReader(input));
  TokenStream tokens = whitespaceMockTokenizer(filtered);
  // No tokens expected: the entire input was removed by the replacement.
  assertTokenStreamContents(tokens, new String[0]);
}
Example use of org.apache.lucene.analysis.CharFilter from the Apache lucene-solr project.
Source: class TestPatternReplaceCharFilter, method testReplaceByEmpty1block1matchLonger (shown as test1block1matchLonger).
// 11111
// 012345678901234
// aa bb cc dd
// aa##bb###cc dd
/**
 * The replacement "$1##$2###$3" is longer than the matched text, so the char filter
 * must still map token offsets back to the original input: the first token spans the
 * original match (0..8) and "dd" keeps its original offsets (9..11).
 */
public void test1block1matchLonger() throws IOException {
  final String input = "aa bb cc dd";
  CharFilter filtered = new PatternReplaceCharFilter(
      pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3", new StringReader(input));
  TokenStream tokens = whitespaceMockTokenizer(filtered);
  assertTokenStreamContents(
      tokens,
      new String[] { "aa##bb###cc", "dd" },
      new int[] { 0, 9 },     // start offsets in the ORIGINAL input
      new int[] { 8, 11 },    // end offsets in the ORIGINAL input
      input.length());
}
Example use of org.apache.lucene.analysis.CharFilter from the Apache lucene-solr project.
Source: class TestICUNormalizer2CharFilter, method testTokenStream.
/**
 * NFKC normalization through the char filter: compatibility characters such as
 * '℃', '№', '㈱', '㌘' expand to their canonical forms, and base+voicing-mark
 * sequences compose. Token offsets must refer to the ORIGINAL (un-normalized) input.
 */
public void testTokenStream() throws IOException {
  String input = "℃ № ㈱ ㌘ ザ ゾ ㌰゙";
  CharFilter normalized = new ICUNormalizer2CharFilter(
      new StringReader(input),
      Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE));
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(normalized);
  assertTokenStreamContents(
      tokenizer,
      new String[] { "°C", "No", "(株)", "グラム", "ザ", "ゾ", "ピゴ" },
      new int[] { 0, 2, 4, 6, 8, 11, 14 },  // start offsets in the original input
      new int[] { 1, 3, 5, 7, 10, 13, 16 }, // end offsets in the original input
      input.length());
}
Example use of org.apache.lucene.analysis.CharFilter from the Apache lucene-solr project.
Source: class TestICUNormalizer2CharFilter, method testTokenStream2.
/**
 * NFKC case-folded normalization checked at unigram granularity: a 1-gram
 * tokenizer over the normalized stream shows how single original characters
 * map to one or several normalized characters, with offsets corrected back
 * to the original input (note several tokens sharing the same start offset).
 */
public void testTokenStream2() throws IOException {
  String input = "㌰゙5℃№㈱㌘ザゾ";
  CharFilter normalized = new ICUNormalizer2CharFilter(
      new StringReader(input),
      Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
  Tokenizer tokenizer = new NGramTokenizer(newAttributeFactory(), 1, 1);
  tokenizer.setReader(normalized);
  assertTokenStreamContents(
      tokenizer,
      new String[] { "ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ" },
      new int[] { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9 },  // original start offsets
      new int[] { 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11 }, // original end offsets
      input.length());
}
Example use of org.apache.lucene.analysis.CharFilter from the Apache lucene-solr project.
Source: class TestICUNormalizer2CharFilter, method testNormalization.
/**
 * Incremental normalization must agree with one-shot normalization. The input is
 * read through a deliberately tiny (10-char) buffer so that normalization happens
 * across buffer boundaries; after each chunk, the accumulated output must equal
 * the normalization of the corresponding input prefix (located via correctOffset),
 * and the final result must match normalizing the whole string at once.
 */
public void testNormalization() throws IOException {
  String input = "ʰ㌰゙5℃№㈱㌘,バッファーの正規化のテスト.㋐㋑㋒㋓㋔カキクケコザジズゼゾg̈각/각நிเกषिchkʷक्षि";
  Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  String expected = normalizer.normalize(input);
  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);
  StringBuilder actual = new StringBuilder();
  char[] chunk = new char[10];
  int read;
  while ((read = reader.read(chunk)) != -1) {
    actual.append(chunk, 0, read);
    // Map the output length back to an input offset and compare prefixes.
    String inputPrefix = input.substring(0, reader.correctOffset(actual.length()));
    assertEquals(actual.toString(), normalizer.normalize(inputPrefix));
  }
  assertEquals(expected, actual.toString());
}
Aggregations