Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.
Class TestPatternReplaceCharFilter, method testChain.
//           11111111112222222222333333333
// 012345678901234567890123456789012345678
//  a bb - ccc . --- bb a . ccc ccc bb
//  aa b - c . --- b aa . c c b
public void testChain() throws IOException {
  final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
  CharFilter cs = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK));
  cs = new PatternReplaceCharFilter(pattern("bb"), "b", cs);
  cs = new PatternReplaceCharFilter(pattern("ccc"), "c", cs);
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
      new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
      new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
      BLOCK.length());
}
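As a side note, here is a minimal standalone sketch of the same idea outside the test harness (the class name and the direct use of Pattern.compile in place of the test's pattern() helper are illustrative assumptions): it consumes the chained CharFilter as a plain Reader and uses CharFilter.correctOffset to map an offset in the filtered text back to the original input, which is what keeps the asserted token offsets anchored to BLOCK.

import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;

public class ChainedCharFilterSketch {
  public static void main(String[] args) throws Exception {
    String input = " a bb - ccc . --- bb a . ccc ccc bb";
    // Same chain as the test: a -> aa, then bb -> b, then ccc -> c.
    CharFilter cs = new PatternReplaceCharFilter(Pattern.compile("a"), "aa", new StringReader(input));
    cs = new PatternReplaceCharFilter(Pattern.compile("bb"), "b", cs);
    cs = new PatternReplaceCharFilter(Pattern.compile("ccc"), "c", cs);

    // A CharFilter is a Reader, so the filtered text can be read directly.
    StringBuilder filtered = new StringBuilder();
    char[] buf = new char[64];
    for (int n = cs.read(buf, 0, buf.length); n != -1; n = cs.read(buf, 0, buf.length)) {
      filtered.append(buf, 0, n);
    }
    System.out.println(filtered); // expected: " aa b - c . --- b aa . c c b"

    // Map the end of the filtered text back to the original input; per the
    // final-offset assertion in the test, this should be 35, i.e. input.length().
    System.out.println(cs.correctOffset(filtered.length()));
    cs.close();
  }
}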
Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.
Class TestPatternReplaceCharFilter, method test1block1matchShorter.
//           11111
// 012345678901234
// aa  bb   cc dd
// aa#bb dd
public void test1block1matchShorter() throws IOException {
  final String BLOCK = "aa  bb   cc dd";
  CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", new StringReader(BLOCK));
  TokenStream ts = whitespaceMockTokenizer(cs);
  assertTokenStreamContents(ts,
      new String[] { "aa#bb", "dd" },
      new int[] { 0, 12 },
      new int[] { 11, 14 },
      BLOCK.length());
}
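The "$1#$2" replacement relies on standard java.util.regex group references, which is why the surviving token is "aa#bb". A quick plain-JDK illustration of that substitution, independent of Lucene (class name hypothetical):

import java.util.regex.Pattern;

public class GroupReplacementSketch {
  public static void main(String[] args) {
    // The same pattern and "$1#$2" replacement used by the char filter above:
    // groups 1 and 2 are kept, group 3 and the intervening whitespace are dropped.
    String replaced = Pattern.compile("(aa)\\s+(bb)\\s+(cc)")
        .matcher("aa bb cc dd")
        .replaceAll("$1#$2");
    System.out.println(replaced); // "aa#bb dd"
  }
}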
Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.
Class TestJapaneseIterationMarkCharFilter, method testKanaOnly.
public void testKanaOnly() throws IOException {
  // Test kana-only repetition marks
  CharFilter filter = new JapaneseIterationMarkCharFilter(
      new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
      false, // no kanji
      true); // kana
  assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
}
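A minimal standalone sketch of the same constructor call outside the test harness (class name hypothetical, kuromoji module assumed on the classpath): per the flags in the test, the second argument controls kanji iteration-mark (々) normalization and the third controls kana iteration marks (ゝ, ゞ, ヽ, ヾ), so this configuration expands only the kana marks. Because a CharFilter is itself a Reader, the normalized text can be read straight out of it.

import java.io.StringReader;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;

public class KanaOnlySketch {
  public static void main(String[] args) throws Exception {
    CharFilter filter = new JapaneseIterationMarkCharFilter(
        new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
        false, // normalizeKanji: leave 々 untouched
        true); // normalizeKana: expand ゝ and ゞ

    // Drain the filter to get the normalized text.
    StringBuilder sb = new StringBuilder();
    char[] buf = new char[128];
    for (int n = filter.read(buf, 0, buf.length); n != -1; n = filter.read(buf, 0, buf.length)) {
      sb.append(buf, 0, n);
    }
    filter.close();
    // expected, per the assertion above:
    // 時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。
    System.out.println(sb);
  }
}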
Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.
Class TestJapaneseIterationMarkCharFilterFactory, method testIterationMarksWithKeywordTokenizer.
public void testIterationMarksWithKeywordTokenizer() throws IOException {
  final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
  JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String, String>());
  CharFilter filter = filterFactory.create(new StringReader(text));
  TokenStream tokenStream = new MockTokenizer(MockTokenizer.KEYWORD, false);
  ((Tokenizer) tokenStream).setReader(filter);
  assertTokenStreamContents(tokenStream, new String[] { "時時馬鹿馬鹿しいところどころミスズ" });
}
Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.
Class TestJapaneseIterationMarkCharFilterFactory, method testKanaOnlyIterationMarksWithJapaneseTokenizer.
public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
  JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String, String>());
  tokenizerFactory.inform(new StringMockResourceLoader(""));
  Map<String, String> filterArgs = new HashMap<>();
  filterArgs.put("normalizeKanji", "false");
  filterArgs.put("normalizeKana", "true");
  JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
  CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
  TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
  ((Tokenizer) tokenStream).setReader(filter);
  assertTokenStreamContents(tokenStream, new String[] { "時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ" });
}
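For comparison, here is a sketch that wires the same kana-only char filter into a JapaneseTokenizer constructed directly rather than through JapaneseTokenizerFactory and the test's mock helpers. The class name, the null user dictionary, and the SEARCH mode are assumptions for illustration, and the segmentation may differ slightly from the factory-configured tokenizer above; the sketch simply prints whatever terms come out.

import java.io.StringReader;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KanaOnlyTokenizeSketch {
  public static void main(String[] args) throws Exception {
    // Kana-only normalization, matching the factory args in the test above.
    CharFilter filter = new JapaneseIterationMarkCharFilter(
        new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"),
        false, // normalizeKanji
        true); // normalizeKana

    // Tokenizer built directly: no user dictionary, discard punctuation, SEARCH mode (assumed).
    JapaneseTokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(filter);
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(term.toString());
    }
    tokenizer.end();
    tokenizer.close();
  }
}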