
Example 36 with CharFilter

Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

From the class TestPatternReplaceCharFilter, method testChain.

//           11111111112222222222333333333
// 012345678901234567890123456789012345678
//  a bb - ccc . --- bb a . ccc ccc bb
//  aa b - c . --- b aa . c c b
public void testChain() throws IOException {
    final String BLOCK = " a bb - ccc . --- bb a . ccc ccc bb";
    CharFilter cs = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK));
    cs = new PatternReplaceCharFilter(pattern("bb"), "b", cs);
    cs = new PatternReplaceCharFilter(pattern("ccc"), "c", cs);
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts,
        new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
        new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
        new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
        BLOCK.length());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)
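
Because a CharFilter is itself a Reader, the same chain can be exercised outside the test harness by draining the outermost filter directly. A minimal sketch, assuming the public PatternReplaceCharFilter(Pattern, String, Reader) constructor in place of the test's pattern() helper; the Demo class name and the printed-output comment are illustrative, with the expected string taken from the ruler comment above:

import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;

public class PatternReplaceChainDemo {
    public static void main(String[] args) throws Exception {
        // Same chain as testChain(): a -> aa, then bb -> b, then ccc -> c.
        CharFilter cs = new PatternReplaceCharFilter(Pattern.compile("a"), "aa",
                new StringReader(" a bb - ccc . --- bb a . ccc ccc bb"));
        cs = new PatternReplaceCharFilter(Pattern.compile("bb"), "b", cs);
        cs = new PatternReplaceCharFilter(Pattern.compile("ccc"), "c", cs);

        // Drain the filter chain into a String.
        StringBuilder sb = new StringBuilder();
        char[] buf = new char[256];
        for (int n = cs.read(buf, 0, buf.length); n != -1; n = cs.read(buf, 0, buf.length)) {
            sb.append(buf, 0, n);
        }
        cs.close();
        System.out.println(sb); // expected: " aa b - c . --- b aa . c c b"
    }
}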

Example 37 with CharFilter

Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

From the class TestPatternReplaceCharFilter, method test1block1matchShorter.

//           11111
// 012345678901234
// aa  bb   cc dd
// aa#bb dd
public void test1block1matchShorter() throws IOException {
    final String BLOCK = "aa  bb   cc dd";
    CharFilter cs = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2", new StringReader(BLOCK));
    TokenStream ts = whitespaceMockTokenizer(cs);
    assertTokenStreamContents(ts, new String[] { "aa#bb", "dd" }, new int[] { 0, 12 }, new int[] { 11, 14 }, BLOCK.length());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)
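
The offsets asserted above (12 and 14 for "dd") point into the original BLOCK even though the filtered text is shorter; the tokenizer obtains them by passing its own offsets through CharFilter.correctOffset(). A minimal sketch of that mapping, assuming corrections are only available for text that has already been read; the Demo class name is illustrative and the expected values mirror the test's assertions:

import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;

public class CorrectOffsetDemo {
    public static void main(String[] args) throws Exception {
        CharFilter cs = new PatternReplaceCharFilter(
                Pattern.compile("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
                new StringReader("aa  bb   cc dd"));

        // Read the whole filtered stream first; offset corrections are recorded while reading.
        StringBuilder filtered = new StringBuilder();
        char[] buf = new char[64];
        for (int n = cs.read(buf, 0, buf.length); n != -1; n = cs.read(buf, 0, buf.length)) {
            filtered.append(buf, 0, n);
        }
        cs.close();

        System.out.println(filtered);             // "aa#bb dd"
        System.out.println(cs.correctOffset(6));  // 12, start of "dd" in the original input
        System.out.println(cs.correctOffset(8));  // 14, end of "dd" in the original input
    }
}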

Example 38 with CharFilter

Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

From the class TestJapaneseIterationMarkCharFilter, method testKanaOnly.

public void testKanaOnly() throws IOException {
    // Test kana only repetition marks
    CharFilter filter = new JapaneseIterationMarkCharFilter(
        new StringReader("時々、おゝのさんと一緒にお寿司が食べたいです。abcところゞゝゝ。"),
        false, // no kanji
        true); // kana
    assertCharFilterEquals(filter, "時々、おおのさんと一緒にお寿司が食べたいです。abcところどころ。");
}
Also used : CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)
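
For contrast, the complementary configuration (normalize kanji, leave kana) can be sketched the same way. The input string and the Demo class name are illustrative, and the expected output is an assumption derived from the full-normalization result in Example 39: kanji repetition marks are expanded while the kana marks pass through unchanged.

import java.io.StringReader;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilter;

public class KanjiOnlyIterationMarkDemo {
    public static void main(String[] args) throws Exception {
        // Complement of testKanaOnly(): expand kanji repetition marks, leave kana marks alone.
        CharFilter filter = new JapaneseIterationMarkCharFilter(
                new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"),
                true,   // kanji
                false); // no kana

        StringBuilder out = new StringBuilder();
        char[] buf = new char[64];
        for (int n = filter.read(buf, 0, buf.length); n != -1; n = filter.read(buf, 0, buf.length)) {
            out.append(buf, 0, n);
        }
        filter.close();
        System.out.println(out); // expected (assumed): "時時馬鹿馬鹿しいところゞゝゝミスヾ"
    }
}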

Example 39 with CharFilter

Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

From the class TestJapaneseIterationMarkCharFilterFactory, method testIterationMarksWithKeywordTokenizer.

public void testIterationMarksWithKeywordTokenizer() throws IOException {
    final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String, String>());
    CharFilter filter = filterFactory.create(new StringReader(text));
    TokenStream tokenStream = new MockTokenizer(MockTokenizer.KEYWORD, false);
    ((Tokenizer) tokenStream).setReader(filter);
    assertTokenStreamContents(tokenStream, new String[] { "時時馬鹿馬鹿しいところどころミスズ" });
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)
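
The empty argument map above relies on the factory's defaults, which (as Example 40 suggests by overriding them) appear to enable both normalizations. A minimal sketch passing the flags explicitly; the Demo class name is illustrative, and the equivalence to the no-args form is an assumption about those defaults:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilterFactory;

public class ExplicitIterationMarkArgsDemo {
    public static void main(String[] args) throws Exception {
        // Assumed to be equivalent to new JapaneseIterationMarkCharFilterFactory(new HashMap<>()).
        Map<String, String> filterArgs = new HashMap<>();
        filterArgs.put("normalizeKanji", "true");
        filterArgs.put("normalizeKana", "true");
        JapaneseIterationMarkCharFilterFactory factory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
        CharFilter filter = factory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));

        StringBuilder out = new StringBuilder();
        char[] buf = new char[64];
        for (int n = filter.read(buf, 0, buf.length); n != -1; n = filter.read(buf, 0, buf.length)) {
            out.append(buf, 0, n);
        }
        filter.close();
        System.out.println(out); // expected: "時時馬鹿馬鹿しいところどころミスズ", as asserted in the test
    }
}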

Example 40 with CharFilter

Use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

From the class TestJapaneseIterationMarkCharFilterFactory, method testKanaOnlyIterationMarksWithJapaneseTokenizer.

public void testKanaOnlyIterationMarksWithJapaneseTokenizer() throws IOException {
    JapaneseTokenizerFactory tokenizerFactory = new JapaneseTokenizerFactory(new HashMap<String, String>());
    tokenizerFactory.inform(new StringMockResourceLoader(""));
    Map<String, String> filterArgs = new HashMap<>();
    filterArgs.put("normalizeKanji", "false");
    filterArgs.put("normalizeKana", "true");
    JapaneseIterationMarkCharFilterFactory filterFactory = new JapaneseIterationMarkCharFilterFactory(filterArgs);
    CharFilter filter = filterFactory.create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));
    TokenStream tokenStream = tokenizerFactory.create(newAttributeFactory());
    ((Tokenizer) tokenStream).setReader(filter);
    assertTokenStreamContents(tokenStream, new String[] { "時々", "馬鹿", "々", "々", "しい", "ところどころ", "ミ", "スズ" });
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer)
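
The token list above is easier to follow once you look at the characters the JapaneseTokenizer actually receives: with normalizeKana=true and normalizeKanji=false, the char filter expands ところゞゝゝ to ところどころ and ミスヾ to ミスズ before tokenization, while 馬鹿々々 reaches the tokenizer untouched and is split into 馬鹿, 々, 々. A minimal sketch that prints that filtered text; the Demo class name is illustrative and the expected string is derived from the token assertions above:

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.CharFilter;
import org.apache.lucene.analysis.ja.JapaneseIterationMarkCharFilterFactory;

public class KanaOnlyFilteredTextDemo {
    public static void main(String[] args) throws Exception {
        // Kana-only configuration, matching the test above.
        Map<String, String> filterArgs = new HashMap<>();
        filterArgs.put("normalizeKanji", "false");
        filterArgs.put("normalizeKana", "true");
        CharFilter filter = new JapaneseIterationMarkCharFilterFactory(filterArgs)
                .create(new StringReader("時々馬鹿々々しいところゞゝゝミスヾ"));

        StringBuilder out = new StringBuilder();
        char[] buf = new char[64];
        for (int n = filter.read(buf, 0, buf.length); n != -1; n = filter.read(buf, 0, buf.length)) {
            out.append(buf, 0, n);
        }
        filter.close();
        System.out.println(out); // expected (derived): "時々馬鹿々々しいところどころミスズ"
    }
}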

Aggregations

CharFilter (org.apache.lucene.analysis.CharFilter): 41 usages
StringReader (java.io.StringReader): 40 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 26 usages
Tokenizer (org.apache.lucene.analysis.Tokenizer): 10 usages
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 7 usages
MappingCharFilter (org.apache.lucene.analysis.charfilter.MappingCharFilter): 4 usages
Normalizer2 (com.ibm.icu.text.Normalizer2): 3 usages
ArrayList (java.util.ArrayList): 3 usages
NormalizeCharMap (org.apache.lucene.analysis.charfilter.NormalizeCharMap): 3 usages
NGramTokenizer (org.apache.lucene.analysis.ngram.NGramTokenizer): 3 usages
HashMap (java.util.HashMap): 2 usages
Settings (org.elasticsearch.common.settings.Settings): 2 usages
Index (org.elasticsearch.index.Index): 2 usages
AnalysisICUPlugin (org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin): 2 usages
IOException (java.io.IOException): 1 usage
MockCharFilter (org.apache.lucene.analysis.MockCharFilter): 1 usage