Use of org.apache.lucene.analysis.cjk.CJKBigramFilter in the lucene-solr project by Apache.
From the class TestWithCJKBigramFilter, method setUp.
/**
 * Initializes the {@code analyzer} and {@code analyzer2} fields after delegating to the
 * superclass set-up.
 *
 * <p>{@code analyzer} chains ICUTokenizer into CJKBigramFilter; {@code analyzer2} inserts an
 * ICUNormalizer2Filter between the two. Both chains end in a StopFilter built with an empty
 * stop set.
 *
 * @throws Exception if the superclass set-up fails
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // Chain 1: ICUTokenizer -> CJKBigramFilter -> StopFilter (empty stop set).
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      final Tokenizer tokenizer =
          new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
      final TokenStream bigrams = new CJKBigramFilter(tokenizer);
      final TokenStream sink = new StopFilter(bigrams, CharArraySet.EMPTY_SET);
      return new TokenStreamComponents(tokenizer, sink);
    }
  };

  // Chain 2: ICUTokenizer -> ICUNormalizer2Filter -> CJKBigramFilter -> StopFilter (empty stop set).
  // ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
  // superset of CJKWidthFilter's foldings.
  analyzer2 = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      final Tokenizer tokenizer =
          new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
      // Normalization runs ahead of the bigram filter on purpose: it might combine some
      // halfwidth katakana forms, and that changes which bigrams get produced.
      final TokenStream normalized = new ICUNormalizer2Filter(tokenizer);
      final TokenStream bigrams = new CJKBigramFilter(normalized);
      final TokenStream sink = new StopFilter(bigrams, CharArraySet.EMPTY_SET);
      return new TokenStreamComponents(tokenizer, sink);
    }
  };
}
Aggregations