Use of org.apache.lucene.analysis.ngram.NGramTokenizer in the Apache lucene-solr project.
Class SynonymTokenizer, method assertHighlighting.
/**
 * Highlights {@code text} for {@code query} and asserts that the best fragment equals
 * {@code expected}.
 *
 * <p>The text is analyzed with an {@link NGramTokenizer} constructed with {@code (4, 4)}, scored by a
 * {@link QueryScorer} built from {@code query} and {@code fieldName}, and fragmented by a
 * {@link SimpleFragmenter} constructed with {@code 100}.
 *
 * @param query     query whose matches are highlighted
 * @param formatter formatter handed to the {@link Highlighter}
 * @param text      raw text to highlight
 * @param expected  fragment the highlighter is expected to return
 * @param fieldName field name passed to both the scorer and the analyzer
 * @throws IOException                  propagated from highlighting or from closing the analyzer
 * @throws InvalidTokenOffsetsException propagated from the highlighter
 */
private void assertHighlighting(Query query, Formatter formatter, String text, String expected, String fieldName) throws IOException, InvalidTokenOffsetsException {
  final QueryScorer fragmentScorer = new QueryScorer(query, fieldName);
  final Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
  highlighter.setTextFragmenter(new SimpleFragmenter(100));

  final String fragment;
  // Analyzer is Closeable; try-with-resources guarantees it is closed even when
  // highlighting throws, instead of leaking it as the previous version did.
  try (Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new NGramTokenizer(4, 4));
    }
  }) {
    fragment = highlighter.getBestFragment(analyzer, fieldName, text);
  }

  assertEquals(expected, fragment);
}
Use of org.apache.lucene.analysis.ngram.NGramTokenizer in the Apache lucene-solr project.
Class TestICUNormalizer2CharFilter, method testTokenStream2.
/**
 * Feeds a string of compatibility characters and combining marks through an
 * {@link ICUNormalizer2CharFilter} configured with "nfkc_cf" in COMPOSE mode, tokenizes the
 * result with an {@link NGramTokenizer} constructed with {@code (1, 1)}, and checks the emitted
 * terms together with their start and end offsets.
 */
public void testTokenStream2() throws IOException {
  // Raw input, as listed by the original note: '㌰', a combining voiced mark, '5', '℃', '№',
  // '㈱', '㌘', 'サ' + combining voiced mark, 'ソ' + combining voiced mark.
  final String input = "㌰゙5℃№㈱㌘ザゾ";

  // Terms expected after normalization, one per token.
  final String[] expectedTerms = { "ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ" };
  // Expected offsets; the final offset asserted below is input.length().
  final int[] expectedStartOffsets = { 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9 };
  final int[] expectedEndOffsets = { 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11 };

  final Normalizer2 normalizer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE);
  final CharFilter normalizedReader = new ICUNormalizer2CharFilter(new StringReader(input), normalizer);

  final Tokenizer unigramTokenizer = new NGramTokenizer(newAttributeFactory(), 1, 1);
  unigramTokenizer.setReader(normalizedReader);

  assertTokenStreamContents(unigramTokenizer, expectedTerms, expectedStartOffsets, expectedEndOffsets, input.length());
}
Aggregations