Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
In class TestCompoundWordTokenFilter, the method testInvalidOffsets.
// SOLR-2891
// *CompoundWordTokenFilter blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case ü -> ue)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
  final CharArraySet dict = makeDictionary("fall");
  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("ü", "ue");
  final NormalizeCharMap normMap = builder.build();
  Analyzer analyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new DictionaryCompoundWordTokenFilter(tokenizer, dict);
      return new TokenStreamComponents(tokenizer, filter);
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      return new MappingCharFilter(normMap, reader);
    }
  };
  assertAnalyzesTo(analyzer, "banküberfall",
      new String[] { "bankueberfall", "fall" },
      new int[] { 0, 0 },
      new int[] { 12, 12 });
  analyzer.close();
}
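The makeDictionary helper called above is defined elsewhere in the test class and not shown. A minimal sketch, assuming it simply wraps the given words in a case-insensitive CharArraySet (which is what the compound-word decomposition requires):

// Minimal sketch of the makeDictionary helper (assumption: it builds an
// ignore-case CharArraySet over the given words). Requires java.util.Arrays
// and org.apache.lucene.analysis.CharArraySet.
private static CharArraySet makeDictionary(String... dictionary) {
  return new CharArraySet(Arrays.asList(dictionary), true); // true = ignoreCase
}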
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
In class TestCompoundWordTokenFilter, the method testEmptyTerm.
public void testEmptyTerm() throws Exception {
  final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
  Analyzer a = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
  InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
  Analyzer b = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  checkOneTerm(b, "", "");
  b.close();
}
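checkOneTerm comes from Lucene's BaseTokenStreamTestCase and is not shown here. A minimal sketch of the core check it performs, assuming the standard TokenStream consumption contract (reset, incrementToken, end, close); the name assertSingleTerm is a hypothetical stand-in, not the real helper:

// Minimal sketch of what checkOneTerm verifies: the analyzer emits exactly
// one token with the expected text. Requires org.apache.lucene.analysis.TokenStream
// and org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
static void assertSingleTerm(Analyzer analyzer, String input, String expected) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("field", input)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();                          // required before the first incrementToken()
    assertTrue(ts.incrementToken());     // exactly one token expected
    assertEquals(expected, termAtt.toString());
    assertFalse(ts.incrementToken());    // no further tokens
    ts.end();                            // finalize end-of-stream state
  }
}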
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
In class TestElision, the method testElision.
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tas = filter(filter);
  assertEquals("embrouille", tas.get(4));
  assertEquals("O'brian", tas.get(6));
  assertEquals("enfin", tas.get(7));
}
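The filter(...) helper that collects the produced terms is also not shown above. A minimal sketch, assuming it drains the stream into a list via CharTermAttribute (the name and signature are inferred from the call site):

// Minimal sketch of the filter(...) helper: consumes the TokenFilter and
// collects each term's text in order. Requires java.io.IOException,
// java.util.ArrayList, java.util.List, and
// org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
private static List<String> filter(TokenFilter filter) throws IOException {
  List<String> tas = new ArrayList<>();
  CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
  filter.reset();
  while (filter.incrementToken()) {
    tas.add(termAtt.toString());
  }
  filter.end();
  filter.close();
  return tas;
}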