Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
Class TestCompoundWordTokenFilter, method testInvalidOffsets.
// Regression test for SOLR-2891.
// The compound-word filters add the term length to the start offset without any check. When an
// earlier stage has made the word longer than the original text (here the char filter maps
// "ü" to "ue"), that sum would point past the end of the input. In that situation the filter is
// expected to behave like WDF and keep the already-corrected offsets of the parent token, so
// both the full token and the "fall" subword report offsets 0..12.
public void testInvalidOffsets() throws Exception {
  final CharArraySet dictionary = makeDictionary("fall");

  // Char-level normalization that lengthens the input: one char in, two chars out.
  final NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  mapBuilder.add("ü", "ue");
  final NormalizeCharMap charMap = mapBuilder.build();

  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new DictionaryCompoundWordTokenFilter(source, dictionary));
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      return new MappingCharFilter(charMap, reader);
    }
  };

  // The original text is 12 chars long; the normalized term is 13 chars long.
  final String[] expectedTerms = { "bankueberfall", "fall" };
  final int[] expectedStartOffsets = { 0, 0 };
  final int[] expectedEndOffsets = { 12, 12 };
  assertAnalyzesTo(a, "banküberfall", expectedTerms, expectedStartOffsets, expectedEndOffsets);
  a.close();
}
Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
Class UkrainianMorfologikAnalyzer, method initReader.
/**
 * Wraps the incoming reader in a {@link MappingCharFilter} that normalizes characters before
 * tokenization: several apostrophe look-alikes are mapped to the ASCII apostrophe, two
 * characters are removed entirely, and the letter "ґ" (both cases) is mapped to "г".
 *
 * @param fieldName name of the field being analyzed (not used by this implementation)
 * @param reader the raw character stream for the field
 * @return a reader that applies the normalization map on top of {@code reader}
 */
@Override
protected Reader initReader(String fieldName, Reader reader) {
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  // different apostrophes
  builder.add("’", "'");
  builder.add("‘", "'");
  builder.add("ʼ", "'");
  builder.add("`", "'");
  builder.add("´", "'");
  // ignored characters
  // These two are written as Unicode escapes on purpose: both are invisible or easily lost
  // when the file is edited or re-encoded, and an empty match string is not a valid mapping.
  builder.add("\u0301", ""); // U+0301 combining acute accent
  // NOTE(review): soft hyphen is the presumed intent for this entry - confirm against upstream.
  builder.add("\u00AD", ""); // U+00AD soft hyphen
  builder.add("ґ", "г");
  builder.add("Ґ", "Г");
  return new MappingCharFilter(builder.build(), reader);
}
Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
Class TestPathHierarchyTokenizer, method testNormalizeWinDelimToLinuxDelim.
/**
 * Feeds a Windows-style path through a char filter that rewrites each backslash to a forward
 * slash, then checks that {@link PathHierarchyTokenizer} emits one token per path prefix with
 * offsets that refer to the original (unmodified) input.
 */
public void testNormalizeWinDelimToLinuxDelim() throws Exception {
  // Single-char to single-char mapping, so the text length is unchanged.
  NormalizeCharMap.Builder mapBuilder = new NormalizeCharMap.Builder();
  mapBuilder.add("\\", "/");
  NormalizeCharMap slashMap = mapBuilder.build();

  final String path = "c:\\a\\b\\c";
  Reader normalized = new MappingCharFilter(slashMap, new StringReader(path));

  PathHierarchyTokenizer tokenizer =
      new PathHierarchyTokenizer(newAttributeFactory(), DEFAULT_DELIMITER, DEFAULT_DELIMITER, DEFAULT_SKIP);
  tokenizer.setReader(normalized);

  final String[] expectedTerms = { "c:", "c:/a", "c:/a/b", "c:/a/b/c" };
  final int[] expectedStartOffsets = { 0, 0, 0, 0 };
  final int[] expectedEndOffsets = { 2, 4, 6, 8 };
  // Only the first token advances the position; the rest are stacked on it.
  final int[] expectedPosIncrements = { 1, 0, 0, 0 };
  assertTokenStreamContents(
      tokenizer,
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      expectedPosIncrements,
      path.length());
}
Aggregations