Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
From the class TestSimplePatternTokenizer, method testOffsetCorrection.
public void testOffsetCorrection() throws Exception {
  final String INPUT = "G&uuml;nther G&uuml;nther is here";

  // create MappingCharFilter that decodes the HTML entity &uuml; to ü
  List<String> mappingRules = new ArrayList<>();
  mappingRules.add("\"&uuml;\" => \"ü\"");
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("&uuml;", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

  // create SimplePatternTokenizer
  Tokenizer stream = new SimplePatternTokenizer("Günther");
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { "Günther", "Günther" },
      new int[] { 0, 13 },
      new int[] { 12, 25 },
      INPUT.length());
}
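The end offset of 12 for the 7-character token "Günther" is the point of the test: MappingCharFilter remembers where the 6-character entity "&uuml;" collapsed into the single "ü" and corrects offsets back into the raw input. A minimal sketch of that correction, reusing the normMap built above (the single read() call is assumed to return the whole short string; this is not part of the test):

  CharFilter cs = new MappingCharFilter(normMap, new StringReader("G&uuml;nther"));
  char[] buf = new char[16];
  int n = cs.read(buf, 0, buf.length);      // n == 7: the filtered text "Günther"
  System.out.println(cs.correctOffset(n));  // prints 12, the end offset in the raw input
  cs.close();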
Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
From the class TestSimplePatternSplitTokenizer, method testOffsetCorrection.
public void testOffsetCorrection() throws Exception {
  final String INPUT = "G&uuml;nther G&uuml;nther is here";

  // create MappingCharFilter that decodes the HTML entity &uuml; to ü
  List<String> mappingRules = new ArrayList<>();
  mappingRules.add("\"&uuml;\" => \"ü\"");
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("&uuml;", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

  // create SimplePatternSplitTokenizer: the pattern matches act as delimiters
  Tokenizer stream = new SimplePatternSplitTokenizer("Günther");
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { " ", " is here" },
      new int[] { 12, 25 },
      new int[] { 13, 33 },
      INPUT.length());
}
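The inversion relative to the previous test is the point: SimplePatternTokenizer emits the pattern matches themselves, while SimplePatternSplitTokenizer treats the matches as delimiters and emits the runs between them, which is why only " " and " is here" survive here. A minimal side-by-side sketch on plain input with no char filter (pattern and input are hypothetical, in the same test-case style):

  Tokenizer keep = new SimplePatternTokenizer("a+");
  keep.setReader(new StringReader("xaay"));
  assertTokenStreamContents(keep, new String[] { "aa" });      // the match itself

  Tokenizer split = new SimplePatternSplitTokenizer("a+");
  split.setReader(new StringReader("xaay"));
  assertTokenStreamContents(split, new String[] { "x", "y" }); // the text around the match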
Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
From the class TestPatternTokenizer, method testOffsetCorrection.
public void testOffsetCorrection() throws Exception {
  final String INPUT = "G&uuml;nther G&uuml;nther is here";

  // create MappingCharFilter that decodes the HTML entity &uuml; to ü
  List<String> mappingRules = new ArrayList<>();
  mappingRules.add("\"&uuml;\" => \"ü\"");
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("&uuml;", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));

  // create PatternTokenizer (group -1: split on whitespace and punctuation)
  Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { "Günther", "Günther", "is", "here" },
      new int[] { 0, 13, 26, 29 },
      new int[] { 12, 25, 28, 33 },
      INPUT.length());

  // group 0: each whole pattern match becomes a token
  charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
  stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
  stream.setReader(charStream);
  assertTokenStreamContents(stream,
      new String[] { "Günther", "Günther" },
      new int[] { 0, 13 },
      new int[] { 12, 25 },
      INPUT.length());
}
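The third constructor argument selects what becomes a token: -1 splits on the pattern (first assertion above), 0 keeps each whole match (second assertion), and a positive N keeps capture group N of each match. A minimal sketch of the group-N case (pattern and input are hypothetical, in the same test-case style):

  Tokenizer t = new PatternTokenizer(newAttributeFactory(), Pattern.compile("'([^']+)'"), 1);
  t.setReader(new StringReader("say 'hello' and 'goodbye'"));
  assertTokenStreamContents(t, new String[] { "hello", "goodbye" }); // group 1 of each match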
Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
From the class TestCJKAnalyzer, method testChangedOffsets.
/** test that offsets are correct when MappingCharFilter is applied before tokenization */
public void testChangedOffsets() throws IOException {
  final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("a", "一二");
  builder.add("b", "二三");
  final NormalizeCharMap norm = builder.build();
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new StandardTokenizer();
      return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      return new MappingCharFilter(norm, reader);
    }
  };

  assertAnalyzesTo(analyzer, "ab",
      new String[] { "一二", "二二", "二三" },
      new int[] { 0, 0, 1 },
      new int[] { 1, 1, 2 });

  // note: offsets are strange since this is how the charfilter maps them...
  // before bigramming, the 4 single-char tokens (一, 二, 二, 三) look like:
  //   start offsets { 0, 0, 1, 1 }
  //   end offsets   { 0, 1, 1, 2 }
  analyzer.close();
}
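The overlap follows from the pre-bigram stream spelled out in the comment: the char filter rewrites "ab" to "一二二三", StandardTokenizer splits that into four single-character tokens, and CJKBigramFilter then pairs adjacent tokens, so the middle bigram "二二" straddles the a/b boundary and shares offset 0 with the first. A sketch of that intermediate stream, reusing norm from above but dropping the bigram filter (the expected values are taken from the test's own comment, not from an assertion in the source):

  Analyzer plain = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      return new TokenStreamComponents(new StandardTokenizer()); // no CJKBigramFilter
    }

    @Override
    protected Reader initReader(String fieldName, Reader reader) {
      return new MappingCharFilter(norm, reader);
    }
  };
  assertAnalyzesTo(plain, "ab",
      new String[] { "一", "二", "二", "三" },
      new int[] { 0, 0, 1, 1 },
      new int[] { 0, 1, 1, 2 });
  plain.close();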
Use of org.apache.lucene.analysis.charfilter.NormalizeCharMap in project lucene-solr by Apache.
From the class UkrainianMorfologikAnalyzer, method initReader.
@Override
protected Reader initReader(String fieldName, Reader reader) {
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  // fold the different apostrophe variants to the ASCII apostrophe
  builder.add("’", "'");
  builder.add("‘", "'");
  builder.add("ʼ", "'");
  builder.add("`", "'");
  builder.add("´", "'");
  // drop ignored characters
  builder.add("\u0301", ""); // combining acute accent
  builder.add("\u00AD", ""); // soft hyphen
  // normalize ghe with upturn to ghe
  builder.add("ґ", "г");
  builder.add("Ґ", "Г");
  NormalizeCharMap normMap = builder.build();
  reader = new MappingCharFilter(normMap, reader);
  return reader;
}
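Because the mapping is installed in initReader, it runs before tokenization, so the tokenizer and the Morfologik stemmer only ever see the normalized forms. A minimal standalone sketch of the folding, using two of the rules above (the sample phrase is illustrative):

  NormalizeCharMap.Builder b = new NormalizeCharMap.Builder();
  b.add("’", "'"); // curly apostrophe -> ASCII apostrophe
  b.add("ґ", "г"); // ghe with upturn -> ghe
  Reader r = new MappingCharFilter(b.build(), new StringReader("п’ять ґанок"));
  StringBuilder out = new StringBuilder();
  for (int c = r.read(); c != -1; c = r.read()) {
    out.append((char) c);
  }
  System.out.println(out); // п'ять ганок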