Usage example of org.apache.lucene.analysis.CharFilter from the lucene-solr project (Apache):
class TestPatternReplaceCharFilter, method checkOutput.
/**
 * Runs {@code input} through a {@link PatternReplaceCharFilter} built from {@code pattern} and
 * {@code replacement}, then verifies both the filtered text and the offset correction.
 *
 * <p>The "index-matched" string is built by mapping each output position back through
 * {@link CharFilter#correctOffset(int)} into the original input: a corrected offset of a
 * replaced region yields the original character, and a negative offset is rendered as "-".
 *
 * @param input                      raw text fed to the filter
 * @param pattern                    regex handed to {@code pattern(String)}
 * @param replacement                replacement string (may contain group references)
 * @param expectedOutput             expected filtered text
 * @param expectedIndexMatchedOutput expected per-character back-mapping of output offsets
 * @throws IOException if the underlying reader fails
 */
private void checkOutput(String input, String pattern, String replacement, String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
  CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement, new StringReader(input));
  StringBuilder output = new StringBuilder();
  // Reader.read() signals EOF with -1; testing ">= 0" (not "> 0") keeps a legitimate
  // NUL character (code point 0) from terminating the loop early.
  for (int chr = cs.read(); chr >= 0; chr = cs.read()) {
    output.append((char) chr);
  }
  StringBuilder indexMatched = new StringBuilder();
  for (int i = 0; i < output.length(); i++) {
    indexMatched.append((cs.correctOffset(i) < 0 ? "-" : input.charAt(cs.correctOffset(i))));
  }
  boolean outputGood = expectedOutput.equals(output.toString());
  boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());
  // Dump diagnostics only on failure; the former "|| false" debug toggle was dead code.
  if (!outputGood || !indexMatchedGood) {
    System.out.println("Pattern : " + pattern);
    System.out.println("Replac. : " + replacement);
    System.out.println("Input   : " + input);
    System.out.println("Output  : " + output);
    System.out.println("Expected: " + expectedOutput);
    System.out.println("Output/i: " + indexMatched);
    System.out.println("Expected: " + expectedIndexMatchedOutput);
    System.out.println();
  }
  assertTrue("Output doesn't match.", outputGood);
  assertTrue("Index-matched output doesn't match.", indexMatchedGood);
}
Usage example of org.apache.lucene.analysis.CharFilter from the lucene-solr project (Apache):
class TestPatternReplaceCharFilter, method test1block2matchLonger.
// Offset diagram (positions 0-7 of the input block):
//   01234567
//    a a      <- input
//    aa aa    <- after replacement
/**
 * Replaces each "a" with the longer "aa" and checks that token offsets still
 * point at the ORIGINAL (shorter) match positions in the input.
 */
public void test1block2matchLonger() throws IOException {
  final String BLOCK = " a a";
  CharFilter filtered = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK));
  TokenStream tokens = whitespaceMockTokenizer(filtered);
  String[] expectedTerms = { "aa", "aa" };
  int[] expectedStarts = { 1, 4 };
  int[] expectedEnds = { 2, 5 };
  assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
Usage example of org.apache.lucene.analysis.CharFilter from the lucene-solr project (Apache):
class TestPatternReplaceCharFilter, method testNothingChange.
// Offset diagram (tens digit on the first row):
//             1111
//   01234567890123
//   this is test.
/**
 * Applies a pattern that never matches the input, so the filter must pass the
 * text through untouched and leave every token offset exactly as-is.
 */
public void testNothingChange() throws IOException {
  final String BLOCK = "this is test.";
  CharFilter unchanged = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3", new StringReader(BLOCK));
  TokenStream tokens = whitespaceMockTokenizer(unchanged);
  String[] expectedTerms = { "this", "is", "test." };
  int[] expectedStarts = { 0, 5, 8 };
  int[] expectedEnds = { 4, 7, 13 };
  assertTokenStreamContents(tokens, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
Usage example of org.apache.lucene.analysis.CharFilter from the lucene-solr project (Apache):
class TestSimplePatternSplitTokenizer, method testOffsetCorrection.
/**
 * Verifies that SimplePatternSplitTokenizer reports offsets corrected back through a
 * preceding MappingCharFilter, i.e. offsets refer to the ORIGINAL (pre-mapping) input.
 *
 * <p>NOTE(review): the two "ü" literals below look identical here, but in the upstream
 * test one side is the decomposed form (u + combining diaeresis) — the distinction may
 * have been lost to Unicode normalization during extraction; confirm against upstream
 * before relying on this mapping. Literals are kept byte-identical on purpose.
 */
public void testOffsetCorrection() throws Exception {
  final String INPUT = "Günther Günther is here";
  // Build the MappingCharFilter's normalization map directly via the builder.
  // (A leftover, never-used List<String> of textual mapping rules was removed.)
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("ü", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
  // Split on the literal "Günther"; the remaining fragments must carry
  // offsets that point into the original INPUT, not the mapped stream.
  Tokenizer stream = new SimplePatternSplitTokenizer("Günther");
  stream.setReader(charStream);
  assertTokenStreamContents(stream, new String[] { " ", " is here" }, new int[] { 12, 25 }, new int[] { 13, 33 }, INPUT.length());
}
Usage example of org.apache.lucene.analysis.CharFilter from the lucene-solr project (Apache):
class TestPatternTokenizer, method testOffsetCorrection.
/**
 * Verifies that PatternTokenizer reports offsets corrected back through a preceding
 * MappingCharFilter, in both split mode (group -1) and match mode (group 0).
 *
 * <p>NOTE(review): the two "ü" literals below look identical here, but in the upstream
 * test one side is the decomposed form (u + combining diaeresis) — the distinction may
 * have been lost to Unicode normalization during extraction; confirm against upstream
 * before relying on this mapping. Literals are kept byte-identical on purpose.
 */
public void testOffsetCorrection() throws Exception {
  final String INPUT = "Günther Günther is here";
  // Build the MappingCharFilter's normalization map directly via the builder.
  // (A leftover, never-used List<String> of textual mapping rules was removed.)
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("ü", "ü");
  NormalizeCharMap normMap = builder.build();
  CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
  // Split mode: group -1 treats the pattern as a delimiter; offsets must map
  // back to the original INPUT, not the mapped stream.
  Tokenizer stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("[,;/\\s]+"), -1);
  stream.setReader(charStream);
  assertTokenStreamContents(stream, new String[] { "Günther", "Günther", "is", "here" }, new int[] { 0, 13, 26, 29 }, new int[] { 12, 25, 28, 33 }, INPUT.length());
  // Match mode: group 0 emits each pattern match as a token; a fresh filter is
  // required because the previous stream was consumed.
  charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
  stream = new PatternTokenizer(newAttributeFactory(), Pattern.compile("Günther"), 0);
  stream.setReader(charStream);
  assertTokenStreamContents(stream, new String[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.length());
}
Aggregations