Search in sources :

Example 11 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestPatternReplaceCharFilter method test2blocksMultiMatches.

//           11111111112222222222333333333
// 012345678901234567890123456789012345678
//   aa bb cc --- aa bb aa. bb aa   bb cc
//   aa##bb cc --- aa##bb aa. bb aa##bb cc
//   aa bb cc --- aa bbbaa. bb aa   b cc
/**
 * Feeds a block containing several "aa", whitespace, "bb" runs through a
 * PatternReplaceCharFilter that joins the two groups with "##", then checks the
 * whitespace-tokenized result against the expected terms and offset arrays.
 */
public void test2blocksMultiMatches() throws IOException {
    final String BLOCK = "  aa bb cc --- aa bb aa. bb aa   bb cc";

    // Expected token texts and the two parallel offset arrays (one entry per token).
    final String[] expectedTerms = { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" };
    final int[] expectedStarts = { 2, 8, 11, 15, 21, 25, 28, 36 };
    final int[] expectedEnds = { 7, 10, 14, 20, 24, 27, 35, 38 };

    CharFilter filter = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)"), "$1##$2", new StringReader(BLOCK));
    TokenStream stream = whitespaceMockTokenizer(filter);
    assertTokenStreamContents(stream, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)

Example 12 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestPatternReplaceCharFilter method test1blockMultiMatches.

//           111111111122222222223333
// 0123456789012345678901234567890123
//   aa bb cc --- aa bb aa   bb   cc
//   aa  bb  cc --- aa bb aa  bb  cc
/**
 * Applies a three-group pattern ("aa", "bb", "cc" separated by whitespace) whose
 * replacement re-joins the groups with two spaces each, then checks the
 * whitespace-tokenized result against the expected terms and offset arrays.
 */
public void test1blockMultiMatches() throws IOException {
    final String BLOCK = "  aa bb cc --- aa bb aa   bb   cc";

    // Expected token texts and the two parallel offset arrays (one entry per token).
    final String[] expectedTerms = { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" };
    final int[] expectedStarts = { 2, 6, 9, 11, 15, 18, 21, 25, 29 };
    final int[] expectedEnds = { 4, 8, 10, 14, 17, 20, 23, 27, 33 };

    CharFilter filter = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1  $2  $3", new StringReader(BLOCK));
    TokenStream stream = whitespaceMockTokenizer(filter);
    assertTokenStreamContents(stream, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)

Example 13 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestPatternReplaceCharFilter method test1block1matchSameLength.

// 012345678
// aa bb cc
// aa#bb#cc
/**
 * Replaces the whitespace between "aa", "bb" and "cc" with '#', so the replacement
 * is exactly as long as the matched text, and expects a single token covering the
 * whole input.
 */
public void test1block1matchSameLength() throws IOException {
    final String BLOCK = "aa bb cc";

    // One token is expected, with offsets 0 and 8.
    final String[] expectedTerms = { "aa#bb#cc" };
    final int[] expectedStarts = { 0 };
    final int[] expectedEnds = { 8 };

    CharFilter filter = new PatternReplaceCharFilter(pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3", new StringReader(BLOCK));
    TokenStream stream = whitespaceMockTokenizer(filter);
    assertTokenStreamContents(stream, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)

Example 14 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestPatternReplaceCharFilter method checkOutput.

/**
 * Runs {@code input} through a PatternReplaceCharFilter built from {@code pattern} and
 * {@code replacement}, then asserts on two things: the filtered text itself, and an
 * "index-matched" string built by mapping every output position back through
 * {@code correctOffset} and taking the input character found there ('-' when the
 * corrected offset is negative).
 *
 * <p>When either comparison fails, all inputs, actual values and expected values are
 * printed to standard output before the assertions fire.
 *
 * @param input the raw text fed to the char filter
 * @param pattern regular expression source handed to the {@code pattern(...)} helper
 * @param replacement replacement string passed to the char filter
 * @param expectedOutput text the filter is expected to produce
 * @param expectedIndexMatchedOutput expected index-matched string, one character per
 *        output character
 * @throws IOException if reading from the char filter fails
 */
private void checkOutput(String input, String pattern, String replacement, String expectedOutput, String expectedIndexMatchedOutput) throws IOException {
    CharFilter cs = new PatternReplaceCharFilter(pattern(pattern), replacement, new StringReader(input));

    // Drain the filter one character at a time. End of stream is reported as -1;
    // 0 is a valid character (U+0000) and must not terminate the loop.
    StringBuilder output = new StringBuilder();
    for (int chr = cs.read(); chr != -1; chr = cs.read()) {
        output.append((char) chr);
    }

    // For every output position, look up the input character its corrected offset points at.
    StringBuilder indexMatched = new StringBuilder();
    for (int i = 0; i < output.length(); i++) {
        int corrected = cs.correctOffset(i);
        if (corrected < 0) {
            indexMatched.append('-');
        } else {
            indexMatched.append(input.charAt(corrected));
        }
    }

    boolean outputGood = expectedOutput.equals(output.toString());
    boolean indexMatchedGood = expectedIndexMatchedOutput.equals(indexMatched.toString());

    // Print the full picture only on failure, so the assertion messages below have context.
    if (!outputGood || !indexMatchedGood) {
        System.out.println("Pattern : " + pattern);
        System.out.println("Replac. : " + replacement);
        System.out.println("Input   : " + input);
        System.out.println("Output  : " + output);
        System.out.println("Expected: " + expectedOutput);
        System.out.println("Output/i: " + indexMatched);
        System.out.println("Expected: " + expectedIndexMatchedOutput);
        System.out.println();
    }
    assertTrue("Output doesn't match.", outputGood);
    assertTrue("Index-matched output doesn't match.", indexMatchedGood);
}
Also used : CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)

Example 15 with CharFilter

use of org.apache.lucene.analysis.CharFilter in project lucene-solr by apache.

the class TestPatternReplaceCharFilter method test1block2matchLonger.

// 01234567
//  a  a
//  aa  aa
/**
 * Doubles every "a" in the block, so each replacement is longer than its match,
 * and checks the two resulting tokens against the expected offset arrays.
 */
public void test1block2matchLonger() throws IOException {
    final String BLOCK = " a  a";

    // Expected token texts and the two parallel offset arrays (one entry per token).
    final String[] expectedTerms = { "aa", "aa" };
    final int[] expectedStarts = { 1, 4 };
    final int[] expectedEnds = { 2, 5 };

    CharFilter filter = new PatternReplaceCharFilter(pattern("a"), "aa", new StringReader(BLOCK));
    TokenStream stream = whitespaceMockTokenizer(filter);
    assertTokenStreamContents(stream, expectedTerms, expectedStarts, expectedEnds, BLOCK.length());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharFilter(org.apache.lucene.analysis.CharFilter) StringReader(java.io.StringReader)

Aggregations

CharFilter (org.apache.lucene.analysis.CharFilter): 41, StringReader (java.io.StringReader): 40, TokenStream (org.apache.lucene.analysis.TokenStream): 26, Tokenizer (org.apache.lucene.analysis.Tokenizer): 10, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 7, MappingCharFilter (org.apache.lucene.analysis.charfilter.MappingCharFilter): 4, Normalizer2 (com.ibm.icu.text.Normalizer2): 3, ArrayList (java.util.ArrayList): 3, NormalizeCharMap (org.apache.lucene.analysis.charfilter.NormalizeCharMap): 3, NGramTokenizer (org.apache.lucene.analysis.ngram.NGramTokenizer): 3, HashMap (java.util.HashMap): 2, Settings (org.elasticsearch.common.settings.Settings): 2, Index (org.elasticsearch.index.Index): 2, AnalysisICUPlugin (org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin): 2, IOException (java.io.IOException): 1, MockCharFilter (org.apache.lucene.analysis.MockCharFilter): 1