
Example 71 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by Apache.

Class TestSimplePatternTokenizer, method testEndLookahead.

public void testEndLookahead() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("(ab)+");
    t.setReader(new StringReader("aba"));
    assertTokenStreamContents(t, new String[] { "ab" }, new int[] { 0 }, new int[] { 2 }, 3);
}
Also used: StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer)
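
Outside the test harness, the same tokenizer can be consumed by hand. The following standalone sketch (the class name and main method are ours, not part of the test) walks the reset/incrementToken/end/close contract that assertTokenStreamContents drives internally; note that the trailing "a" in "aba" never becomes a token because the pattern (ab)+ cannot complete once the input ends.

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class SimplePatternTokenizerDemo {

    public static void main(String[] args) throws Exception {
        // Same pattern and input as testEndLookahead above.
        Tokenizer t = new SimplePatternTokenizer("(ab)+");
        CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsets = t.addAttribute(OffsetAttribute.class);
        t.setReader(new StringReader("aba"));
        // reset() is mandatory before the first incrementToken() call
        t.reset();
        while (t.incrementToken()) {
            // prints: ab [0,2)
            System.out.println(term.toString() + " [" + offsets.startOffset() + "," + offsets.endOffset() + ")");
        }
        // end() sets the final offset (3, the full input length)
        t.end();
        t.close();
    }
}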

Example 72 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by Apache.

Class TestSimplePatternTokenizer, method testBasic.

public void testBasic() throws Exception {
    // get stuff between "'"
    String qpattern = "\\'([^\\']+)\\'";
    String[][] tests = {
        // pattern        input                  output
        { ":",      "boo:and:foo",      ": :" },
        { qpattern, "aaa 'bbb' 'ccc'",  "'bbb' 'ccc'" }
    };
    for (String[] test : tests) {
        TokenStream stream = new SimplePatternTokenizer(test[0]);
        ((Tokenizer) stream).setReader(new StringReader(test[1]));
        String out = tsToString(stream);
        assertEquals("pattern: " + test[0] + " with input: " + test[1], test[2], out);
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer)
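
The tsToString helper is defined elsewhere in the test class and does not appear in this excerpt. A minimal reconstruction (the wrapper class is ours) that produces the space-separated strings the "output" column expects could look like this:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class TokenStreamStrings {

    // Drains the stream and joins all emitted terms with single spaces.
    static String tsToString(TokenStream in) throws IOException {
        CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
        StringBuilder out = new StringBuilder();
        in.reset();
        while (in.incrementToken()) {
            if (out.length() > 0) {
                out.append(' ');
            }
            out.append(termAtt.toString());
        }
        in.end();
        in.close();
        return out.toString();
    }
}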

Example 73 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by Apache.

Class TestSimplePatternTokenizer, method testEmptyStringPatternOneMatch.

public void testEmptyStringPatternOneMatch() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("a*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("bbab"));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals("a", termAtt.toString());
    assertFalse(t.incrementToken());
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer)
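
A companion sketch (assumed behavior, not copied from the excerpt): the same "a*" pattern against input containing no 'a' should yield no tokens at all, since the example above shows that SimplePatternTokenizer never emits the zero-length matches the pattern would otherwise allow.

public void testEmptyStringPatternNoMatchSketch() throws Exception {
    Tokenizer t = new SimplePatternTokenizer("a*");
    t.setReader(new StringReader("bbb"));
    t.reset();
    // no 'a' anywhere, so the stream is exhausted immediately
    assertFalse(t.incrementToken());
    t.end();
    t.close();
}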

Example 74 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by Apache.

Class TestSimplePatternTokenizer, method testOffsetCorrection.

public void testOffsetCorrection() throws Exception {
    // the raw input keeps the HTML entities; MappingCharFilter folds "&uuml;" to "ü",
    // and the tokenizer's offsets are corrected back to positions in this original text
    final String INPUT = "G&uuml;nther G&uuml;nther is here";
    // create MappingCharFilter
    List<String> mappingRules = new ArrayList<>();
    mappingRules.add("\"&uuml;\" => \"ü\"");
    NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("&uuml;", "ü");
    NormalizeCharMap normMap = builder.build();
    CharFilter charStream = new MappingCharFilter(normMap, new StringReader(INPUT));
    // create SimplePatternTokenizer
    Tokenizer stream = new SimplePatternTokenizer("Günther");
    stream.setReader(charStream);
    assertTokenStreamContents(stream, new String[] { "Günther", "Günther" }, new int[] { 0, 13 }, new int[] { 12, 25 }, INPUT.length());
}
Also used: MappingCharFilter (org.apache.lucene.analysis.charfilter.MappingCharFilter), CharFilter (org.apache.lucene.analysis.CharFilter), ArrayList (java.util.ArrayList), StringReader (java.io.StringReader), NormalizeCharMap (org.apache.lucene.analysis.charfilter.NormalizeCharMap), Tokenizer (org.apache.lucene.analysis.Tokenizer)
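
A further sketch (the wrapper class is ours, not from the test): the same NormalizeCharMap and MappingCharFilter can be packaged into a reusable Analyzer by overriding initReader, the hook where Lucene applies char filters, so the offset correction verified above also carries through when the analyzer is used for indexing or highlighting.

import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.pattern.SimplePatternTokenizer;

public class GuentherAnalyzerSketch {

    public static Analyzer build() {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("&uuml;", "ü");
        final NormalizeCharMap normMap = builder.build();

        return new Analyzer() {
            @Override
            protected Reader initReader(String fieldName, Reader reader) {
                // char filtering happens before the tokenizer ever sees the text
                return new MappingCharFilter(normMap, reader);
            }

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new SimplePatternTokenizer("Günther");
                return new TokenStreamComponents(tokenizer);
            }
        };
    }
}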

Example 75 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by Apache.

Class TestPortugueseStemFilter, method testEmptyTerm.

public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new PortugueseStemFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
    a.close();
}
Also used: Analyzer (org.apache.lucene.analysis.Analyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer)
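
The checkOneTerm call only exercises the degenerate empty-string input. A standalone sketch (the class name and the sample word are ours) that drives the same KeywordTokenizer + PortugueseStemFilter chain by hand through Analyzer.tokenStream looks like this:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.pt.PortugueseStemFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PortugueseStemDemo {

    public static void main(String[] args) throws IOException {
        // Same chain as testEmptyTerm: the whole input becomes one token,
        // which is then handed to the Portuguese stemmer.
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new KeywordTokenizer();
                return new TokenStreamComponents(tokenizer, new PortugueseStemFilter(tokenizer));
            }
        };
        try (TokenStream ts = a.tokenStream("field", "livros")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // prints the stemmed form of the input word
                System.out.println(term.toString());
            }
            ts.end();
        }
        a.close();
    }
}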

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 573
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 286
Analyzer (org.apache.lucene.analysis.Analyzer): 265
StringReader (java.io.StringReader): 249
TokenStream (org.apache.lucene.analysis.TokenStream): 227
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216
Reader (java.io.Reader): 91
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 67
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 63
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 52
StopFilter (org.apache.lucene.analysis.StopFilter): 48
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 45
CharArraySet (org.apache.lucene.analysis.CharArraySet): 43
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 36
ESTestCase (org.elasticsearch.test.ESTestCase): 30
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 26
HashMap (java.util.HashMap): 23
Random (java.util.Random): 20
TokenFilter (org.apache.lucene.analysis.TokenFilter): 19