Search in sources:

Example 41 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

From class TestICUFoldingFilter, method testEmptyTerm.

/**
 * Verifies that an empty term passes through {@code ICUFoldingFilter}
 * without error: analyzing "" must yield the single empty token "".
 */
public void testEmptyTerm() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // KeywordTokenizer emits the entire input as one token, so the
            // empty input becomes a single empty term fed to the filter.
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new ICUFoldingFilter(tokenizer));
        }
    };
    checkOneTerm(analyzer, "", "");
    analyzer.close();
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 42 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

From class TestICUTransformFilter, method testOptimizerSurrogate.

/**
 * Checks that wrapping a transliterator in {@code ICUTransformFilter}
 * installs a source-set filter on it, even when the rule's source
 * character is a supplementary-plane code point (a surrogate pair in
 * Java strings).
 */
public void testOptimizerSurrogate() throws Exception {
    // Rule converts CJK UNIFIED IDEOGRAPH-20087 to an 'x'.
    String rules = "\\U00020087 > x;";
    Transliterator custom = Transliterator.createFromRules("test", rules, Transliterator.FORWARD);
    // Freshly created transliterator carries no filter yet.
    assertTrue(custom.getFilter() == null);
    final KeywordTokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(""));
    // Side effect under test: constructing the filter is expected to set a
    // UnicodeSet filter covering the rule's source characters.
    new ICUTransformFilter(tokenizer, custom);
    assertTrue(custom.getFilter().equals(new UnicodeSet("[\\U00020087]")));
}
Also used : StringReader(java.io.StringReader) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) UnicodeSet(com.ibm.icu.text.UnicodeSet) Transliterator(com.ibm.icu.text.Transliterator)

Example 43 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project jackrabbit-oak by apache.

From class DefaultAnalyzersConfigurationTest, method setUp.

@Before
public void setUp() throws Exception {
    // Analyzers that treat the whole path as one exact token share a single
    // factory; only the path-rewriting analyzers need bespoke chains.
    this.exactPathAnalyzer = newKeywordAnalyzer();
    this.parentPathIndexingAnalyzer = newKeywordAnalyzer();
    this.parentPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            // Reverse, strip one "segment/" from the (now leading) end, then
            // reverse back — presumably drops the last path segment; verify
            // against the test's expectations.
            TokenStream stream = new ReverseStringFilter(tokenizer);
            stream = new PatternReplaceFilter(stream, Pattern.compile("[^\\/]+\\/"), "", false);
            stream = new ReverseStringFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            TokenStream stream = new ReverseStringFilter(tokenizer);
            // Drop tokens shorter than 2 chars, then rewrite the reversed
            // path via the two pattern replacements before reversing back.
            stream = new LengthFilter(stream, 2, Integer.MAX_VALUE);
            stream = new PatternReplaceFilter(stream, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            stream = new PatternReplaceFilter(stream, Pattern.compile("(\\/)(.+)"), "$2", false);
            stream = new ReverseStringFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    this.directChildrenPathSearchingAnalyzer = newKeywordAnalyzer();
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Emit one token per path level, capture each "/..." suffix, and
            // deduplicate the resulting token stream.
            Tokenizer tokenizer = new PathHierarchyTokenizer();
            TokenStream stream = new PatternCaptureGroupTokenFilter(tokenizer, false, Pattern.compile("((\\/).*)"));
            stream = new RemoveDuplicatesTokenFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    this.allChildrenPathSearchingAnalyzer = newKeywordAnalyzer();
}

/**
 * Builds an Analyzer that emits the entire field value as a single
 * keyword token, with no further filtering.
 */
private Analyzer newKeywordAnalyzer() {
    return new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new KeywordTokenizer());
        }
    };
}
Also used : RemoveDuplicatesTokenFilter(org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) PathHierarchyTokenizer(org.apache.lucene.analysis.path.PathHierarchyTokenizer) Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) PatternCaptureGroupTokenFilter(org.apache.lucene.analysis.pattern.PatternCaptureGroupTokenFilter) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) PathHierarchyTokenizer(org.apache.lucene.analysis.path.PathHierarchyTokenizer) PatternReplaceFilter(org.apache.lucene.analysis.pattern.PatternReplaceFilter) Before(org.junit.Before)

Example 44 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project stargate-core by tuplejump.

From class CaseInsensitiveKeywordAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Emit the whole field value as one token, lowercased so matching
    // is case-insensitive.
    KeywordTokenizer tokenizer = new KeywordTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
Also used : KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 45 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-skos by behas.

From class SKOSAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fileName) {
    if (!expansionType.equals(ExpansionType.URI)) {
        // Label expansion: standard tokenization, SKOS label expansion,
        // then the usual lowercase/stop/dedup normalization.
        final StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setMaxTokenLength(maxTokenLength);
        TokenStream stream = new StandardFilter(tokenizer);
        // prior to this we get the classic behavior, standardfilter does it for us.
        stream = new SKOSLabelFilter(stream, skosEngine, new StandardAnalyzer(), bufferSize, types);
        stream = new LowerCaseFilter(stream);
        stream = new StopFilter(stream, stopwords);
        stream = new RemoveDuplicatesTokenFilter(stream);
        return new TokenStreamComponents(tokenizer, stream) {

            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the limit on reuse, in case it changed between uses.
                tokenizer.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
    // URI expansion: treat the whole input as one token and expand it
    // through the SKOS engine before lowercasing.
    final KeywordTokenizer tokenizer = new KeywordTokenizer();
    TokenStream stream = new SKOSURIFilter(tokenizer, skosEngine, new StandardAnalyzer(), types);
    stream = new LowerCaseFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
}
Also used : RemoveDuplicatesTokenFilter(org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) Reader(java.io.Reader) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Aggregations

KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)95 Tokenizer (org.apache.lucene.analysis.Tokenizer)86 Analyzer (org.apache.lucene.analysis.Analyzer)75 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)64 TokenStream (org.apache.lucene.analysis.TokenStream)14 StringReader (java.io.StringReader)11 WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer)11 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)4 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)4 Random (java.util.Random)3 CharArraySet (org.apache.lucene.analysis.CharArraySet)3 LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer)3 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)3 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)3 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)3 Transliterator (com.ibm.icu.text.Transliterator)2 UnicodeSet (com.ibm.icu.text.UnicodeSet)2 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)2 LowerCaseTokenizer (org.apache.lucene.analysis.core.LowerCaseTokenizer)2 RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter)2