
Example 1 with PatternReplaceFilter

Use of org.apache.lucene.analysis.pattern.PatternReplaceFilter in project lucene-solr (by apache).

From class TestSuggestSpellingConverter, method testComplicated:

public void testComplicated() throws Exception {
    // lowercases, removes field names, other syntax, collapses runs of whitespace, etc.
    converter.setAnalyzer(new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
            filter = new LowerCaseFilter(filter);
            filter = new TrimFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        }
    });
    assertConvertsTo("test1 +test2", new String[] { "test1 test2" });
    assertConvertsTo("test~", new String[] { "test" });
    assertConvertsTo("field:test", new String[] { "test" });
    assertConvertsTo("This is a test", new String[] { "this is a test" });
    assertConvertsTo(" This is  a test", new String[] { "this is a test" });
    assertConvertsTo("Foo (field:bar) text_hi:हिन्दी    ", new String[] { "foo bar हिन्दी" });
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), TrimFilter (org.apache.lucene.analysis.miscellaneous.TrimFilter), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), PatternReplaceFilter (org.apache.lucene.analysis.pattern.PatternReplaceFilter), LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
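A note on the final constructor argument above: replaceAll = true rewrites every regex match inside a token, while false rewrites only the first. The following is a minimal standalone sketch of that behavior, assuming the Lucene 5.x-style no-argument tokenizer constructor used above; the class name, sample text and simplified pattern are illustrative only, not taken from the test.

import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PatternReplaceFilterSketch {

    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("field:test  +more"));
        // replaceAll = true: every run of non-letter, non-digit characters inside the
        // single keyword token is collapsed to a space
        TokenStream stream = new PatternReplaceFilter(tokenizer, Pattern.compile("[^\\p{L}\\p{N}]+"), " ", true);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // prints: [field test more]
            System.out.println("[" + term + "]");
        }
        stream.end();
        stream.close();
    }
}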

Example 2 with PatternReplaceFilter

Use of org.apache.lucene.analysis.pattern.PatternReplaceFilter in project jackrabbit-oak (by apache).

From class DefaultAnalyzersConfigurationTest, method setUp:

@Before
public void setUp() throws Exception {
    this.exactPathAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    this.parentPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    this.parentPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
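            // Reverse the whole path, strip the first "<segment>/" (i.e. the original last
            // segment), then reverse back: the single emitted token is the parent of the input path.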
            TokenStream filter = new ReverseStringFilter(source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
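            // Reverse the path, drop tokens shorter than two characters, and use the two
            // pattern replacements to remove what was the last path segment before reversing
            // back, so each node is indexed under its parent path for direct-children lookups.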
            TokenStream filter = new ReverseStringFilter(source);
            filter = new LengthFilter(filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new PathHierarchyTokenizer();
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false, Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
}
Also used: RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter), TokenStream (org.apache.lucene.analysis.TokenStream), PathHierarchyTokenizer (org.apache.lucene.analysis.path.PathHierarchyTokenizer), Analyzer (org.apache.lucene.analysis.Analyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), PatternCaptureGroupTokenFilter (org.apache.lucene.analysis.pattern.PatternCaptureGroupTokenFilter), LengthFilter (org.apache.lucene.analysis.miscellaneous.LengthFilter), ReverseStringFilter (org.apache.lucene.analysis.reverse.ReverseStringFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), PatternReplaceFilter (org.apache.lucene.analysis.pattern.PatternReplaceFilter), Before (org.junit.Before)
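A hedged sketch of how the parentPathSearchingAnalyzer defined above could be exercised, for example from another test method of this class; the field name "path" and the sample path are illustrative assumptions, and CharTermAttribute is org.apache.lucene.analysis.tokenattributes.CharTermAttribute.

try (TokenStream ts = parentPathSearchingAnalyzer.tokenStream("path", "/content/site/page")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // reverse, drop the first "<segment>/", reverse again: the single token
        // emitted is the parent path, "/content/site" for this input
        System.out.println(term.toString());
    }
    ts.end();
}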

Example 3 with PatternReplaceFilter

Use of org.apache.lucene.analysis.pattern.PatternReplaceFilter in project varaha (by thedatachef).

From class TokenizeText, method exec:

/**
 *       Uses Lucene's StandardAnalyzer and runs the tokens through several Lucene filters:
 *       - LengthFilter: Filter individual words to be of length > minWordSize
 *       - ShingleFilter: Converts word stream into n-gram stream
 *       - PatternReplaceFilter: Removes the 'filler' character that ShingleFilter puts in to
 *         replace stopwords
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0))
        return null;
    TokenStream stream = analyzer.tokenStream(NOFIELD, input.get(0).toString());
    // Keep only tokens that are at least minWordSize characters long
    LengthFilter filtered = new LengthFilter(Version.LUCENE_44, stream, minWordSize, Integer.MAX_VALUE);
    DataBag result;
    if (minGramSize == 1 && maxGramSize == 1) {
        result = fillBag(filtered);
    } else {
        ShingleFilter nGramStream = new ShingleFilter(filtered, minGramSize, maxGramSize);
        nGramStream.setOutputUnigrams(outputUnigrams);
        PatternReplaceFilter replacer = new PatternReplaceFilter(nGramStream, SHINGLE_FILLER, NOFIELD, true);
        result = fillBag(replacer);
    }
    return result;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter), LengthFilter (org.apache.lucene.analysis.miscellaneous.LengthFilter), DataBag (org.apache.pig.data.DataBag), PatternReplaceFilter (org.apache.lucene.analysis.pattern.PatternReplaceFilter)
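The fillBag helper is referenced above but not shown on this page. The following is a hypothetical sketch of what such a helper might look like (the actual implementation in varaha may differ): it drains a TokenStream into a Pig DataBag of single-field tuples, using BagFactory and TupleFactory from org.apache.pig.data and CharTermAttribute from org.apache.lucene.analysis.tokenattributes.

// Hypothetical helper, not the project's actual code: collect each emitted term
// into a bag of one-field tuples.
private DataBag fillBag(TokenStream stream) throws IOException {
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (term.length() > 0) {
            bag.add(TupleFactory.getInstance().newTuple(term.toString()));
        }
    }
    stream.end();
    stream.close();
    return bag;
}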

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream): 3
PatternReplaceFilter (org.apache.lucene.analysis.pattern.PatternReplaceFilter): 3
Analyzer (org.apache.lucene.analysis.Analyzer): 2
Tokenizer (org.apache.lucene.analysis.Tokenizer): 2
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 2
LengthFilter (org.apache.lucene.analysis.miscellaneous.LengthFilter): 2
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 1
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 1
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 1
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 1
RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter): 1
TrimFilter (org.apache.lucene.analysis.miscellaneous.TrimFilter): 1
PathHierarchyTokenizer (org.apache.lucene.analysis.path.PathHierarchyTokenizer): 1
PatternCaptureGroupTokenFilter (org.apache.lucene.analysis.pattern.PatternCaptureGroupTokenFilter): 1
ReverseStringFilter (org.apache.lucene.analysis.reverse.ReverseStringFilter): 1
ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter): 1
DataBag (org.apache.pig.data.DataBag): 1
Before (org.junit.Before): 1