
Example 1 with RemoveDuplicatesTokenFilter

Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project jackrabbit-oak by apache.

From class DefaultAnalyzersConfigurationTest, method setUp:

@Before
public void setUp() throws Exception {
    // exact path matching: the whole path is one keyword token
    this.exactPathAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // parent-path indexing: index the node's own path verbatim as one keyword token
    this.parentPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // parent-path searching: reverse the path, strip the (originally last) segment,
    // and reverse back, so "/a/b/c" is searched as its parent "/a/b"
    this.parentPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // direct-children indexing: rewrite a path to its parent ("/a/b/c" -> "/a/b"),
    // so every node is indexed under the path of its parent
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new LengthFilter(filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // direct-children searching: the parent path is queried verbatim as one keyword token
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // all-children indexing: emit every ancestor prefix of the path (plus the root "/")
    // and remove the duplicate tokens the capture filter produces
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new PathHierarchyTokenizer();
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false, Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // all-children searching: the ancestor path is queried verbatim as one keyword token
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
}
Also used: TokenStream (org.apache.lucene.analysis), Tokenizer (org.apache.lucene.analysis), Analyzer (org.apache.lucene.analysis), KeywordTokenizer (org.apache.lucene.analysis.core), PathHierarchyTokenizer (org.apache.lucene.analysis.path), RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous), LengthFilter (org.apache.lucene.analysis.miscellaneous), PatternCaptureGroupTokenFilter (org.apache.lucene.analysis.pattern), PatternReplaceFilter (org.apache.lucene.analysis.pattern), ReverseStringFilter (org.apache.lucene.analysis.reverse), Before (org.junit)
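To see what these chains emit, here is a minimal sketch (not part of the original test; the helper name printTokens and the field name "path" are illustrative) that runs a sample path through one of the analyzers and prints the resulting tokens:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Prints every token the analyzer produces for the given text.
static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("path", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
    }
}

For example, printTokens(parentPathSearchingAnalyzer, "/a/b/c") should print the single token "/a/b".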

Example 2 with RemoveDuplicatesTokenFilter

Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project Anserini by castorini.

From class LexicalLshAnalyzer, method createComponents:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new FeatureVectorsTokenizer();
    TokenFilter truncate = new LexicalLshTruncateTokenFilter(source, decimals);
    TokenFilter featurePos = new LexicalLshFeaturePositionTokenFilter(truncate);
    TokenStream filter;
    // with shingling enabled, min-hash word shingles of size [min, max];
    // otherwise min-hash the individual feature tokens
    if (min > 1) {
        ShingleFilter shingleFilter = new ShingleFilter(featurePos, min, max);
        shingleFilter.setTokenSeparator(" ");
        shingleFilter.setOutputUnigrams(false);
        shingleFilter.setOutputUnigramsIfNoShingles(false);
        filter = new MinHashFilter(shingleFilter, hashCount, bucketCount, hashSetSize, bucketCount > 1);
    } else {
        filter = new MinHashFilter(featurePos, hashCount, bucketCount, hashSetSize, bucketCount > 1);
    }
    // MinHash can emit the same hash token more than once at a position; drop the duplicates
    return new TokenStreamComponents(source, new RemoveDuplicatesTokenFilter(filter));
}
Also used: TokenStream (org.apache.lucene.analysis), TokenFilter (org.apache.lucene.analysis), Tokenizer (org.apache.lucene.analysis), RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous), ShingleFilter (org.apache.lucene.analysis.shingle), MinHashFilter (org.apache.lucene.analysis.minhash), FeatureVectorsTokenizer (io.anserini.ann)
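The Anserini-specific tokenizer and filters are not shown above. A self-contained sketch of the same shape, assuming only stock Lucene classes (WhitespaceTokenizer stands in for FeatureVectorsTokenizer, and the shingle and min-hash parameter values are illustrative):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.minhash.MinHashFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;

Analyzer minHashAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        ShingleFilter shingles = new ShingleFilter(source, 5, 5);
        shingles.setTokenSeparator(" ");
        shingles.setOutputUnigrams(false);
        shingles.setOutputUnigramsIfNoShingles(false);
        // hashCount=1, bucketCount=512, hashSetSize=1; rotate hashes into empty buckets
        TokenStream filter = new MinHashFilter(shingles, 1, 512, 1, true);
        return new TokenStreamComponents(source, new RemoveDuplicatesTokenFilter(filter));
    }
};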

Example 3 with RemoveDuplicatesTokenFilter

Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project lucene-skos by behas.

From class SKOSAnalyzer, method createComponents:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    if (expansionType.equals(ExpansionType.URI)) {
        final KeywordTokenizer src = new KeywordTokenizer();
        TokenStream tok = new SKOSURIFilter(src, skosEngine, new StandardAnalyzer(), types);
        tok = new LowerCaseFilter(tok);
        return new TokenStreamComponents(src, tok);
    } else {
        final StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(maxTokenLength);
        TokenStream tok = new StandardFilter(src);
        // up to this point the chain reproduces the classic StandardAnalyzer setup;
        // StandardFilter takes care of that for us
        tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(), bufferSize, types);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, stopwords);
        // label expansion can inject the same term more than once at a position; remove duplicates
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(src, tok) {

            @Override
            protected void setReader(final Reader reader) throws IOException {
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}
Also used: TokenStream (org.apache.lucene.analysis), KeywordTokenizer (org.apache.lucene.analysis.core), LowerCaseFilter (org.apache.lucene.analysis.core), StopFilter (org.apache.lucene.analysis.core), RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous), StandardTokenizer (org.apache.lucene.analysis.standard), StandardAnalyzer (org.apache.lucene.analysis.standard), StandardFilter (org.apache.lucene.analysis.standard), Reader (java.io)
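The same dedup-after-expansion pattern shows up in plain Lucene with KeywordRepeatFilter: every token is emitted twice, a stemmer rewrites the second copy, and RemoveDuplicatesTokenFilter drops the copy whenever stemming left it unchanged. A minimal sketch (not from the SKOS source; the analyzer name is illustrative):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;

Analyzer stemKeepingOriginal = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        TokenStream tok = new LowerCaseFilter(source);
        // emit every token twice; the first copy is keyword-marked so the stemmer skips it
        tok = new KeywordRepeatFilter(tok);
        tok = new PorterStemFilter(tok);
        // "running" -> {"running", "run"}, but "run" -> {"run", "run"} -> {"run"}
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(source, tok);
    }
};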

Example 4 with RemoveDuplicatesTokenFilter

Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project crate by crate.

From class MultiplexerTokenFilterFactory, method getChainAwareTokenFilterFactory:

@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters, List<TokenFilterFactory> previousTokenFilters, Function<String, TokenFilterFactory> allFilters) {
    List<TokenFilterFactory> filters = new ArrayList<>();
    if (preserveOriginal) {
        filters.add(IDENTITY_FILTER);
    }
    // each configured entry is either a single filter name or a comma-separated sub-chain
    for (String filter : filterNames) {
        String[] parts = Strings.tokenizeToStringArray(filter, ",");
        if (parts.length == 1) {
            TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
            factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
            filters.add(factory);
        } else {
            List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
            List<TokenFilterFactory> chain = new ArrayList<>();
            for (String subfilter : parts) {
                TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
                factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
                chain.add(factory);
                existingChain.add(factory);
            }
            filters.add(chainFilters(filter, chain));
        }
    }
    return new TokenFilterFactory() {

        @Override
        public String name() {
            return MultiplexerTokenFilterFactory.this.name();
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
            for (TokenFilterFactory tff : filters) {
                functions.add(tff::create);
            }
            // parallel chains may emit identical terms at the same position; drop the duplicates
            return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
        }

        @Override
        public TokenFilterFactory getSynonymFilter() {
            return IDENTITY_FILTER;
        }
    };
}
Also used: TokenStream (org.apache.lucene.analysis), RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous), TokenFilterFactory (org.elasticsearch.index.analysis), AbstractTokenFilterFactory (org.elasticsearch.index.analysis), Function (java.util.function), ArrayList (java.util)
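RemoveDuplicatesTokenFilter only drops a token when an identical term has already been seen at the same position, which is exactly what parallel multiplexer chains can produce. A minimal sketch of that behavior, assuming Lucene's test-framework CannedTokenStream and Token classes (the terms and offsets are illustrative):

import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

Token first = new Token("wifi", 0, 4);
Token duplicate = new Token("wifi", 0, 4);
duplicate.setPositionIncrement(0);           // same position, same term: dropped
Token stacked = new Token("wireless", 0, 4);
stacked.setPositionIncrement(0);             // same position, different term: kept

TokenStream ts = new RemoveDuplicatesTokenFilter(
        new CannedTokenStream(first, duplicate, stacked));
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    System.out.println(term);                // prints "wifi" then "wireless"
}
ts.end();
ts.close();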

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream)4 RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter)4 Tokenizer (org.apache.lucene.analysis.Tokenizer)2 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)2 FeatureVectorsTokenizer (io.anserini.ann.FeatureVectorsTokenizer)1 Reader (java.io.Reader)1 ArrayList (java.util.ArrayList)1 Function (java.util.function.Function)1 Analyzer (org.apache.lucene.analysis.Analyzer)1 TokenFilter (org.apache.lucene.analysis.TokenFilter)1 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)1 StopFilter (org.apache.lucene.analysis.core.StopFilter)1 MinHashFilter (org.apache.lucene.analysis.minhash.MinHashFilter)1 LengthFilter (org.apache.lucene.analysis.miscellaneous.LengthFilter)1 PathHierarchyTokenizer (org.apache.lucene.analysis.path.PathHierarchyTokenizer)1 PatternCaptureGroupTokenFilter (org.apache.lucene.analysis.pattern.PatternCaptureGroupTokenFilter)1 PatternReplaceFilter (org.apache.lucene.analysis.pattern.PatternReplaceFilter)1 ReverseStringFilter (org.apache.lucene.analysis.reverse.ReverseStringFilter)1 ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter)1 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)1