Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project jackrabbit-oak by apache.
From the class DefaultAnalyzersConfigurationTest, method setUp:
@Before
public void setUp() throws Exception {
    // Exact path match: KeywordTokenizer emits the whole input as one token.
    this.exactPathAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // Parent-path indexing: the full path is indexed as a single token.
    this.parentPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // Parent-path searching: rewrite a path to its parent path by reversing
    // it, stripping the first (originally last) segment, and reversing back.
    this.parentPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Direct-children indexing: reduce each path to its parent path; the
    // LengthFilter drops the one-character root path "/".
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new LengthFilter(filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Direct-children searching: the query path is used as-is.
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // All-children indexing: PathHierarchyTokenizer emits every ancestor
    // prefix of the path; the capture-group filter adds extra tokens at the
    // same position, and RemoveDuplicatesTokenFilter drops the duplicates.
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new PathHierarchyTokenizer();
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false, Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // All-children searching: the query path is used as-is.
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
}
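To see what these chains produce, a short usage sketch (not part of the test; the field name "path" and the sample input are placeholders) can print the tokens an analyzer emits:

    // Minimal usage sketch: print the tokens emitted for a sample path.
    // For this KeywordTokenizer-based chain the single output token is the
    // parent path, "/content/site".
    try (TokenStream ts = parentPathSearchingAnalyzer.tokenStream("path", "/content/site/page")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();
    }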
Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project Anserini by castorini.
From the class LexicalLshAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new FeatureVectorsTokenizer();
    // Project-specific filters: truncate feature values to the configured
    // number of decimals and encode each feature's position.
    TokenFilter truncate = new LexicalLshTruncateTokenFilter(source, decimals);
    TokenFilter featurePos = new LexicalLshFeaturePositionTokenFilter(truncate);
    TokenStream filter;
    if (min > 1) {
        // Build shingles of size [min, max] before MinHashing.
        ShingleFilter shingleFilter = new ShingleFilter(featurePos, min, max);
        shingleFilter.setTokenSeparator(" ");
        shingleFilter.setOutputUnigrams(false);
        shingleFilter.setOutputUnigramsIfNoShingles(false);
        filter = new MinHashFilter(shingleFilter, hashCount, bucketCount, hashSetSize, bucketCount > 1);
    } else {
        filter = new MinHashFilter(featurePos, hashCount, bucketCount, hashSetSize, bucketCount > 1);
    }
    // MinHash can emit the same hash value more than once; deduplicate.
    return new TokenStreamComponents(source, new RemoveDuplicatesTokenFilter(filter));
}
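The Anserini-specific tokenizer and filters are not shown in this excerpt, but the same shingle -> MinHash -> dedup pipeline can be sketched with stock Lucene classes; all parameter values below are illustrative, not Anserini's:

    // Hedged sketch of the pipeline using only stock Lucene components.
    Analyzer sketch = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new WhitespaceTokenizer();
            // Word shingles of 2 to 3 tokens, no unigrams.
            ShingleFilter shingles = new ShingleFilter(source, 2, 3);
            shingles.setTokenSeparator(" ");
            shingles.setOutputUnigrams(false);
            // 1 hash function, 512 buckets, 1 value per bucket, no rotation.
            TokenStream filter = new MinHashFilter(shingles, 1, 512, 1, false);
            return new TokenStreamComponents(source, new RemoveDuplicatesTokenFilter(filter));
        }
    };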
Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project lucene-skos by behas.
From the class SKOSAnalyzer, method createComponents:
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    if (expansionType.equals(ExpansionType.URI)) {
        final KeywordTokenizer src = new KeywordTokenizer();
        TokenStream tok = new SKOSURIFilter(src, skosEngine, new StandardAnalyzer(), types);
        tok = new LowerCaseFilter(tok);
        return new TokenStreamComponents(src, tok);
    } else {
        final StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(maxTokenLength);
        // StandardFilter provides the classic StandardTokenizer post-processing.
        TokenStream tok = new StandardFilter(src);
        tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(), bufferSize, types);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, stopwords);
        // SKOS label expansion can inject the same term twice at one position.
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}
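RemoveDuplicatesTokenFilter drops tokens with identical term text at the same position, which is exactly what synonym-style expansion (here, SKOS label expansion) can produce. The same idiom appears in Lucene's keyword-repeat/stemming pattern; a minimal sketch with stock classes:

    // KeywordRepeatFilter emits each token twice at the same position, the
    // stemmer changes one copy (the non-keyword one), and
    // RemoveDuplicatesTokenFilter drops the copies the stemmer left unchanged.
    Tokenizer src = new WhitespaceTokenizer();
    TokenStream tok = new KeywordRepeatFilter(src);
    tok = new PorterStemFilter(tok);
    tok = new RemoveDuplicatesTokenFilter(tok);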
Use of org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter in project crate by crate.
From the class MultiplexerTokenFilterFactory, method getChainAwareTokenFilterFactory:
@Override
public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer,
                                                          List<CharFilterFactory> charFilters,
                                                          List<TokenFilterFactory> previousTokenFilters,
                                                          Function<String, TokenFilterFactory> allFilters) {
    List<TokenFilterFactory> filters = new ArrayList<>();
    if (preserveOriginal) {
        filters.add(IDENTITY_FILTER);
    }
    for (String filter : filterNames) {
        String[] parts = Strings.tokenizeToStringArray(filter, ",");
        if (parts.length == 1) {
            TokenFilterFactory factory = resolveFilterFactory(allFilters, parts[0]);
            factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, previousTokenFilters, allFilters);
            filters.add(factory);
        } else {
            // A comma-separated entry is a mini filter chain: resolve each
            // sub-filter against the chain built so far.
            List<TokenFilterFactory> existingChain = new ArrayList<>(previousTokenFilters);
            List<TokenFilterFactory> chain = new ArrayList<>();
            for (String subfilter : parts) {
                TokenFilterFactory factory = resolveFilterFactory(allFilters, subfilter);
                factory = factory.getChainAwareTokenFilterFactory(tokenizer, charFilters, existingChain, allFilters);
                chain.add(factory);
                existingChain.add(factory);
            }
            filters.add(chainFilters(filter, chain));
        }
    }
    return new TokenFilterFactory() {
        @Override
        public String name() {
            return MultiplexerTokenFilterFactory.this.name();
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            List<Function<TokenStream, TokenStream>> functions = new ArrayList<>();
            for (TokenFilterFactory tff : filters) {
                functions.add(tff::create);
            }
            // Each token is run through every sub-chain; duplicates produced
            // by different chains at the same position are removed.
            return new RemoveDuplicatesTokenFilter(new MultiplexTokenFilter(tokenStream, functions));
        }

        @Override
        public TokenFilterFactory getSynonymFilter() {
            return IDENTITY_FILTER;
        }
    };
}
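The chainFilters helper called above is not shown in this excerpt. A plausible reconstruction, consistent with how it is used (the signature and body are assumptions, not the project's verified code), folds the list of factories into a single factory that applies them in order:

    // Hedged reconstruction of chainFilters: wrap a list of factories into
    // one factory whose create() applies each sub-filter in sequence.
    private static TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return name;
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                for (TokenFilterFactory tff : filters) {
                    tokenStream = tff.create(tokenStream);
                }
                return tokenStream;
            }
        };
    }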