Use of org.apache.lucene.analysis.minhash.MinHashFilter in project Anserini by castorini.
From the class LexicalLshAnalyzer, method createComponents.
/**
 * Assembles the token-stream pipeline for this analyzer.
 *
 * <p>The chain is, in order: a {@code FeatureVectorsTokenizer}, a
 * {@code LexicalLshTruncateTokenFilter} configured with {@code decimals}, a
 * {@code LexicalLshFeaturePositionTokenFilter}, an optional {@code ShingleFilter}
 * (only when {@code min > 1}), a {@code MinHashFilter}, and finally a
 * {@code RemoveDuplicatesTokenFilter}.
 *
 * @param fieldName name of the field being analyzed; it is not consulted here, so the
 *     same pipeline is built regardless of the field
 * @return components pairing the tokenizer with the outermost filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new FeatureVectorsTokenizer();
  TokenFilter positioned =
      new LexicalLshFeaturePositionTokenFilter(
          new LexicalLshTruncateTokenFilter(tokenizer, decimals));

  // The hash filter reads the positioned tokens directly unless shingling is requested.
  TokenStream hashInput = positioned;
  if (min > 1) {
    ShingleFilter shingles = new ShingleFilter(positioned, min, max);
    shingles.setTokenSeparator(" ");
    // Emit shingles only: no single tokens, not even as a fallback when no shingle forms.
    shingles.setOutputUnigrams(false);
    shingles.setOutputUnigramsIfNoShingles(false);
    hashInput = shingles;
  }

  // The last MinHashFilter argument is enabled only when more than one bucket is configured.
  boolean multipleBuckets = bucketCount > 1;
  TokenStream hashed =
      new MinHashFilter(hashInput, hashCount, bucketCount, hashSetSize, multipleBuckets);

  return new TokenStreamComponents(tokenizer, new RemoveDuplicatesTokenFilter(hashed));
}
Aggregations