Use of org.apache.lucene.analysis.miscellaneous.LengthFilter in project jackrabbit-oak by apache.
The class DefaultAnalyzersConfigurationTest, method setUp.
@Before
public void setUp() throws Exception {
    // Emits the whole path as a single token, for exact path matching.
    this.exactPathAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    this.parentPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // Maps a searched path to its parent: reverse the path, strip the first
    // (originally last) name segment, then reverse back.
    this.parentPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    // Indexes each path under its parent path so a direct-children query can
    // match on the parent; the LengthFilter drops tokens shorter than two
    // characters (in practice, the reversed root path "/").
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new LengthFilter(filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    // Emits every ancestor prefix of the path (via PathHierarchyTokenizer) plus
    // the root token, de-duplicated, so an all-children query matches ancestors.
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new PathHierarchyTokenizer();
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false, Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
}
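To make the filter chains above concrete, the small sketch below dumps the tokens an analyzer emits for a sample path. The helper class and method are hypothetical, not part of the jackrabbit-oak test: assuming a recent Lucene, parentPathSearchingAnalyzer should turn "/a/b/c" into the single token "/a/b", while allChildrenPathIndexingAnalyzer should emit every ancestor prefix ("/a", "/a/b", "/a/b/c") plus the root.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical helper (not part of the test above): prints every token an
// analyzer emits for the given path, to make the chains above tangible.
public final class PathAnalyzerDemo {

    static void printTokens(Analyzer analyzer, String path) throws IOException {
        try (TokenStream stream = analyzer.tokenStream("path", path)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();
        }
    }
}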
Use of org.apache.lucene.analysis.miscellaneous.LengthFilter in project varaha by thedatachef.
The class TokenizeText, method exec.
/**
 * Uses Lucene's StandardAnalyzer and runs the tokens through several Lucene filters:
 * - LengthFilter: keeps only words of length >= minWordSize
 * - ShingleFilter: converts the word stream into an n-gram stream
 * - PatternReplaceFilter: removes the 'filler' character that ShingleFilter
 *   inserts in place of removed stopwords
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0))
        return null;
    TokenStream stream = analyzer.tokenStream(NOFIELD, input.get(0).toString());
    // Keep only tokens of at least minWordSize characters
    LengthFilter filtered = new LengthFilter(Version.LUCENE_44, stream, minWordSize, Integer.MAX_VALUE);
    DataBag result;
    if (minGramSize == 1 && maxGramSize == 1) {
        result = fillBag(filtered);
    } else {
        ShingleFilter nGramStream = new ShingleFilter(filtered, minGramSize, maxGramSize);
        nGramStream.setOutputUnigrams(outputUnigrams);
        PatternReplaceFilter replacer = new PatternReplaceFilter(nGramStream, SHINGLE_FILLER, NOFIELD, true);
        result = fillBag(replacer);
    }
    return result;
}
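Outside of Pig, the same LengthFilter -> ShingleFilter -> PatternReplaceFilter chain can be reproduced with plain Lucene classes. The sketch below is hypothetical and uses Lucene 5+ constructors (the snippet above targets Lucene 4.4, where tokenizers also take a Version and a Reader). Shingles that span a word removed by the LengthFilter contain the "_" filler token, which the final filter strips, mirroring the SHINGLE_FILLER replacement above.

import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical standalone demo of the filter chain used by TokenizeText.exec.
public final class ShinglePipelineDemo {

    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("a quick brown fox"));
        // Keep only words of at least 4 characters ("a" and "fox" are dropped)
        TokenStream stream = new LengthFilter(tokenizer, 4, Integer.MAX_VALUE);
        // Emit bigrams; gaps left by dropped words appear as the "_" filler
        ShingleFilter shingles = new ShingleFilter(stream, 2, 2);
        shingles.setOutputUnigrams(false);
        // Strip the filler, as the SHINGLE_FILLER replacement does above
        TokenStream cleaned = new PatternReplaceFilter(shingles, Pattern.compile("_"), "", true);
        CharTermAttribute term = cleaned.addAttribute(CharTermAttribute.class);
        cleaned.reset();
        while (cleaned.incrementToken()) {
            System.out.println("[" + term + "]");
        }
        cleaned.end();
        cleaned.close();
    }
}

In the UDF itself, fillBag (not shown in this snippet) presumably discards the tokens left empty or whitespace-padded by the filler removal.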
Use of org.apache.lucene.analysis.miscellaneous.LengthFilter in project crate by crate.
The class CommonAnalysisPlugin, method getPreConfiguredTokenFilters.
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("common_grams", false, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input -> new DelimitedPayloadTokenFilter(input, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input -> new DelimitedPayloadTokenFilter(input, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("edge_ngram", false, input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    filters.add(PreConfiguredTokenFilter.singleton("elision", true, input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)));
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false, input -> new LengthFilter(input, 0, Integer.MAX_VALUE)));
    filters.add(PreConfiguredTokenFilter.singleton("limit", false, input -> new LimitTokenCountFilter(input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)));
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /*
         * Disable graph analysis on this token stream because it produces
         * shingles of different sizes. Graph analysis on such a stream is
         * useless and dangerous, as it may create too many paths, since
         * shingles of different sizes are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input -> new WordDelimiterFilter(input, WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterFilter.SPLIT_ON_NUMERICS | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
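Each entry above just binds a name (plus a flag controlling use in multi-term queries) to a factory over a plain Lucene TokenStream. Outside of Crate's plugin machinery, the "stop" and "length" entries reduce to ordinary Lucene code. A minimal sketch, assuming Lucene 8+ (where StopFilter lives in lucene-core) and with a hypothetical class name:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical demo: the pre-configured "stop" and "length" filters above,
// applied directly. With min=0 and max=Integer.MAX_VALUE the length filter
// passes every token through, matching the permissive default registered above.
public final class PreConfiguredFiltersDemo {

    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("the answer is forty two"));
        TokenStream stream = new StopFilter(tokenizer, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
        stream = new LengthFilter(stream, 0, Integer.MAX_VALUE);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term);   // prints: answer, forty, two
        }
        stream.end();
        stream.close();
    }
}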