Use of org.apache.lucene.analysis.pattern.PatternReplaceFilter in project lucene-solr by apache.
The class TestSuggestSpellingConverter, method testComplicated:
public void testComplicated() throws Exception {
    // lowercases, removes field names, other syntax, collapses runs of whitespace, etc.
    converter.setAnalyzer(new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            TokenStream filter = new PatternReplaceFilter(tokenizer, Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
            filter = new LowerCaseFilter(filter);
            filter = new TrimFilter(filter);
            return new TokenStreamComponents(tokenizer, filter);
        }
    });
    assertConvertsTo("test1 +test2", new String[] { "test1 test2" });
    assertConvertsTo("test~", new String[] { "test" });
    assertConvertsTo("field:test", new String[] { "test" });
    assertConvertsTo("This is a test", new String[] { "this is a test" });
    assertConvertsTo(" This is a test", new String[] { "this is a test" });
    assertConvertsTo("Foo (field:bar) text_hi:हिन्दी ", new String[] { "foo bar हिन्दी" });
}
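The assertConvertsTo helper is not shown in this snippet; any consumer of such an analyzer follows the standard Lucene TokenStream contract. Below is a minimal sketch (my addition, not part of the lucene-solr test) of collecting an analyzer's output terms:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Runs text through an analyzer and collects the emitted terms, following the
// required reset()/incrementToken()/end()/close() protocol.
static List<String> analyze(Analyzer analyzer, String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream("", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(term.toString());
        }
        stream.end();
    }
    return tokens;
}

With the analyzer above, analyze(analyzer, "field:test") would yield the single token "test": the KeywordTokenizer emits the whole input as one token, the PatternReplaceFilter rewrites the "field:" prefix to a space, and the TrimFilter strips it, matching the third assertion.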
Use of org.apache.lucene.analysis.pattern.PatternReplaceFilter in project jackrabbit-oak by apache.
The class DefaultAnalyzersConfigurationTest, method setUp:
@Before
public void setUp() throws Exception {
    this.exactPathAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    this.parentPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    this.parentPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            TokenStream filter = new ReverseStringFilter(source);
            filter = new LengthFilter(filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.directChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new PathHierarchyTokenizer();
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false, Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.allChildrenPathSearchingAnalyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new KeywordTokenizer();
            return new TokenStreamComponents(source);
        }
    };
}
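To make the two non-trivial chains above concrete, the following illustrative assertions (my addition, not part of the test class; analyzeToString is a hypothetical helper that runs one value through an analyzer and returns the single resulting token) show how they transform a path:

// parentPathSearchingAnalyzer: "/a/b/c" is reversed to "c/b/a/", the first
// "[^/]+/" match ("c/") is deleted, and reversing back yields the parent path.
assertEquals("/a/b", analyzeToString(parentPathSearchingAnalyzer, "/a/b/c"));
// directChildrenPathIndexingAnalyzer: "/a/b" is reversed to "b/a/", the leading
// segment ("b/") collapses to "/", the now-leading "/" is dropped, and reversing
// back yields the parent path under which the node is a direct child.
assertEquals("/a", analyzeToString(directChildrenPathIndexingAnalyzer, "/a/b"));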
Use of org.apache.lucene.analysis.pattern.PatternReplaceFilter in project varaha by thedatachef.
The class TokenizeText, method exec:
/**
 * Uses Lucene's StandardAnalyzer and runs the tokens through several Lucene filters:
 * - LengthFilter: keeps only words of length >= minWordSize
 * - ShingleFilter: converts the word stream into an n-gram stream
 * - PatternReplaceFilter: removes the 'filler' character that ShingleFilter
 *   inserts in place of removed stopwords
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0))
        return null;
    TokenStream stream = analyzer.tokenStream(NOFIELD, input.get(0).toString());
    // Keep only words at least minWordSize characters long
    LengthFilter filtered = new LengthFilter(Version.LUCENE_44, stream, minWordSize, Integer.MAX_VALUE);
    DataBag result;
    if (minGramSize == 1 && maxGramSize == 1) {
        result = fillBag(filtered);
    } else {
        ShingleFilter nGramStream = new ShingleFilter(filtered, minGramSize, maxGramSize);
        nGramStream.setOutputUnigrams(outputUnigrams);
        // Rewrite the filler marks that ShingleFilter inserts where stopwords were removed
        PatternReplaceFilter replacer = new PatternReplaceFilter(nGramStream, SHINGLE_FILLER, NOFIELD, true);
        result = fillBag(replacer);
    }
    return result;
}
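The fillBag helper referenced above is not included in this snippet. A plausible sketch of it (an assumption about varaha's implementation, not the actual source, using Pig's org.apache.pig.data.BagFactory and TupleFactory plus Lucene's CharTermAttribute) would drain the stream into a bag of single-field tuples:

// Hypothetical sketch of fillBag: drains a TokenStream into a Pig DataBag,
// one single-field tuple per emitted term.
private DataBag fillBag(TokenStream stream) throws IOException {
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            if (term.length() > 0) {
                bag.add(TupleFactory.getInstance().newTuple(term.toString()));
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
    return bag;
}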