Use of org.apache.lucene.analysis.standard.StandardTokenizer in project crate by crate.
The class StandardTokenizerFactory, method create().
@Override
public Tokenizer create() {
    StandardTokenizer tokenizer = new StandardTokenizer();
    // Apply the max token length configured for this factory (Lucene's default is 255).
    tokenizer.setMaxTokenLength(maxTokenLength);
    return tokenizer;
}
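A minimal, self-contained sketch of driving such a tokenizer directly, assuming a Lucene version (5.x or later) where StandardTokenizer has a no-argument constructor; the sample text and the 10-character cap are illustrative:

import java.io.StringReader;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenizerDemo {
    public static void main(String[] args) throws Exception {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setMaxTokenLength(10); // illustrative cap; the Lucene default is 255
        tokenizer.setReader(new StringReader("Lucene StandardTokenizer demonstration"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();                       // required before the first incrementToken()
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // one token per iteration
        }
        tokenizer.end();
        tokenizer.close();
    }
}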
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project Anserini by castorini.
The class DefaultEnglishAnalyzer, method createComponents().
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    TokenStream result = source;
    result = new EnglishPossessiveFilter(result); // strip trailing 's
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, this.stopwords);
    if (!this.stemExclusionSet.isEmpty()) {
        // Mark excluded terms as keywords so the stemmers below leave them intact.
        result = new SetKeywordMarkerFilter(result, this.stemExclusionSet);
    }
    if (stem) {
        if (this.stemmer.compareToIgnoreCase("porter") == 0 || this.stemmer.compareToIgnoreCase("p") == 0) {
            result = new PorterStemFilter(result);
        } else if (this.stemmer.compareToIgnoreCase("krovetz") == 0 || this.stemmer.compareToIgnoreCase("k") == 0) {
            result = new KStemFilter(result);
        }
    }
    return new TokenStreamComponents(source, result);
}
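A hedged usage sketch for exercising any such analyzer; AnalyzerProbe and the field name "contents" are illustrative names, not part of the project above:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerProbe {
    // Prints every token the analyzer emits for the given text.
    public static void printTokens(Analyzer analyzer, String text) throws Exception {
        try (TokenStream ts = analyzer.tokenStream("contents", text)) { // field name is arbitrary here
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();
        }
    }
}

With the Porter chain above, a call like printTokens(analyzer, "The dog's running") should print roughly "dog" and "run": the possessive, lowercase, stop, and stemming filters each take a turn.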
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-skos by behas.
The class SKOSAnalyzer, method createComponents().
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    if (expansionType.equals(ExpansionType.URI)) {
        final KeywordTokenizer src = new KeywordTokenizer(); // emits the whole input as one token
        TokenStream tok = new SKOSURIFilter(src, skosEngine, new StandardAnalyzer(), types);
        tok = new LowerCaseFilter(tok);
        return new TokenStreamComponents(src, tok);
    } else {
        final StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(maxTokenLength);
        // StandardFilter restores the classic StandardTokenizer behavior.
        TokenStream tok = new StandardFilter(src);
        tok = new SKOSLabelFilter(tok, skosEngine, new StandardAnalyzer(), bufferSize, types);
        tok = new LowerCaseFilter(tok);
        tok = new StopFilter(tok, stopwords);
        tok = new RemoveDuplicatesTokenFilter(tok);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Components are cached and reused; re-apply the cap for each new Reader.
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}
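The anonymous TokenStreamComponents subclass exists because an Analyzer caches and reuses its components per thread: maxTokenLength must be re-applied whenever a new Reader is set. A minimal sketch of the same pattern in isolation, written against the same Lucene 5.x-era API as the snippet above (StandardFilter became a no-op in Lucene 7 and was removed in 8; CappedAnalyzer is an illustrative name):

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class CappedAnalyzer extends Analyzer {
    private int maxTokenLength = 255; // same default as StandardAnalyzer

    public void setMaxTokenLength(int length) {
        this.maxTokenLength = length; // takes effect on reuse via the setReader override below
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        final StandardTokenizer src = new StandardTokenizer();
        src.setMaxTokenLength(maxTokenLength);
        TokenStream tok = new LowerCaseFilter(src);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the analyzer's current cap so changes made after the
                // components were built still reach the reused tokenizer.
                src.setMaxTokenLength(maxTokenLength);
                super.setReader(reader);
            }
        };
    }
}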
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project cogcomp-nlp by CogComp.
The class ASCIIEnglishAnalyzer, method createComponents().
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result); // fold accented characters to their ASCII equivalents
    result = new EnglishPossessiveFilter(result);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
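The second argument to WordDelimiterFilter is a bitset of configuration flags. WordDelimiterFilter.ALPHA appears to be a character-type constant (LOWER | UPPER) rather than a configuration flag, though its bit pattern happens to coincide with GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS. A hedged sketch of a more conventional flag combination (the constants are real WordDelimiterFilter flags; the wrapper method is illustrative):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;

public class DelimiterConfig {
    // Splits "Wi-Fi" into "Wi" and "Fi" (word parts), also emitting "WiFi"
    // (catenated), and breaks "PowerShot" into "Power" and "Shot" (case change).
    public static TokenStream wordDelimited(TokenStream in) {
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.CATENATE_WORDS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
        return new WordDelimiterFilter(in, flags, null); // null = no protected-word set
    }
}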
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project cogcomp-nlp by CogComp.
The class MinimalAnalyzer, method createComponents().
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new StopFilter(result, stopwords);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
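The stopwords argument here is a Lucene CharArraySet. A minimal sketch of constructing one to pass into such an analyzer, assuming Lucene 7.x, where CharArraySet lives in org.apache.lucene.analysis (earlier versions keep it in org.apache.lucene.analysis.util):

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;

public class Stopwords {
    // Second argument: true = match stopwords case-insensitively.
    public static final CharArraySet SMALL_SET =
            new CharArraySet(Arrays.asList("the", "a", "an", "of"), true);
}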