Use of org.apache.lucene.analysis.standard.StandardTokenizer in project elasticsearch by elastic.
Class SnowballAnalyzer, method createComponents.
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link StandardFilter},
 * a {@link LowerCaseFilter}, a {@link StopFilter}, and a {@link SnowballFilter}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream result = tokenizer;
    // remove the possessive 's for english stemmers
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins"))
        result = new EnglishPossessiveFilter(result);
    // Use a special lowercase filter for turkish, the stemmer expects it.
    if (name.equals("Turkish"))
        result = new TurkishLowerCaseFilter(result);
    else
        result = new LowerCaseFilter(result);
    if (stopSet != null)
        result = new StopFilter(result, stopSet);
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
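For illustration, the following is a minimal, self-contained sketch (not part of the Elasticsearch source) that rebuilds the English branch of this chain as an anonymous Analyzer and prints the resulting tokens. It assumes Lucene 7.x package locations for LowerCaseFilter and StopFilter and borrows Lucene's default English stop set; the sample text and expected output are illustrative only.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SnowballChainSketch {
    public static void main(String[] args) throws IOException {
        // Rebuild the "English" branch of the chain shown above.
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new StandardTokenizer();
                TokenStream result = new EnglishPossessiveFilter(tokenizer); // drop possessive 's
                result = new LowerCaseFilter(result);
                result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
                result = new SnowballFilter(result, "English"); // Snowball "English" stemmer
                return new TokenStreamComponents(tokenizer, result);
            }
        };
        try (TokenStream ts = analyzer.tokenStream("body", "The dog's bones were buried")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // e.g. dog, bone, buri
            }
            ts.end();
        }
        analyzer.close();
    }
}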
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project elasticsearch by elastic.
Class StandardHtmlStripAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer src = new StandardTokenizer();
    TokenStream tok = new StandardFilter(src);
    tok = new LowerCaseFilter(tok);
    if (!stopwords.isEmpty()) {
        tok = new StopFilter(tok, stopwords);
    }
    return new TokenStreamComponents(src, tok);
}
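The HTML stripping itself happens in this analyzer's initReader method, which is not part of the snippet above. Below is a hedged sketch (not from the Elasticsearch source) of how such a chain can be paired with HTMLStripCharFilter and driven through Analyzer.tokenStream; it assumes Lucene 7.x, where StandardFilter (a no-op that was later removed) is still available.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class HtmlStripSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer src = new StandardTokenizer();
                TokenStream tok = new StandardFilter(src);
                tok = new LowerCaseFilter(tok);
                return new TokenStreamComponents(src, tok);
            }

            @Override
            protected Reader initReader(String fieldName, Reader reader) {
                // Strip HTML markup before tokenization, analogous to the
                // char filter wired in by StandardHtmlStripAnalyzer.
                return new HTMLStripCharFilter(reader);
            }
        };
        try (TokenStream ts = analyzer.tokenStream("body", "<p>Hello <b>World</b></p>")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // hello, world
            }
            ts.end();
        }
        analyzer.close();
    }
}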
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project elasticsearch by elastic.
Class FingerprintAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = tokenizer;
    stream = new LowerCaseFilter(stream);
    stream = new ASCIIFoldingFilter(stream, false);
    stream = new StopFilter(stream, stopWords);
    stream = new FingerprintFilter(stream, maxOutputSize, separator);
    return new TokenStreamComponents(tokenizer, stream);
}
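A minimal sketch (not from the Elasticsearch source) of how this fingerprint chain behaves end to end. The values of maxOutputSize, separator, and stopWords below are hypothetical stand-ins; Elasticsearch wires them from the analyzer settings. The output is a single token: the lowercased, ASCII-folded, deduplicated, sorted, separator-joined fingerprint of the input.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FingerprintSketch {
    public static void main(String[] args) throws IOException {
        final int maxOutputSize = 255;                          // hypothetical value
        final char separator = ' ';                             // hypothetical value
        final CharArraySet stopWords = CharArraySet.EMPTY_SET;  // hypothetical value

        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new StandardTokenizer();
                TokenStream stream = new LowerCaseFilter(tokenizer);
                stream = new ASCIIFoldingFilter(stream, false);
                stream = new StopFilter(stream, stopWords);
                stream = new FingerprintFilter(stream, maxOutputSize, separator);
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
        try (TokenStream ts = analyzer.tokenStream("f", "Zürich zurich and Zurich again")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // "again and zurich"
            }
            ts.end();
        }
        analyzer.close();
    }
}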
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
Class SoraniAnalyzer, method createComponents.
/**
 * Creates a
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * which tokenizes all the text in the provided {@link Reader}.
 *
 * @return A
 *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link StandardFilter}, {@link SoraniNormalizationFilter},
 *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
 *         {@link SetKeywordMarkerFilter} (if a stem exclusion set is
 *         provided), and {@link SoraniStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new SoraniNormalizationFilter(result);
    result = new LowerCaseFilter(result);
    result = new DecimalDigitFilter(result);
    result = new StopFilter(result, stopwords);
    if (!stemExclusionSet.isEmpty())
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    result = new SoraniStemFilter(result);
    return new TokenStreamComponents(source, result);
}
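Since SoraniAnalyzer ships with Lucene, the chain above can also be exercised directly. A small usage sketch with the default stop set and an empty stem exclusion set; the sample input and the comment on the output are illustrative only.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ckb.SoraniAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SoraniSketch {
    public static void main(String[] args) throws IOException {
        // Default constructor: built-in stop set, no stem exclusions.
        try (Analyzer analyzer = new SoraniAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "پیاوەکان")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // normalized, stemmed form of each token
            }
            ts.end();
        }
    }
}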
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
Class TestCJKAnalyzer, method testChangedOffsets.
/** test that offsets are correct when a MappingCharFilter is applied beforehand */
public void testChangedOffsets() throws IOException {
    final NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
    builder.add("a", "一二");
    builder.add("b", "二三");
    final NormalizeCharMap norm = builder.build();
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new StandardTokenizer();
            return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
        }

        @Override
        protected Reader initReader(String fieldName, Reader reader) {
            return new MappingCharFilter(norm, reader);
        }
    };
    assertAnalyzesTo(analyzer, "ab",
        new String[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 },
        new int[] { 1, 1, 2 });
    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    // { 0, 0, 1, 1 },
    // { 0, 1, 1, 2 }
    analyzer.close();
}
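The same offset behavior can be reproduced outside the test framework by wiring the MappingCharFilter, StandardTokenizer, and CJKBigramFilter together by hand and reading OffsetAttribute. A sketch (not part of the Lucene test suite):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class ChangedOffsetsSketch {
    public static void main(String[] args) throws IOException {
        NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
        builder.add("a", "一二");
        builder.add("b", "二三");
        NormalizeCharMap norm = builder.build();

        // Apply the char filter to the raw input, then tokenize and bigram the result.
        Reader filtered = new MappingCharFilter(norm, new StringReader("ab"));
        Tokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(filtered);
        try (TokenStream ts = new CJKBigramFilter(tokenizer)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Offsets point back into the original two-character input "ab":
                // 一二 -> [0,1), 二二 -> [0,1), 二三 -> [1,2)
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
            }
            ts.end();
        }
    }
}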