use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project lucene-solr by apache.
the class NGramTokenFilterTest method testInvalidOffsets.
// LUCENE-3642
// An n-gram filter that blindly adds the gram position/length to the start offset can
// produce offsets that are out of bounds with respect to the original text when an
// earlier filter has lengthened the term (here ASCIIFoldingFilter turns æ into ae,
// so the 11-character input becomes a 12-character term).
// In that situation the filter is expected to behave like WDF and keep the offsets of
// the incoming token: every bigram below must report start offset 0 and end offset 11.
public void testInvalidOffsets() throws Exception {
    // try-with-resources guarantees the analyzer is closed even when the assertion
    // below fails; previously a failing assertion skipped analyzer.close().
    try (Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Whitespace tokenizer (no lower-casing) -> ASCII folding -> 2-grams.
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
        }
    }) {
        assertAnalyzesTo(analyzer, "mosfellsbær",
            // expected bigrams of the folded term
            new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
            // expected start offsets: all point at the start of the original token
            new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
            // expected end offsets: all point at the end of the original token
            new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
            // expected position increments: only the first gram advances the position
            new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
    }
}
use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project vertigo by KleeGroup.
the class DefaultAnalyzer method createComponents.
/**
 * Builds the analysis chain used for the given field.
 *
 * The chain is a StandardTokenizer followed, in this order, by an ElisionFilter
 * (using LuceneConstants.ELISION_ARTICLES, matched case-insensitively), a StopFilter
 * (using this analyzer's stopWords), an ASCIIFoldingFilter and a LowerCaseFilter.
 *
 * @param fieldName name of the field being analyzed (not used by this implementation)
 * @return the tokenizer together with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    // Token source.
    final Tokenizer tokenizer = new StandardTokenizer();

    // Strip elided articles; the 'true' flag makes the set ignore case.
    final CharArraySet elisions =
        new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    final TokenStream withoutElisions = new ElisionFilter(tokenizer, elisions);

    // Drop stop words (articles, adjectives, ...).
    final TokenStream withoutStopWords = new StopFilter(withoutElisions, stopWords);

    // Remove accents.
    final TokenStream asciiFolded = new ASCIIFoldingFilter(withoutStopWords);

    // Finally convert everything to lower case.
    final TokenStream lowerCased = new LowerCaseFilter(asciiFolded);

    return new TokenStreamComponents(tokenizer, lowerCased);
}
use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project neo4j by neo4j.
the class StandardFoldingAnalyzer method createComponents.
/**
 * Builds the analysis chain: a StandardTokenizer limited to DEFAULT_MAX_TOKEN_LENGTH,
 * followed by ASCII folding, lower-casing and stop-word removal (using stopwords).
 *
 * @param fieldName name of the field being analyzed (not used by this implementation)
 * @return the tokenizer together with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    // Fold to ASCII first, then lower-case, then filter stop words.
    TokenStream folded = new ASCIIFoldingFilter(tokenizer);
    TokenStream lowerCased = new LowerCaseFilter(folded);
    TokenStream withoutStopWords = new StopFilter(lowerCased, stopwords);

    return new TokenStreamComponents(tokenizer, withoutStopWords);
}
use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project crate by crate.
the class FingerprintAnalyzer method createComponents.
/**
 * Builds the fingerprint analysis chain: StandardTokenizer, then lower-casing,
 * ASCII folding, stop-word removal (using stopWords) and finally a FingerprintFilter
 * configured with maxOutputSize and separator.
 *
 * @param s name of the field being analyzed (not used by this implementation)
 * @return the tokenizer together with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer tokenizer = new StandardTokenizer();

    TokenStream lowerCased = new LowerCaseFilter(tokenizer);
    // Second argument is passed as false, exactly as before
    // (in Lucene's API this is the preserveOriginal flag).
    TokenStream folded = new ASCIIFoldingFilter(lowerCased, false);
    TokenStream withoutStopWords = new StopFilter(folded, stopWords);
    TokenStream fingerprint = new FingerprintFilter(withoutStopWords, maxOutputSize, separator);

    return new TokenStreamComponents(tokenizer, fingerprint);
}
use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project cogcomp-nlp by CogComp.
the class ASCIIEnglishAnalyzer method createComponents.
/**
 * Builds the English analysis chain with ASCII folding. In order: StandardTokenizer,
 * StandardFilter, ASCIIFoldingFilter, EnglishPossessiveFilter, WordDelimiterFilter
 * (configured with WordDelimiterFilter.ALPHA and no protected-words set),
 * LowerCaseFilter, StopFilter (EnglishAnalyzer's default stop set) and PorterStemFilter.
 *
 * @param fieldName name of the field being analyzed (not used by this implementation)
 * @return the tokenizer together with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();

    final TokenStream standard = new StandardFilter(tokenizer);
    final TokenStream folded = new ASCIIFoldingFilter(standard);
    final TokenStream withoutPossessives = new EnglishPossessiveFilter(folded);
    final TokenStream delimited =
        new WordDelimiterFilter(withoutPossessives, WordDelimiterFilter.ALPHA, null);
    final TokenStream lowerCased = new LowerCaseFilter(delimited);
    final TokenStream withoutStopWords =
        new StopFilter(lowerCased, EnglishAnalyzer.getDefaultStopSet());
    final TokenStream stemmed = new PorterStemFilter(withoutStopWords);

    return new TokenStreamComponents(tokenizer, stemmed);
}
Aggregations