
Example 1 with ASCIIFoldingFilter

Use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project lucene-solr by apache.

From the class NGramTokenFilterTest, method testInvalidOffsets.

// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" }, // expected tokens
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },                                     // start offsets
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },                          // end offsets
        new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });                                    // position increments
    analyzer.close();
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter), Analyzer (org.apache.lucene.analysis.Analyzer), WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), TokenFilter (org.apache.lucene.analysis.TokenFilter)
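
To see those offsets outside the test harness, the same chain can be driven by hand with Lucene's attribute API (CharTermAttribute and OffsetAttribute from org.apache.lucene.analysis.tokenattributes). The sketch below is not part of the Lucene test; the field name "f" is an arbitrary placeholder, and it assumes an analyzer built exactly as above that has not yet been closed.

// Sketch: print each bigram together with its offsets into the original text.
// Every token should report the offsets of the whole word (0..11), as asserted above.
// (reset/incrementToken/end throw IOException, so the enclosing method must declare it.)
try (TokenStream ts = analyzer.tokenStream("f", "mosfellsbær")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
    }
    ts.end();
}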

Example 2 with ASCIIFoldingFilter

Use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project vertigo by KleeGroup.

From the class DefaultAnalyzer, method createComponents.

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    /* initialize the tokenizer */
    final Tokenizer source = new StandardTokenizer();
    // -----
    /* strip elisions (l', d', ...) */
    final CharArraySet elisionSet = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream filter = new ElisionFilter(source, elisionSet);
    /* remove stop words (articles, adjectives) */
    filter = new StopFilter(filter, stopWords);
    /* strip accents */
    filter = new ASCIIFoldingFilter(filter);
    /* convert to lowercase */
    filter = new LowerCaseFilter(filter);
    return new TokenStreamComponents(source, filter);
}
Also used: CharArraySet (org.apache.lucene.analysis.util.CharArraySet), TokenStream (org.apache.lucene.analysis.TokenStream), ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.core.StopFilter), ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)
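
As a rough usage sketch (not taken from the vertigo sources), an instance of the DefaultAnalyzer above can be exercised on a French phrase to check that elisions, accents and case are all normalized. The field name and sample text are arbitrary, and the exact tokens depend on LuceneConstants.ELISION_ARTICLES and the configured stopWords, so nothing specific is asserted here.

// Print the terms that survive the elision / stop-word / accent-folding / lowercasing chain.
// (reset/incrementToken/end throw IOException.)
try (TokenStream ts = analyzer.tokenStream("text", "L'été à Orléans")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString()); // accent-free, lowercased forms, e.g. "ete" for "été"
    }
    ts.end();
}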

Example 3 with ASCIIFoldingFilter

Use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project neo4j by neo4j.

From the class StandardFoldingAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer src = new StandardTokenizer();
    src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    // fold diacritics to their ASCII equivalents, then lowercase and remove stop words
    TokenStream tok = new ASCIIFoldingFilter(src);
    tok = new LowerCaseFilter(tok);
    tok = new StopFilter(tok, stopwords);
    return new TokenStreamComponents(src, tok);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.core.StopFilter), ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter), LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)
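
The same chain can also be assembled declaratively with Lucene's CustomAnalyzer builder (org.apache.lucene.analysis.custom.CustomAnalyzer). The sketch below is an illustrative equivalent, not how Neo4j constructs the analyzer: it assumes DEFAULT_MAX_TOKEN_LENGTH is 255 (the StandardAnalyzer default) and falls back to StopFilterFactory's default English stop set instead of the stopwords field used above.

// Illustrative, declarative rebuild of the chain above; the builder methods and build() throw IOException.
Analyzer foldingAnalyzer = CustomAnalyzer.builder()
        .withTokenizer(StandardTokenizerFactory.class, "maxTokenLength", "255") // assumed token-length limit
        .addTokenFilter(ASCIIFoldingFilterFactory.class)
        .addTokenFilter(LowerCaseFilterFactory.class)
        .addTokenFilter(StopFilterFactory.class) // default English stop set, not the stopwords field above
        .build();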

Example 4 with ASCIIFoldingFilter

Use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project crate by crate.

From the class FingerprintAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = tokenizer;
    stream = new LowerCaseFilter(stream);
    // second argument false: emit only the folded form, do not preserve the original token
    stream = new ASCIIFoldingFilter(stream, false);
    stream = new StopFilter(stream, stopWords);
    // collapse the remaining tokens into a single sorted, de-duplicated fingerprint token
    stream = new FingerprintFilter(stream, maxOutputSize, separator);
    return new TokenStreamComponents(tokenizer, stream);
}
Also used: FingerprintFilter (org.apache.lucene.analysis.miscellaneous.FingerprintFilter), TokenStream (org.apache.lucene.analysis.TokenStream), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.StopFilter), ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
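
FingerprintFilter buffers the whole stream, removes duplicates, sorts the terms and joins them with the separator into a single output token, a pattern commonly used for near-duplicate detection. Below is a self-contained sketch of the same chain with assumed, illustrative settings (maxOutputSize = 255, separator = ' ', no stop words); these are not Crate's actual values.

// Sketch only: concrete stand-ins for Crate's maxOutputSize, separator and stopWords fields.
Analyzer fingerprintAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new StandardTokenizer();
        TokenStream stream = new LowerCaseFilter(tokenizer);
        stream = new ASCIIFoldingFilter(stream, false);          // emit only the folded form
        stream = new StopFilter(stream, CharArraySet.EMPTY_SET); // no stop words in this sketch
        stream = new FingerprintFilter(stream, 255, ' ');        // assumed size limit and separator
        return new TokenStreamComponents(tokenizer, stream);
    }
};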

Example 5 with ASCIIFoldingFilter

Use of org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter in project cogcomp-nlp by CogComp.

From the class ASCIIEnglishAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    // ASCII-fold early so the possessive, word-delimiter, stop-word and stemming
    // filters all operate on accent-free text
    result = new ASCIIFoldingFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter), WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.core.StopFilter), StandardFilter (org.apache.lucene.analysis.standard.StandardFilter), PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter), ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)
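
To put an analyzer like this to work, it is handed to Lucene at index time through IndexWriterConfig. The sketch below uses only standard Lucene indexing classes (FSDirectory, IndexWriter, Document, TextField) plus java.nio.file.Paths; the index path, field name and sample text are placeholders, and it assumes ASCIIEnglishAnalyzer has a no-argument constructor.

// Hedged sketch: index one document with the folding analyzer (standard Lucene API).
// The enclosing method must declare throws IOException.
Analyzer analyzer = new ASCIIEnglishAnalyzer();                  // assumed no-arg constructor
try (Directory dir = FSDirectory.open(Paths.get("/tmp/ascii-index"));
     IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
    Document doc = new Document();
    doc.add(new TextField("body", "Łukasz's café naïveté", Field.Store.YES));
    writer.addDocument(doc);                                     // terms are indexed in folded, stemmed form
}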

Aggregations

ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter): 11
TokenStream (org.apache.lucene.analysis.TokenStream): 10
Tokenizer (org.apache.lucene.analysis.Tokenizer): 9
LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 7
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 7
StopFilter (org.apache.lucene.analysis.StopFilter): 4
StopFilter (org.apache.lucene.analysis.core.StopFilter): 4
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 4
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 3
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 3
ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter): 3
Analyzer (org.apache.lucene.analysis.Analyzer): 2
CharArraySet (org.apache.lucene.analysis.CharArraySet): 2
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 2
EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 2
PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 2
WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 2
ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter): 2
ArrayList (java.util.ArrayList): 1
Collection (java.util.Collection): 1