Search in sources:

Example 1 with StopFilter

use of org.apache.lucene.analysis.core.StopFilter in project Vidyavana by borsosl.

In class HtmlAnalyzer, the method createComponents:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new TransliterationTokenizer();
    TokenStream filter = new StopFilter(tokenizer, new CharArraySet(Arrays.asList("a", "az", "és"), false));
    filter = new TransliterationSynonymFilter(filter);
    return new TokenStreamComponents(tokenizer, filter);
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.core.StopFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 2 with StopFilter

use of org.apache.lucene.analysis.core.StopFilter in project vertigo by KleeGroup.

In class DefaultAnalyzer, the method createComponents:

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    // Base tokenization.
    final Tokenizer tokenizer = new StandardTokenizer();
    // Strip French elisions (articles listed in LuceneConstants), ignoring case.
    final CharArraySet elisions =
            new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream chain = new ElisionFilter(tokenizer, elisions);
    // Remove articles/adjectives from the configured stop-word set.
    chain = new StopFilter(chain, stopWords);
    // Fold accented characters down to their ASCII equivalents.
    chain = new ASCIIFoldingFilter(chain);
    // Lower-case as the final step.
    chain = new LowerCaseFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 3 with StopFilter

use of org.apache.lucene.analysis.core.StopFilter in project neo4j by neo4j.

In class StandardFoldingAnalyzer, the method createComponents:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Standard tokenization with a capped token length, then ASCII folding,
    // lower-casing, and finally stop-word removal (the stop set is presumably
    // already lower-cased/folded — matches run after those filters).
    final StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    TokenStream chain = new ASCIIFoldingFilter(tokenizer);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, stopwords);
    return new TokenStreamComponents(tokenizer, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 4 with StopFilter

use of org.apache.lucene.analysis.core.StopFilter in project lucene-skos by behas.

In class SKOSAnalyzer, the method createComponents:

@Override
protected TokenStreamComponents createComponents(String fileName) {
    // URI expansion: the whole input is one keyword token, expanded through a
    // SKOS URI lookup and then lower-cased.
    if (expansionType.equals(ExpansionType.URI)) {
        final KeywordTokenizer keywordSource = new KeywordTokenizer();
        TokenStream chain =
                new SKOSURIFilter(keywordSource, skosEngine, new StandardAnalyzer(), types);
        chain = new LowerCaseFilter(chain);
        return new TokenStreamComponents(keywordSource, chain);
    }
    // Label expansion: standard tokenization (capped token length), SKOS label
    // synonym expansion, lower-casing, stop-word removal, duplicate removal.
    final StandardTokenizer standardSource = new StandardTokenizer();
    standardSource.setMaxTokenLength(maxTokenLength);
    // StandardFilter restores the classic post-tokenization behavior expected
    // by the label filter.
    TokenStream chain = new StandardFilter(standardSource);
    chain = new SKOSLabelFilter(chain, skosEngine, new StandardAnalyzer(), bufferSize, types);
    chain = new LowerCaseFilter(chain);
    chain = new StopFilter(chain, stopwords);
    chain = new RemoveDuplicatesTokenFilter(chain);
    return new TokenStreamComponents(standardSource, chain) {

        @Override
        protected void setReader(final Reader reader) throws IOException {
            // Re-apply the cap on every reuse, in case maxTokenLength was
            // changed on the analyzer between invocations.
            standardSource.setMaxTokenLength(maxTokenLength);
            super.setReader(reader);
        }
    };
}
Also used : RemoveDuplicatesTokenFilter(org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) Reader(java.io.Reader) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 5 with StopFilter

use of org.apache.lucene.analysis.core.StopFilter in project cogcomp-nlp by CogComp.

In class ASCIIEnglishAnalyzer, the method createComponents:

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // English analysis chain. ASCII folding runs early so accented input is
    // normalized before possessive stripping, word splitting, stopping and
    // stemming.
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream chain = new StandardFilter(tokenizer);
    chain = new ASCIIFoldingFilter(chain);
    // Drop trailing possessive 's.
    chain = new EnglishPossessiveFilter(chain);
    // Split on intra-word delimiters, keeping alphabetic parts.
    chain = new WordDelimiterFilter(chain, WordDelimiterFilter.ALPHA, null);
    chain = new LowerCaseFilter(chain);
    // Standard English stop set, then Porter stemming.
    chain = new StopFilter(chain, EnglishAnalyzer.getDefaultStopSet());
    chain = new PorterStemFilter(chain);
    return new TokenStreamComponents(tokenizer, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream)9 StopFilter (org.apache.lucene.analysis.core.StopFilter)9 Tokenizer (org.apache.lucene.analysis.Tokenizer)6 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)6 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)5 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)4 ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter)4 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)3 CharArraySet (org.apache.lucene.analysis.util.CharArraySet)3 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)2 WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)2 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)2 Reader (java.io.Reader)1 Analyzer (org.apache.lucene.analysis.Analyzer)1 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)1 StopAnalyzer (org.apache.lucene.analysis.core.StopAnalyzer)1 EnglishMinimalStemFilter (org.apache.lucene.analysis.en.EnglishMinimalStemFilter)1 RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter)1 ClassicTokenizer (org.apache.lucene.analysis.standard.ClassicTokenizer)1 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)1