Search in sources:

Example 1 with LowerCaseFilter

Use of org.apache.lucene.analysis.core.LowerCaseFilter in project che by eclipse.

The class LuceneSearcher, method makeAnalyzer.

/**
 * Builds the analyzer used for indexing and searching: tokens are split on
 * whitespace and normalized to lower case.
 *
 * @return a new {@link Analyzer} producing lower-cased whitespace tokens
 */
protected Analyzer makeAnalyzer() {
    return new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Whitespace tokenization followed by case folding.
            Tokenizer source = new WhitespaceTokenizer();
            return new TokenStreamComponents(source, new LowerCaseFilter(source));
        }
    };
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 2 with LowerCaseFilter

Use of org.apache.lucene.analysis.core.LowerCaseFilter in project lucene-solr by apache.

The class SynonymGraphFilterFactory, method inform.

/**
 * Resolves the analyzer used to tokenize the synonym file, then parses the
 * file (Solr or WordNet format) into the synonym map.
 *
 * @param loader resource loader used to locate the tokenizer factory,
 *               analyzer, and synonym file
 * @throws IOException if the synonym file cannot be read or parsed
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    // Optional custom tokenizer for analyzing the synonym entries.
    final TokenizerFactory factory;
    if (tokenizerFactory == null) {
        factory = null;
    } else {
        factory = loadTokenizerFactory(loader, tokenizerFactory);
    }
    final Analyzer analyzer;
    if (analyzerName == null) {
        // Default: whitespace tokenization, optionally lower-cased.
        analyzer = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                final Tokenizer source;
                if (factory == null) {
                    source = new WhitespaceTokenizer();
                } else {
                    source = factory.create();
                }
                final TokenStream sink = ignoreCase ? new LowerCaseFilter(source) : source;
                return new TokenStreamComponents(source, sink);
            }
        };
    } else {
        analyzer = loadAnalyzer(loader, analyzerName);
    }
    // Close the analyzer once the synonyms have been loaded.
    try (Analyzer resolved = analyzer) {
        String parserClass = format;
        if (parserClass == null || parserClass.equals("solr")) {
            parserClass = SolrSynonymParser.class.getName();
        } else if (parserClass.equals("wordnet")) {
            parserClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, parserClass, true, resolved);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) ParseException(java.text.ParseException) IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 3 with LowerCaseFilter

Use of org.apache.lucene.analysis.core.LowerCaseFilter in project jackrabbit-oak by apache.

The class OakAnalyzer, method createComponents.

/**
 * Builds the token stream chain: standard tokenization, lower-casing, then
 * word-delimiter splitting (word parts, number parts, possessive stemming,
 * and optionally the original term).
 *
 * @param fieldName the field being analyzed (unused by this chain)
 * @param reader    the source of the text to tokenize
 * @return the tokenizer/filter chain for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
    final StandardTokenizer source = new StandardTokenizer(matchVersion, reader);
    TokenStream chain = new LowerCaseFilter(matchVersion, source);
    // Combined word-delimiter configuration flags.
    final int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
            | WordDelimiterFilter.GENERATE_NUMBER_PARTS
            | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
            | this.INDEX_ORIGINAL_TERM;
    chain = new WordDelimiterFilter(chain, flags, null);
    return new TokenStreamComponents(source, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 4 with LowerCaseFilter

Use of org.apache.lucene.analysis.core.LowerCaseFilter in project Krill by KorAP.

The class TextAnalyzer, method createComponents.

/**
 * Builds the analysis chain for text fields: standard tokenization followed
 * by lower-casing.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 * @return the tokenizer/filter chain for this analyzer
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter) TokenStreamComponents(org.apache.lucene.analysis.Analyzer.TokenStreamComponents)

Example 5 with LowerCaseFilter

Use of org.apache.lucene.analysis.core.LowerCaseFilter in project vertigo by KleeGroup.

The class DefaultAnalyzer, method createComponents.

/**
 * Creates a TokenStream which tokenizes all the text in the provided Reader.
 *
 * <p>The chain is: StandardTokenizer → ElisionFilter (French elided
 * articles) → StopFilter → ASCIIFoldingFilter (accent removal) →
 * LowerCaseFilter.</p>
 *
 * @return A TokenStream built from a StandardTokenizer filtered with
 *         ElisionFilter, StopFilter, ASCIIFoldingFilter and LowerCaseFilter
 */
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    // Base tokenizer.
    final Tokenizer source = new StandardTokenizer();
    // Strip French elided articles (l', d', ...).
    final CharArraySet elisions = new CharArraySet(Arrays.asList(LuceneConstants.ELISION_ARTICLES), true);
    TokenStream chain = new ElisionFilter(source, elisions);
    // Drop stop words (articles, adjectives, ...).
    chain = new StopFilter(chain, stopWords);
    // Fold accented characters to their ASCII equivalents.
    chain = new ASCIIFoldingFilter(chain);
    // Normalize everything to lower case.
    chain = new LowerCaseFilter(chain);
    return new TokenStreamComponents(source, chain);
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) ElisionFilter(org.apache.lucene.analysis.util.ElisionFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Aggregations

LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)22 TokenStream (org.apache.lucene.analysis.TokenStream)17 Tokenizer (org.apache.lucene.analysis.Tokenizer)15 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)10 ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter)7 StopFilter (org.apache.lucene.analysis.core.StopFilter)6 WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer)5 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)5 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)4 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)3 WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)3 Analyzer (org.apache.lucene.analysis.Analyzer)2 TokenStreamComponents (org.apache.lucene.analysis.Analyzer.TokenStreamComponents)2 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)2 ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter)2 ClassicTokenizer (org.apache.lucene.analysis.standard.ClassicTokenizer)2 ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter)2 IOException (java.io.IOException)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1