Example 1 with StandardFilter

Use of org.apache.lucene.analysis.standard.StandardFilter in project lucene-solr by apache.

From the class UkrainianMorfologikAnalyzer, method createComponents:

/**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   * 
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} (if a stem exclusion set is
   *         provided), and {@link MorfologikFilter} on the Ukrainian dictionary.
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new MorfologikFilter(result, getDictionary());
    return new TokenStreamComponents(source, result);
}
Also used:
TokenStream (org.apache.lucene.analysis.TokenStream)
MorfologikFilter (org.apache.lucene.analysis.morfologik.MorfologikFilter)
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)
StopFilter (org.apache.lucene.analysis.StopFilter)
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)
Tokenizer (org.apache.lucene.analysis.Tokenizer)
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
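
For context, here is a minimal sketch of how an analyzer built this way is typically consumed. The field name and the sample text are illustrative assumptions, not taken from the Lucene sources:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.uk.UkrainianMorfologikAnalyzer;

public class AnalyzerConsumptionDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new UkrainianMorfologikAnalyzer()) {
            // "body" and the input string are arbitrary example values
            TokenStream ts = analyzer.tokenStream("body", "приклад тексту");
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();   // finalize end-of-stream state
            ts.close();
        }
    }
}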

Example 2 with StandardFilter

Use of org.apache.lucene.analysis.standard.StandardFilter in project lucene-solr by apache.

From the class TestTeeSinkTokenFilter, method performance:

/**
   * Not an explicit test; just useful to print out some performance info.
   */
@SuppressWarnings("resource")
public void performance() throws Exception {
    int[] tokCount = { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
    for (int k = 0; k < tokCount.length; k++) {
        StringBuilder buffer = new StringBuilder();
        System.out.println("-----Tokens: " + tokCount[k] + "-----");
        for (int i = 0; i < tokCount[k]; i++) {
            buffer.append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).append(' ');
        }
        //make sure we produce the same tokens
        TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
        TokenStream sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), 100);
        teeStream.consumeAllTokens();
        TokenStream stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), 100);
        CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
        CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
        for (int i = 0; stream.incrementToken(); i++) {
            assertTrue(sink.incrementToken());
            assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok));
        }
        //simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.length; j++) {
            int tfPos = 0;
            long start = System.currentTimeMillis();
            for (int i = 0; i < 20; i++) {
                stream = new StandardFilter(standardTokenizer(buffer));
                PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
                while (stream.incrementToken()) {
                    tfPos += posIncrAtt.getPositionIncrement();
                }
                stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), modCounts[j]);
                posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
                while (stream.incrementToken()) {
                    tfPos += posIncrAtt.getPositionIncrement();
                }
            }
            long finish = System.currentTimeMillis();
            System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
            int sinkPos = 0;
            //simulate one field with one sink
            start = System.currentTimeMillis();
            for (int i = 0; i < 20; i++) {
                teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
                sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), modCounts[j]);
                PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
                while (teeStream.incrementToken()) {
                    sinkPos += posIncrAtt.getPositionIncrement();
                }
                //System.out.println("Modulo--------");
                posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
                while (sink.incrementToken()) {
                    sinkPos += posIncrAtt.getPositionIncrement();
                }
            }
            finish = System.currentTimeMillis();
            System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
        }
        System.out.println("- End Tokens: " + tokCount[k] + "-----");
    }
}
Also used:
TokenStream (org.apache.lucene.analysis.TokenStream)
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
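
The pattern benchmarked above reduces to the sketch below: the tee is drained once, and the sink replays the cached token states without re-tokenizing. The WhitespaceTokenizer source and the sample text are assumptions for illustration, not part of the test:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TeeSinkDemo {
    public static void main(String[] args) throws IOException {
        WhitespaceTokenizer source = new WhitespaceTokenizer();
        source.setReader(new StringReader("one two three"));
        TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
        TokenStream sink = tee.newSinkTokenStream(); // will replay the tee's tokens

        // Drain the tee first; each token state is cached for the sink.
        CharTermAttribute teeTerm = tee.addAttribute(CharTermAttribute.class);
        tee.reset();
        while (tee.incrementToken()) {
            System.out.println("tee:  " + teeTerm);
        }
        tee.end();

        // The sink now replays the same tokens without re-tokenizing.
        CharTermAttribute sinkTerm = sink.addAttribute(CharTermAttribute.class);
        sink.reset();
        while (sink.incrementToken()) {
            System.out.println("sink: " + sinkTerm);
        }
        sink.end();
        tee.close();
    }
}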

Example 3 with StandardFilter

Use of org.apache.lucene.analysis.standard.StandardFilter in project lucene-solr by apache.

From the class DutchAnalyzer, method normalize:

@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    TokenStream result = new StandardFilter(in);
    result = new LowerCaseFilter(result);
    return result;
}
Also used:
TokenStream (org.apache.lucene.analysis.TokenStream)
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
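
This hook backs Analyzer's public normalize(String, String), which query parsers use to normalize single terms (for example in wildcard or fuzzy queries) without tokenizing, stopping, or stemming. A minimal sketch, with an assumed field name:

import org.apache.lucene.analysis.nl.DutchAnalyzer;
import org.apache.lucene.util.BytesRef;

public class NormalizeDemo {
    public static void main(String[] args) {
        try (DutchAnalyzer analyzer = new DutchAnalyzer()) {
            // Lower-cases the term via the chain above; the wildcard survives intact.
            BytesRef normalized = analyzer.normalize("body", "Molen*");
            System.out.println(normalized.utf8ToString()); // prints "molen*"
        }
    }
}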

Example 4 with StandardFilter

Use of org.apache.lucene.analysis.standard.StandardFilter in project lucene-solr by apache.

From the class DutchAnalyzer, method createComponents:

/**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   *
   * @return A {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *   built from a {@link StandardTokenizer} filtered with
   *   {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
   *   {@link SetKeywordMarkerFilter} (if a stem exclusion set is provided),
   *   {@link StemmerOverrideFilter}, and {@link SnowballFilter}.
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stoptable);
    if (!excltable.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, excltable);
    }
    if (stemdict != null) {
        result = new StemmerOverrideFilter(result, stemdict);
    }
    result = new SnowballFilter(result, new org.tartarus.snowball.ext.DutchStemmer());
    return new TokenStreamComponents(source, result);
}
Also used:
TokenStream (org.apache.lucene.analysis.TokenStream)
StopFilter (org.apache.lucene.analysis.StopFilter)
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)
StemmerOverrideFilter (org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter)
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)
SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter)
Tokenizer (org.apache.lucene.analysis.Tokenizer)
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
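
The stemdict consulted above is a StemmerOverrideFilter.StemmerOverrideMap, an FST-backed table of exact forms that bypass the Snowball stemmer. A minimal sketch of building one follows; the entries mirror the kind DutchAnalyzer ships by default, but treat them as illustrative:

import java.io.IOException;

import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;

public class StemmerOverrideDemo {
    public static void main(String[] args) throws IOException {
        // true = match input tokens case-insensitively
        StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
        builder.add("fiets", "fiets");         // keep "fiets" (bicycle) unstemmed
        builder.add("bromfiets", "bromfiets"); // keep "bromfiets" (moped) unstemmed
        StemmerOverrideMap stemdict = builder.build();
        // Pass stemdict to new StemmerOverrideFilter(result, stemdict), as in
        // createComponents() above; listed forms short-circuit the stemmer.
        System.out.println("override map built: " + (stemdict != null));
    }
}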

Example 5 with StandardFilter

Use of org.apache.lucene.analysis.standard.StandardFilter in project lucene-solr by apache.

From the class NorwegianAnalyzer, method createComponents:

/**
   * Creates a
   * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   * which tokenizes all the text in the provided {@link Reader}.
   * 
   * @return A
   *         {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
   *         built from a {@link StandardTokenizer} filtered with
   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter},
   *         {@link SetKeywordMarkerFilter} (if a stem exclusion set is
   *         provided), and {@link SnowballFilter}.
   */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopwords);
    if (!stemExclusionSet.isEmpty()) {
        result = new SetKeywordMarkerFilter(result, stemExclusionSet);
    }
    result = new SnowballFilter(result, new NorwegianStemmer());
    return new TokenStreamComponents(source, result);
}
Also used:
NorwegianStemmer (org.tartarus.snowball.ext.NorwegianStemmer)
TokenStream (org.apache.lucene.analysis.TokenStream)
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)
StopFilter (org.apache.lucene.analysis.StopFilter)
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)
SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter)
Tokenizer (org.apache.lucene.analysis.Tokenizer)
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)
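
All five examples share one shape: a Tokenizer feeding a chain of filters inside createComponents(). As a closing sketch, here is the skeleton of a custom Analyzer following that pattern. The class name and the minimal chain are illustrative; note that StandardFilter is effectively a pass-through in this era of Lucene (it was later deprecated and removed):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class MinimalAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream result = new StandardFilter(source); // pass-through, kept for parity
        result = new LowerCaseFilter(result);
        return new TokenStreamComponents(source, result);
    }
}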

Aggregations

StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 75
TokenStream (org.apache.lucene.analysis.TokenStream): 73
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 60
Tokenizer (org.apache.lucene.analysis.Tokenizer): 37
StopFilter (org.apache.lucene.analysis.StopFilter): 36
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 36
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 31
SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter): 14
ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter): 8
LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 5
DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter): 4
ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter): 4
StopFilter (org.apache.lucene.analysis.core.StopFilter): 3
EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 3
PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 3
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 2
WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 2
Reader (java.io.Reader): 1
Analyzer (org.apache.lucene.analysis.Analyzer): 1
TokenStreamComponents (org.apache.lucene.analysis.Analyzer.TokenStreamComponents): 1