Search in sources :

Example 1 with StopFilter

use of org.apache.lucene.analysis.StopFilter in project elasticsearch by elastic.

the class PatternAnalyzer method createComponents.

/**
 * Assembles the analysis chain for this analyzer: a {@link PatternTokenizer}
 * built from {@code pattern} with group {@code -1}, optionally followed by
 * lower-casing and stop-word removal.
 *
 * @param s the field name (not consulted when building the chain)
 * @return the tokenizer together with the outermost stream of the chain
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer source = new PatternTokenizer(pattern, -1);
    // Lower-casing is applied only when the analyzer was configured for it.
    TokenStream chain = lowercase ? new LowerCaseFilter(source) : source;
    // A null stop-word set means "no stop filtering".
    if (stopWords != null) {
        chain = new StopFilter(chain, stopWords);
    }
    return new TokenStreamComponents(source, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream), StopFilter(org.apache.lucene.analysis.StopFilter), PatternTokenizer(org.apache.lucene.analysis.pattern.PatternTokenizer), Tokenizer(org.apache.lucene.analysis.Tokenizer), LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)

Example 2 with StopFilter

use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

the class TestFreeTextSuggester method testEndingHole.

// When the query ends in a single hole, ShingleFilter still emits a
// shingle with a trailing hole ("of _"), and the suggester is expected
// to predict correctly from it:
public void testEndingHole() throws Exception {
    // Analyzer whose only filtering step is to drop the token "of"
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer source = new MockTokenizer();
            return new TokenStreamComponents(source, new StopFilter(source, StopFilter.makeStopSet("of")));
        }
    };
    Iterable<Input> inputs = AnalyzingSuggesterTest.shuffle(new Input("wizard of oz", 50));
    // The same analyzer is used for both indexing and querying; ngrams = 3, separator = 0x20.
    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 3, (byte) 0x20);
    suggester.build(new InputArrayIterator(inputs));

    String afterHole = toString(suggester.lookup("wizard of", 10));
    assertEquals("wizard _ oz/1.00", afterHole);

    // Here the suggester falls back to the unigram model:
    // backoff 0.4 times prop 0.5 gives 0.20.
    String afterPrefix = toString(suggester.lookup("wizard o", 10));
    assertEquals("oz/0.20", afterPrefix);

    analyzer.close();
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer), CharArraySet(org.apache.lucene.analysis.CharArraySet), Input(org.apache.lucene.search.suggest.Input), InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator), StopFilter(org.apache.lucene.analysis.StopFilter), Analyzer(org.apache.lucene.analysis.Analyzer), MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer), Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 3 with StopFilter

use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

the class TestFreeTextSuggester method testTwoEndingHoles.

// Once the trailing holes outnumber the ngrams window, nothing can be
// predicted, since ShingleFilter never emits a token consisting
// solely of holes (e.g. "_ _"):
public void testTwoEndingHoles() throws Exception {
    // Analyzer whose only filtering step is to drop the token "of"
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer source = new MockTokenizer();
            return new TokenStreamComponents(source, new StopFilter(source, StopFilter.makeStopSet("of")));
        }
    };
    Iterable<Input> inputs = AnalyzingSuggesterTest.shuffle(new Input("wizard of of oz", 50));
    // The same analyzer is used for both indexing and querying; ngrams = 3, separator = 0x20.
    FreeTextSuggester suggester = new FreeTextSuggester(analyzer, analyzer, 3, (byte) 0x20);
    suggester.build(new InputArrayIterator(inputs));

    // Two consecutive deleted tokens at the end: expect no suggestions at all.
    String suggestions = toString(suggester.lookup("wizard of of", 10));
    assertEquals("", suggestions);

    analyzer.close();
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer), CharArraySet(org.apache.lucene.analysis.CharArraySet), Input(org.apache.lucene.search.suggest.Input), InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator), StopFilter(org.apache.lucene.analysis.StopFilter), Analyzer(org.apache.lucene.analysis.Analyzer), MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer), Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 4 with StopFilter

use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

the class AnalyzingInfixSuggesterTest method testSuggestStopFilter.

/**
 * Checks that a lookup for "a" -- which is itself in the stop set -- still
 * returns the indexed entry "a bob for apples", with the match highlighted
 * as the prefix of "apples".
 *
 * <p>The index analyzer uses a plain {@link StopFilter}, while the query
 * analyzer uses a {@code SuggestStopFilter} over the same stop set.
 *
 * @throws Exception if building, querying or closing the suggester fails
 */
public void testSuggestStopFilter() throws Exception {
    final CharArraySet stopWords = StopFilter.makeStopSet("a");
    // Index side: ordinary stop-word removal.
    Analyzer indexAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            MockTokenizer tokens = new MockTokenizer();
            return new TokenStreamComponents(tokens, new StopFilter(tokens, stopWords));
        }
    };
    // Query side: same stop set, but filtered through SuggestStopFilter.
    Analyzer queryAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            MockTokenizer tokens = new MockTokenizer();
            return new TokenStreamComponents(tokens, new SuggestStopFilter(tokens, stopWords));
        }
    };
    AnalyzingInfixSuggester suggester = new AnalyzingInfixSuggester(newDirectory(), indexAnalyzer, queryAnalyzer, 3, false);
    Input[] keys = new Input[] { new Input("a bob for apples", 10, new BytesRef("foobaz")) };
    suggester.build(new InputArrayIterator(keys));
    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("a", random()), 10, true, true);
    assertEquals(1, results.size());
    assertEquals("a bob for apples", results.get(0).key);
    assertEquals("a bob for <b>a</b>pples", results.get(0).highlightKey);
    // Close everything exactly once. A separate suggester.close() before this
    // call would close the suggester twice and, if it threw, would leave both
    // analyzers unclosed.
    IOUtils.close(suggester, indexAnalyzer, queryAnalyzer);
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) StopFilter(org.apache.lucene.analysis.StopFilter) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Input(org.apache.lucene.search.suggest.Input) InputArrayIterator(org.apache.lucene.search.suggest.InputArrayIterator) LookupResult(org.apache.lucene.search.suggest.Lookup.LookupResult) BytesRef(org.apache.lucene.util.BytesRef)

Example 5 with StopFilter

use of org.apache.lucene.analysis.StopFilter in project lucene-solr by apache.

the class SmartChineseAnalyzer method createComponents.

/**
 * Builds the analysis chain: an {@code HMMChineseTokenizer} followed by
 * Porter stemming and, when the stop-word set is non-empty, a
 * {@link StopFilter}.
 *
 * @param fieldName the field name (not consulted when building the chain)
 * @return the tokenizer together with the outermost stream of the chain
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new HMMChineseTokenizer();
    // No LowerCaseFilter is added here: it is not needed, as SegTokenFilter
    // lowercases Basic Latin text.
    // The Porter stemming is known to be too strict; that is intentional
    // (a feature, not a bug).
    final TokenStream stemmed = new PorterStemFilter(source);
    if (stopWords.isEmpty()) {
        // Nothing to filter out, so the stemmed stream is the end of the chain.
        return new TokenStreamComponents(source, stemmed);
    }
    return new TokenStreamComponents(source, new StopFilter(stemmed, stopWords));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.StopFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Aggregations

- StopFilter (org.apache.lucene.analysis.StopFilter): 68
- TokenStream (org.apache.lucene.analysis.TokenStream): 57
- Tokenizer (org.apache.lucene.analysis.Tokenizer): 55
- LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 49
- StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 44
- StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 36
- SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 33
- SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter): 17
- CharArraySet (org.apache.lucene.analysis.CharArraySet): 9
- Analyzer (org.apache.lucene.analysis.Analyzer): 7
- DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter): 6
- ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter): 6
- MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 5
- MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 4
- KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 4
- EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 4
- PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 4
- ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter): 4
- Reader (java.io.Reader): 3
- TokenFilter (org.apache.lucene.analysis.TokenFilter): 3