Search in sources :

Example 1 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by apache.

the class TestStemmerOverrideFilter method testOverride.

/**
 * Checks that a stemmer override takes precedence over Porter stemming.
 *
 * <p>A single override ("booked" -> "books") is registered, the keyword "booked" is pushed
 * through {@code StemmerOverrideFilter} followed by {@code PorterStemFilter}, and the only
 * token expected out of the chain is "books".
 *
 * @throws IOException propagated from the token stream assertion helper
 */
public void testOverride() throws IOException {
    // Map "booked" to "books". Per the original author's note, the override filter also marks
    // the rewritten token with KeywordAttribute so that Porter leaves it untouched.
    StemmerOverrideFilter.Builder overrides = new StemmerOverrideFilter.Builder();
    overrides.add("booked", "books");
    StemmerOverrideFilter overridden = new StemmerOverrideFilter(keywordTokenizer("booked"), overrides.build());
    assertTokenStreamContents(new PorterStemFilter(overridden), new String[] { "books" });
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 2 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by apache.

the class TestStemmerOverrideFilter method testRandomRealisticKeyword.

/**
 * Randomized check that overridden keywords come out of the override + Porter chain as
 * exactly the mapped value.
 *
 * <p>A map of random non-empty keys to non-empty replacement values is built, loaded into a
 * {@code StemmerOverrideFilter.Builder}, and then a random subset of the entries is run
 * through {@code KeywordTokenizer -> StemmerOverrideFilter -> PorterStemFilter}; each run
 * must emit the entry's value as its single token.
 *
 * @throws IOException propagated from the tokenizer / token stream assertion helper
 */
public void testRandomRealisticKeyword() throws IOException {
    final Map<String, String> overrides = new HashMap<>();
    final int termCount = atLeast(50);
    for (int n = 0; n < termCount; n++) {
        final String key = TestUtil.randomRealisticUnicodeString(random());
        if (key.isEmpty()) {
            // Empty keys are never registered (and no value is drawn for them).
            continue;
        }
        String replacement = TestUtil.randomSimpleString(random());
        if (replacement.isEmpty()) {
            // Substitute a fixed non-empty value when the random one is empty.
            replacement = "a";
        }
        overrides.put(key, replacement);
    }
    if (overrides.isEmpty()) {
        // Guarantee at least one entry to exercise.
        overrides.put("booked", "books");
    }
    // Built with ignoreCase disabled: with ignoreCase enabled this test might fail, because
    // the map may hold the same key twice, once lowercased and once uppercased.
    final StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    for (Entry<String, String> mapping : overrides.entrySet()) {
        builder.add(mapping.getKey(), mapping.getValue());
    }
    final StemmerOverrideMap overrideMap = builder.build();
    for (Entry<String, String> mapping : overrides.entrySet()) {
        if (!random().nextBoolean()) {
            // Only a random subset of the entries is verified.
            continue;
        }
        final Tokenizer keyword = new KeywordTokenizer();
        keyword.setReader(new StringReader(mapping.getKey()));
        final TokenStream chain = new PorterStemFilter(new StemmerOverrideFilter(keyword, overrideMap));
        assertTokenStreamContents(chain, new String[] { mapping.getValue() });
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) StemmerOverrideMap(org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap) Entry(java.util.Map.Entry) StringReader(java.io.StringReader) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 3 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by apache.

the class SmartChineseAnalyzer method createComponents.

/**
 * Assembles the analysis chain: {@code HMMChineseTokenizer}, then {@code PorterStemFilter},
 * then a {@code StopFilter} when the configured stop word set is non-empty.
 *
 * @param fieldName field being analyzed; not consulted by this implementation
 * @return components pairing the tokenizer with the end of the filter chain
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new HMMChineseTokenizer();
    // No LowerCaseFilter here: per the original author's note, SegTokenFilter already
    // lowercases Basic Latin text.
    // The original author also notes that Porter stemming is "too strict" and that this is
    // intentional rather than a bug.
    final TokenStream stemmed = new PorterStemFilter(source);
    // Stop word removal is skipped entirely when no stop words are configured.
    final TokenStream sink = stopWords.isEmpty() ? stemmed : new StopFilter(stemmed, stopWords);
    return new TokenStreamComponents(source, sink);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.StopFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 4 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project cogcomp-nlp by CogComp.

the class ASCIIEnglishAnalyzer method createComponents.

/**
 * Assembles the analysis chain, in order: {@code StandardTokenizer}, {@code StandardFilter},
 * {@code ASCIIFoldingFilter}, {@code EnglishPossessiveFilter}, {@code WordDelimiterFilter},
 * {@code LowerCaseFilter}, {@code StopFilter} (using {@code EnglishAnalyzer}'s default stop
 * set) and finally {@code PorterStemFilter}.
 *
 * @param fieldName field being analyzed; not consulted by this implementation
 * @return components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    final TokenStream standardized = new StandardFilter(tokenizer);
    final TokenStream asciiFolded = new ASCIIFoldingFilter(standardized);
    final TokenStream possessivesStripped = new EnglishPossessiveFilter(asciiFolded);
    // NOTE(review): WordDelimiterFilter.ALPHA is passed as the second constructor argument;
    // confirm against the WordDelimiterFilter API that this is the intended value there.
    final TokenStream delimited = new WordDelimiterFilter(possessivesStripped, WordDelimiterFilter.ALPHA, null);
    final TokenStream lowercased = new LowerCaseFilter(delimited);
    final TokenStream stopped = new StopFilter(lowercased, EnglishAnalyzer.getDefaultStopSet());
    final TokenStream stemmed = new PorterStemFilter(stopped);
    return new TokenStreamComponents(tokenizer, stemmed);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 5 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project cogcomp-nlp by CogComp.

the class MinimalAnalyzer method createComponents.

/**
 * Assembles the analysis chain, in order: {@code StandardTokenizer}, {@code StandardFilter},
 * {@code ASCIIFoldingFilter}, {@code LowerCaseFilter}, {@code EnglishPossessiveFilter},
 * {@code StopFilter} (using this analyzer's {@code stopwords}), {@code WordDelimiterFilter}
 * and finally {@code PorterStemFilter}.
 *
 * @param fieldName field being analyzed; not consulted by this implementation
 * @return components pairing the tokenizer with the end of the filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    final TokenStream standardized = new StandardFilter(tokenizer);
    final TokenStream asciiFolded = new ASCIIFoldingFilter(standardized);
    final TokenStream lowercased = new LowerCaseFilter(asciiFolded);
    final TokenStream possessivesStripped = new EnglishPossessiveFilter(lowercased);
    final TokenStream stopped = new StopFilter(possessivesStripped, stopwords);
    // NOTE(review): WordDelimiterFilter.ALPHA is passed as the second constructor argument;
    // confirm against the WordDelimiterFilter API that this is the intended value there.
    final TokenStream delimited = new WordDelimiterFilter(stopped, WordDelimiterFilter.ALPHA, null);
    final TokenStream stemmed = new PorterStemFilter(delimited);
    return new TokenStreamComponents(tokenizer, stemmed);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream): 9, Tokenizer (org.apache.lucene.analysis.Tokenizer): 9, PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 9, WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 6, KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 5, StringReader (java.io.StringReader): 2, HashMap (java.util.HashMap): 2, Entry (java.util.Map.Entry): 2, LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 2, StopFilter (org.apache.lucene.analysis.core.StopFilter): 2, EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 2, ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter): 2, WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 2, StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 2, StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 2, ArrayList (java.util.ArrayList): 1, HashSet (java.util.HashSet): 1, StopFilter (org.apache.lucene.analysis.StopFilter): 1, StemmerOverrideMap (org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap): 1