Search in sources :

Example 6 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project cogcomp-nlp by CogComp.

the class MinimalAnalyzer method createComponents.

/**
 * Assembles the analysis chain: a {@link StandardTokenizer} wrapped, in order, by
 * StandardFilter, ASCIIFoldingFilter, LowerCaseFilter, EnglishPossessiveFilter,
 * StopFilter (using the {@code stopwords} field), WordDelimiterFilter and
 * PorterStemFilter.
 *
 * @param fieldName name of the field being analyzed; not consulted by this method
 * @return components pairing the tokenizer with the outermost filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();

    // Each stage wraps the previous one; the order below is the order tokens flow through.
    final TokenStream standardized = new StandardFilter(tokenizer);
    final TokenStream asciiFolded = new ASCIIFoldingFilter(standardized);
    final TokenStream lowerCased = new LowerCaseFilter(asciiFolded);
    final TokenStream possessivesHandled = new EnglishPossessiveFilter(lowerCased);
    final TokenStream stopsRemoved = new StopFilter(possessivesHandled, stopwords);
    // NOTE(review): WordDelimiterFilter.ALPHA is passed as the second argument, which
    // looks like the configuration-flags slot -- confirm this constant is the intended value.
    final TokenStream delimited = new WordDelimiterFilter(stopsRemoved, WordDelimiterFilter.ALPHA, null);
    final TokenStream stemmed = new PorterStemFilter(delimited);

    return new TokenStreamComponents(tokenizer, stemmed);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 7 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by apache.

the class TestStemmerOverrideFilter method testIgnoreCase.

/**
 * Checks that an override registered as "boOkEd" -> "books" on a builder constructed
 * with {@code true} is applied to the differently-cased input "BooKeD", and that the
 * overridden token comes out of the PorterStemFilter as "books".
 *
 * @throws IOException propagated from the token stream assertions
 */
public void testIgnoreCase() throws IOException {
    // Register a single override whose key differs in case from the input below.
    StemmerOverrideFilter.Builder overrides = new StemmerOverrideFilter.Builder(true);
    overrides.add("boOkEd", "books");

    Tokenizer input = keywordTokenizer("BooKeD");

    // The override filter rewrites the token to "books" and marks it with
    // KeywordAttribute so that the Porter stemmer leaves it untouched.
    TokenStream overridden = new StemmerOverrideFilter(input, overrides.build());
    TokenStream stemmed = new PorterStemFilter(overridden);

    String[] expected = { "books" };
    assertTokenStreamContents(stemmed, expected);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 8 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by apache.

the class TestStemmerOverrideFilter method testNoOverrides.

/**
 * Checks that a StemmerOverrideFilter built from a builder with no entries, followed
 * by a PorterStemFilter, emits the single token "book" for the input "book".
 *
 * @throws IOException propagated from the token stream assertions
 */
public void testNoOverrides() throws IOException {
    // No add() calls: the override map is built empty.
    StemmerOverrideFilter.Builder emptyOverrides = new StemmerOverrideFilter.Builder(true);

    Tokenizer input = keywordTokenizer("book");
    TokenStream overridden = new StemmerOverrideFilter(input, emptyOverrides.build());
    TokenStream stemmed = new PorterStemFilter(overridden);

    String[] expected = { "book" };
    assertTokenStreamContents(stemmed, expected);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 9 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by apache.

the class TestStemmerOverrideFilter method testRandomRealisticWhiteSpace.

/**
 * Randomized test: builds a map of whitespace-free random Unicode terms to random
 * replacement values, registers every entry as a stemmer override, feeds a random
 * subset of the keys (space separated) through a WhitespaceTokenizer,
 * StemmerOverrideFilter and PorterStemFilter, and asserts that the emitted tokens
 * are the corresponding override values.
 *
 * <p>When {@code ignoreCase} is chosen, keys that differ only by case are
 * de-duplicated so that two conflicting overrides are never registered for what the
 * filter treats as the same term.
 *
 * @throws IOException propagated from the token stream assertions
 */
public void testRandomRealisticWhiteSpace() throws IOException {
    Map<String, String> map = new HashMap<>();
    Set<String> seen = new HashSet<>();
    int numTerms = atLeast(50);
    boolean ignoreCase = random().nextBoolean();
    for (int i = 0; i < numTerms; i++) {
        String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
        char[] charArray = randomRealisticUnicodeString.toCharArray();
        // Strip every whitespace code point so the term survives whitespace tokenization intact.
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < charArray.length; ) {
            int cp = Character.codePointAt(charArray, j, charArray.length);
            if (!Character.isWhitespace(cp)) {
                builder.appendCodePoint(cp);
            }
            j += Character.charCount(cp);
        }
        if (builder.length() > 0) {
            String inputValue = builder.toString();
            // Make sure we don't try to add two inputs that vary only by case:
            String seenInputValue;
            if (ignoreCase) {
                // TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
                char[] buffer = inputValue.toCharArray();
                CharacterUtils.toLowerCase(buffer, 0, buffer.length);
                // Build a String from the array contents; char[].toString() would only
                // yield the array's identity string and defeat the duplicate check.
                seenInputValue = new String(buffer);
            } else {
                seenInputValue = inputValue;
            }
            // Set.add returns false when the (case-normalized) key was already present.
            if (seen.add(seenInputValue)) {
                String value = TestUtil.randomSimpleString(random());
                map.put(inputValue, value.isEmpty() ? "a" : value);
            }
        }
    }
    // Guarantee at least one override so the stream below is never empty.
    if (map.isEmpty()) {
        map.put("booked", "books");
    }
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
    Set<Entry<String, String>> entrySet = map.entrySet();
    StringBuilder input = new StringBuilder();
    List<String> output = new ArrayList<>();
    for (Entry<String, String> entry : entrySet) {
        builder.add(entry.getKey(), entry.getValue());
        // Randomly pick which keys go into the input text, but always include the first one.
        if (random().nextBoolean() || output.isEmpty()) {
            input.append(entry.getKey()).append(" ");
            output.add(entry.getValue());
        }
    }
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(input.toString()));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, output.toArray(new String[0]));
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) Entry(java.util.Map.Entry) StringReader(java.io.StringReader) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) HashSet(java.util.HashSet)

Example 10 with PorterStemFilter

use of org.apache.lucene.analysis.en.PorterStemFilter in project nutch by apache.

the class LuceneAnalyzerUtil method createComponents.

/**
 * Assembles the analysis chain: a {@link ClassicTokenizer} followed by a
 * LowerCaseFilter, an optional StopFilter (only when {@code stopSet} is non-null),
 * and an optional stemming filter selected by {@code stemFilterType}.
 *
 * @param fieldName name of the field being analyzed; not consulted by this method
 * @return components pairing the tokenizer with the outermost filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new ClassicTokenizer();
    TokenStream lowerCased = new LowerCaseFilter(tokenizer);

    // Stop-word removal is skipped entirely when no stop set was configured.
    TokenStream chain = (stopSet == null) ? lowerCased : new StopFilter(lowerCased, stopSet);

    // Append the configured stemmer; any other stemFilterType value adds no stemming stage.
    switch(stemFilterType) {
        case PORTERSTEM_FILTER:
            chain = new PorterStemFilter(chain);
            break;
        case ENGLISHMINIMALSTEM_FILTER:
            chain = new EnglishMinimalStemFilter(chain);
            break;
        default:
            break;
    }

    return new TokenStreamComponents(tokenizer, chain);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StopFilter(org.apache.lucene.analysis.core.StopFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) ClassicTokenizer(org.apache.lucene.analysis.standard.ClassicTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) ClassicTokenizer(org.apache.lucene.analysis.standard.ClassicTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter) EnglishMinimalStemFilter(org.apache.lucene.analysis.en.EnglishMinimalStemFilter)

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream)10 Tokenizer (org.apache.lucene.analysis.Tokenizer)10 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)10 WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer)6 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)5 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)3 StopFilter (org.apache.lucene.analysis.core.StopFilter)3 StringReader (java.io.StringReader)2 HashMap (java.util.HashMap)2 Entry (java.util.Map.Entry)2 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)2 ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter)2 WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)2 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)2 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)2 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 StopFilter (org.apache.lucene.analysis.StopFilter)1 EnglishMinimalStemFilter (org.apache.lucene.analysis.en.EnglishMinimalStemFilter)1 StemmerOverrideMap (org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap)1