Search in sources :

Example 31 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project symja_android_library by axkr.

From the class Pods, the method getStemForm:

/**
 * Reduces a term to its Porter stem, if tokenization yields exactly one
 * unambiguous stem.
 *
 * @param term the input term to stem
 * @return the single alphanumeric (hyphen allowed) stem, or {@code null} if
 *         tokenization produced zero or multiple distinct stems, the stem
 *         contains other characters, or an I/O error occurred
 */
private static String getStemForm(String term) {
    StandardTokenizer stdToken = new StandardTokenizer();
    stdToken.setReader(new StringReader(term));
    try (TokenStream tokenStream = new PorterStemFilter(stdToken)) {
        tokenStream.reset();
        // eliminate duplicate tokens by adding them to a set
        Set<String> stems = new HashSet<>();
        CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            stems.add(token.toString());
        }
        // signal end-of-stream per the Lucene TokenStream contract
        tokenStream.end();
        // if no stem or more than one distinct stem was found, return null
        if (stems.size() != 1) {
            return null;
        }
        String stem = stems.iterator().next();
        // if the stem has chars outside [a-zA-Z0-9-], return null
        if (!stem.matches("[a-zA-Z0-9-]+")) {
            return null;
        }
        return stem;
    } catch (IOException ioe) {
        // A StringReader-backed stream should not fail; treat any I/O error
        // as "no stem found" rather than propagating it to callers.
    }
    return null;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StringReader(java.io.StringReader) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) IOException(java.io.IOException) HashSet(java.util.HashSet)

Example 32 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project sukija by ahomansikka.

From the class MapMaker, the method read:

/**
 * Tokenizes the given reader with a {@link StandardTokenizer} and adds every
 * token that passes {@code wordOK} (checked in lower case) to {@code set}.
 *
 * @param reader the character source to tokenize
 * @throws IOException if the token stream fails
 */
private void read(Reader reader) throws IOException {
    // Declare as Tokenizer (not TokenStream) so setReader needs no cast,
    // and let try-with-resources close the stream on every exit path.
    try (Tokenizer t = new StandardTokenizer()) {
        t.setReader(reader);
        CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
        try {
            t.reset();
            while (t.incrementToken()) {
                final String word = termAtt.toString();
                if (wordOK(word.toLowerCase())) {
                    set.add(word);
                }
            }
            // signal end-of-stream per the Lucene TokenStream contract
            t.end();
        } catch (IllegalArgumentException e) {
            // Report the error and the offending token once, on stderr only
            // (previously the same two lines were also printed to stdout).
            System.err.println(e.getMessage());
            System.err.println(termAtt.toString());
        }
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer)

Example 33 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project jena by apache.

From the class TestSelectiveFoldingFilter, the method collectTokens:

/**
 * Runs the input through a {@link StandardTokenizer} followed by a
 * {@link SelectiveFoldingFilter} and returns the emitted terms as strings.
 *
 * @param inputText the text to tokenize
 * @param whitelisted the white-list of characters exempt from folding
 * @return the filtered token terms, in stream order
 * @throws IOException from the Lucene API
 */
private List<String> collectTokens(StringReader inputText, CharArraySet whitelisted) throws IOException {
    final StandardTokenizer source = new StandardTokenizer();
    source.setReader(inputText);
    try (SelectiveFoldingFilter filter = new SelectiveFoldingFilter(source, whitelisted)) {
        final CharTermAttribute term = filter.getAttribute(CharTermAttribute.class);
        final List<String> collected = new ArrayList<>();
        filter.reset();
        while (filter.incrementToken()) {
            collected.add(term.toString());
        }
        filter.end();
        return collected;
    }
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) ArrayList(java.util.ArrayList)

Example 34 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project crate by crate.

From the class FingerprintAnalyzer, the method createComponents:

/**
 * Builds the fingerprint analysis chain: standard tokenization, then
 * lower-casing, ASCII folding, stop-word removal, and fingerprinting.
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer source = new StandardTokenizer();
    // Same filter order as before, expressed as one nested construction.
    final TokenStream chain =
        new FingerprintFilter(
            new StopFilter(
                new ASCIIFoldingFilter(new LowerCaseFilter(source), false),
                stopWords),
            maxOutputSize, separator);
    return new TokenStreamComponents(source, chain);
}
Also used : FingerprintFilter(org.apache.lucene.analysis.miscellaneous.FingerprintFilter) TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter)

Example 35 with StandardTokenizer

use of org.apache.lucene.analysis.standard.StandardTokenizer in project crate by crate.

From the class SnowballAnalyzer, the method createComponents:

/**
 * Constructs a {@link StandardTokenizer} filtered by an optional
 * {@link EnglishPossessiveFilter} (English-family stemmers only), a
 * {@link LowerCaseFilter} (or {@link TurkishLowerCaseFilter} for Turkish),
 * an optional {@link StopFilter}, and finally a {@link SnowballFilter}.
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream result = tokenizer;
    // remove the possessive 's for english stemmers
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) {
        result = new EnglishPossessiveFilter(result);
    }
    // Use a special lowercase filter for turkish, the stemmer expects it.
    if (name.equals("Turkish")) {
        result = new TurkishLowerCaseFilter(result);
    } else {
        result = new LowerCaseFilter(result);
    }
    if (stopSet != null) {
        result = new StopFilter(result, stopSet);
    }
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.StopFilter) TurkishLowerCaseFilter(org.apache.lucene.analysis.tr.TurkishLowerCaseFilter) SnowballFilter(org.apache.lucene.analysis.snowball.SnowballFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TurkishLowerCaseFilter(org.apache.lucene.analysis.tr.TurkishLowerCaseFilter)

Aggregations

StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)82 Tokenizer (org.apache.lucene.analysis.Tokenizer)68 TokenStream (org.apache.lucene.analysis.TokenStream)57 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)43 StopFilter (org.apache.lucene.analysis.StopFilter)43 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)36 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)35 StringReader (java.io.StringReader)18 SnowballFilter (org.apache.lucene.analysis.snowball.SnowballFilter)16 Analyzer (org.apache.lucene.analysis.Analyzer)10 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)10 ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter)7 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)6 ElisionFilter (org.apache.lucene.analysis.util.ElisionFilter)6 DecimalDigitFilter (org.apache.lucene.analysis.core.DecimalDigitFilter)5 StopFilter (org.apache.lucene.analysis.core.StopFilter)5 ESTestCase (org.elasticsearch.test.ESTestCase)5 HashMap (java.util.HashMap)4 TokenFilter (org.apache.lucene.analysis.TokenFilter)4 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)4