Use of org.apache.lucene.analysis.standard.StandardTokenizer in project symja_android_library by axkr.
The class Pods, method getStemForm.
private static String getStemForm(String term) {
    StandardTokenizer stdToken = new StandardTokenizer();
    stdToken.setReader(new StringReader(term));
    try (TokenStream tokenStream = new PorterStemFilter(stdToken)) {
        tokenStream.reset();
        // eliminate duplicate tokens by adding them to a set
        Set<String> stems = new HashSet<>();
        CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            stems.add(token.toString());
        }
        // if no stem or more than one stem was found, return null
        if (stems.size() != 1) {
            return null;
        }
        String stem = stems.iterator().next();
        // if the stem contains chars other than letters, digits and '-', return null
        if (!stem.matches("[a-zA-Z0-9-]+")) {
            return null;
        }
        return stem;
    } catch (IOException ioe) {
        // swallow the exception and fall through to the null return below
    }
    return null;
}
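For reference, the same tokenize-then-stem pipeline as a self-contained sketch (assuming Lucene 8.x package locations; the class name StemDemo and the sample input are illustrative, not part of the symja project):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class StemDemo {
    public static void main(String[] args) throws IOException {
        StandardTokenizer tokenizer = new StandardTokenizer();
        tokenizer.setReader(new StringReader("running"));
        try (TokenStream stream = new PorterStemFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString()); // the Porter stemmer maps "running" to "run"
            }
            stream.end();
        }
    }
}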
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project sukija by ahomansikka.
The class MapMaker, method read.
private void read(Reader reader) throws IOException {
    Tokenizer t = new StandardTokenizer();
    t.setReader(reader);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    try {
        t.reset();
        while (t.incrementToken()) {
            final String word = termAtt.toString();
            if (wordOK(word.toLowerCase())) {
                set.add(word);
            }
        }
        t.end();
    } catch (IllegalArgumentException e) {
        // report the error and the offending token on both stdout and stderr
        System.out.println(e.getMessage());
        System.out.println(termAtt.toString());
        System.err.println(e.getMessage());
        System.err.println(termAtt.toString());
    } finally {
        t.close();
    }
}
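The same read-and-collect pattern in isolation might look like the sketch below (the names TokenSetDemo and tokensOf are hypothetical; wordOK and set above are MapMaker's own members and are omitted here):

import java.io.IOException;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenSetDemo {
    public static Set<String> tokensOf(Reader reader) throws IOException {
        Set<String> result = new HashSet<>();
        Tokenizer t = new StandardTokenizer();
        t.setReader(reader);
        CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
        try {
            t.reset();
            while (t.incrementToken()) {
                result.add(termAtt.toString());
            }
            t.end();
        } finally {
            t.close();
        }
        return result;
    }
}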
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project jena by apache.
The class TestSelectiveFoldingFilter, method collectTokens.
/**
 * Returns the CharTermAttribute values of the token stream as a list of Strings.
 *
 * @param inputText the text to tokenize
 * @param whitelisted the white-list of characters exempt from folding
 * @return the CharTermAttribute values converted to a list of Strings
 * @throws IOException from the Lucene API
 */
private List<String> collectTokens(StringReader inputText, CharArraySet whitelisted) throws IOException {
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(inputText);
    try (SelectiveFoldingFilter selectiveFoldingFilter = new SelectiveFoldingFilter(tokenizer, whitelisted)) {
        CharTermAttribute termAttrib = selectiveFoldingFilter.getAttribute(CharTermAttribute.class);
        selectiveFoldingFilter.reset();
        List<String> tokens = new ArrayList<>();
        while (selectiveFoldingFilter.incrementToken()) {
            tokens.add(termAttrib.toString());
        }
        selectiveFoldingFilter.end();
        return tokens;
    }
}
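A hypothetical test call for this helper; the whitelist contents and the expected tokens are illustrative assumptions, based on the filter's stated purpose of folding diacritics except for white-listed characters:

StringReader input = new StringReader("señor café");
CharArraySet whitelisted = new CharArraySet(Arrays.asList("ñ"), false);
List<String> tokens = collectTokens(input, whitelisted);
// if 'ñ' is white-listed, it should survive while 'é' is folded,
// yielding something like [señor, cafe]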
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project crate by crate.
The class FingerprintAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String s) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = tokenizer;
    // lowercase, fold to ASCII and drop stop words, ...
    stream = new LowerCaseFilter(stream);
    stream = new ASCIIFoldingFilter(stream, false);
    stream = new StopFilter(stream, stopWords);
    // ... then sort, deduplicate and concatenate the survivors into one fingerprint token
    stream = new FingerprintFilter(stream, maxOutputSize, separator);
    return new TokenStreamComponents(tokenizer, stream);
}
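Consuming such an Analyzer does not depend on crate's constructor (not shown here); a generic sketch with a hypothetical helper class FingerprintDemo:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FingerprintDemo {
    static String firstToken(Analyzer analyzer, String text) throws IOException {
        try (TokenStream ts = analyzer.tokenStream("field", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            String fingerprint = ts.incrementToken() ? term.toString() : "";
            ts.end();
            return fingerprint;
        }
    }
}
// For input "B A c b a" the chain above lowercases and folds each token, then
// FingerprintFilter emits a single token of the sorted, deduplicated terms,
// e.g. "a b c" (assuming ' ' as separator and no matching stop words).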
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project crate by crate.
The class SnowballAnalyzer, method createComponents.
/**
 * Constructs a {@link StandardTokenizer} filtered by a {@link LowerCaseFilter}
 * (or a {@link TurkishLowerCaseFilter} for Turkish), a {@link StopFilter},
 * and a {@link SnowballFilter}
 */
@Override
public TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new StandardTokenizer();
    TokenStream result = tokenizer;
    // remove the possessive 's for English stemmers
    if (name.equals("English") || name.equals("Porter") || name.equals("Lovins")) {
        result = new EnglishPossessiveFilter(result);
    }
    // use a special lowercase filter for Turkish, the stemmer expects it
    if (name.equals("Turkish")) {
        result = new TurkishLowerCaseFilter(result);
    } else {
        result = new LowerCaseFilter(result);
    }
    if (stopSet != null) {
        result = new StopFilter(result, stopSet);
    }
    result = new SnowballFilter(result, name);
    return new TokenStreamComponents(tokenizer, result);
}
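A usage sketch; the two-argument constructor here (stemmer name plus stop set) is an assumption, so check crate's SnowballAnalyzer for the real signature. EnglishAnalyzer.ENGLISH_STOP_WORDS_SET is Lucene's default English stop set:

Analyzer analyzer = new SnowballAnalyzer("English", EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
try (TokenStream ts = analyzer.tokenStream("body", "the dogs ran quickly")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term); // expected: dog, ran, quick ("the" is dropped as a stop word)
    }
    ts.end();
}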