Use of org.apache.lucene.analysis.en.PorterStemFilter in project cogcomp-nlp by CogComp.
Class MinimalAnalyzer, method createComponents.
/**
 * Builds the analysis chain used for every field: a {@code StandardTokenizer} whose output is
 * passed, in order, through {@code StandardFilter}, {@code ASCIIFoldingFilter},
 * {@code LowerCaseFilter}, {@code EnglishPossessiveFilter}, {@code StopFilter} (using this
 * analyzer's {@code stopwords}), {@code WordDelimiterFilter} and {@code PorterStemFilter}.
 *
 * @param fieldName name of the field being analyzed; not consulted, all fields share one chain
 * @return the tokenizer paired with the last filter of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new StopFilter(result, stopwords);
    // The second constructor argument is a bitmask of configuration flags. This code used to
    // pass WordDelimiterFilter.ALPHA, which is a character-type constant (0x03) and only
    // worked because its value equals GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS. Naming the
    // flags keeps the behavior and makes the configuration readable.
    final int delimiterFlags =
        WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS;
    // null: no protected-words set is supplied.
    result = new WordDelimiterFilter(result, delimiterFlags, null);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
Use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by Apache.
Class TestStemmerOverrideFilter, method testIgnoreCase.
/**
 * Checks that an override registered with mixed case ("boOkEd" -> "books") is applied to a
 * differently-cased input token ("BooKeD") when the builder is created with ignoreCase = true.
 *
 * The override filter rewrites the token to "books" and also marks it with KeywordAttribute,
 * so the Porter stemmer that follows leaves it untouched.
 */
public void testIgnoreCase() throws IOException {
    final StemmerOverrideFilter.Builder overrides = new StemmerOverrideFilter.Builder(true);
    overrides.add("boOkEd", "books");

    final Tokenizer input = keywordTokenizer("BooKeD");
    final TokenStream overridden = new StemmerOverrideFilter(input, overrides.build());
    final TokenStream stemmed = new PorterStemFilter(overridden);

    assertTokenStreamContents(stemmed, new String[] { "books" });
}
Use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by Apache.
Class TestStemmerOverrideFilter, method testNoOverrides.
/**
 * Checks that a StemmerOverrideFilter built from an empty dictionary passes the token "book"
 * through so that the filter chain still emits "book".
 */
public void testNoOverrides() throws IOException {
    // No add() calls: the override dictionary is deliberately left empty.
    final StemmerOverrideFilter.Builder emptyOverrides = new StemmerOverrideFilter.Builder(true);

    final Tokenizer input = keywordTokenizer("book");
    final TokenStream overridden = new StemmerOverrideFilter(input, emptyOverrides.build());
    final TokenStream stemmed = new PorterStemFilter(overridden);

    assertTokenStreamContents(stemmed, new String[] { "book" });
}
Use of org.apache.lucene.analysis.en.PorterStemFilter in project lucene-solr by Apache.
Class TestStemmerOverrideFilter, method testRandomRealisticWhiteSpace.
/**
 * Randomized test of StemmerOverrideFilter behind a WhitespaceTokenizer.
 *
 * Builds an override dictionary whose keys are random Unicode strings with all whitespace
 * code points removed, then feeds a random non-empty subset of those keys, separated by
 * spaces, through WhitespaceTokenizer -> StemmerOverrideFilter -> PorterStemFilter and expects
 * each key to be emitted as its mapped value, in input order.
 *
 * @throws IOException if the token stream fails while being consumed
 */
public void testRandomRealisticWhiteSpace() throws IOException {
    Map<String, String> map = new HashMap<>();
    Set<String> seen = new HashSet<>();
    int numTerms = atLeast(50);
    boolean ignoreCase = random().nextBoolean();
    for (int i = 0; i < numTerms; i++) {
        String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
        char[] charArray = randomRealisticUnicodeString.toCharArray();
        // Copy the string code point by code point, dropping whitespace, so that the
        // whitespace tokenizer later sees each key as exactly one token.
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < charArray.length; ) {
            int cp = Character.codePointAt(charArray, j, charArray.length);
            if (!Character.isWhitespace(cp)) {
                builder.appendCodePoint(cp);
            }
            j += Character.charCount(cp);
        }
        if (builder.length() > 0) {
            String inputValue = builder.toString();
            // Make sure we don't try to add two inputs that vary only by case:
            String seenInputValue;
            if (ignoreCase) {
                // TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
                char[] buffer = inputValue.toCharArray();
                CharacterUtils.toLowerCase(buffer, 0, buffer.length);
                // Build the key from the array's contents. Calling toString() on a char[]
                // yields the array's identity string, which made every key unique and
                // silently disabled this duplicate check.
                seenInputValue = new String(buffer);
            } else {
                seenInputValue = inputValue;
            }
            // Set.add returns true only for values not already present.
            if (seen.add(seenInputValue)) {
                String value = TestUtil.randomSimpleString(random());
                // The mapped value is never left empty; "a" stands in for an empty random string.
                map.put(inputValue, value.isEmpty() ? "a" : value);
            }
        }
    }
    if (map.isEmpty()) {
        // Guarantee at least one override so the stream below is never empty.
        map.put("booked", "books");
    }
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
    Set<Entry<String, String>> entrySet = map.entrySet();
    StringBuilder input = new StringBuilder();
    List<String> output = new ArrayList<>();
    for (Entry<String, String> entry : entrySet) {
        builder.add(entry.getKey(), entry.getValue());
        // Randomly choose which keys to feed through the stream; the first entry is always
        // taken so that input and expected output are non-empty.
        if (random().nextBoolean() || output.isEmpty()) {
            input.append(entry.getKey()).append(" ");
            output.add(entry.getValue());
        }
    }
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(input.toString()));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, output.toArray(new String[0]));
}
Use of org.apache.lucene.analysis.en.PorterStemFilter in project nutch by Apache.
Class LuceneAnalyzerUtil, method createComponents.
/**
 * Assembles the token stream for a field: a {@code ClassicTokenizer}, then lower-casing, then
 * stop-word removal (only when {@code stopSet} is non-null), then the stemmer selected by
 * {@code stemFilterType}. Any other selector value leaves the stream unstemmed.
 *
 * @param fieldName name of the field being analyzed; not consulted by this implementation
 * @return the tokenizer paired with the last stage of the chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new ClassicTokenizer();
    final TokenStream lowerCased = new LowerCaseFilter(tokenizer);
    final TokenStream unstemmed =
        (stopSet == null) ? lowerCased : new StopFilter(lowerCased, stopSet);

    final TokenStream chain;
    switch (stemFilterType) {
        case PORTERSTEM_FILTER:
            chain = new PorterStemFilter(unstemmed);
            break;
        case ENGLISHMINIMALSTEM_FILTER:
            chain = new EnglishMinimalStemFilter(unstemmed);
            break;
        default:
            // No stemming requested.
            chain = unstemmed;
            break;
    }
    return new TokenStreamComponents(tokenizer, chain);
}
Aggregations