Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.
From class TestStemmerOverrideFilter, method testOverride:
public void testOverride() throws IOException {
  // Let's make "booked" stem to "books": the override filter will convert
  // "booked" to "books", but also mark it with KeywordAttribute so Porter
  // will not change it.
  StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
  builder.add("booked", "books");
  // keywordTokenizer(...) is a helper in this test class that wraps the input in a KeywordTokenizer
  Tokenizer tokenizer = keywordTokenizer("booked");
  TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
  assertTokenStreamContents(stream, new String[] { "books" });
}
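The same pieces can be wired into a reusable Analyzer. A minimal sketch, assuming the Lucene analysis modules are on the classpath; the class name OverrideAnalyzerSketch is hypothetical, everything else is standard Lucene API:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;

public class OverrideAnalyzerSketch {

  // Wraps the override map and Porter stemming behind a reusable Analyzer.
  static Analyzer overrideAnalyzer(final StemmerOverrideMap map) {
    return new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        TokenStream stream = new StemmerOverrideFilter(tokenizer, map);
        return new TokenStreamComponents(tokenizer, new PorterStemFilter(stream));
      }
    };
  }

  public static void main(String[] args) throws IOException {
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
    builder.add("booked", "books");
    Analyzer analyzer = overrideAnalyzer(builder.build()); // build() compiles the map into an FST
    // analyzer.tokenStream("field", "booked") now yields "books"
    analyzer.close();
  }
}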
Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.
From class TestScandinavianNormalizationFilter, method testEmptyTerm:
/** check that the empty string doesn't cause issues */
public void testEmptyTerm() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new ScandinavianNormalizationFilter(tokenizer));
    }
  };
  checkOneTerm(a, "", "");
  a.close();
}
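checkOneTerm drives the analyzer through the test framework. Outside of a test, a KeywordTokenizer-based chain is consumed with the usual reset/incrementToken/end/close protocol. A minimal sketch; the class name and sample input are hypothetical:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.ScandinavianNormalizationFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumeStreamSketch {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader("räksmörgås")); // hypothetical sample input
    TokenStream stream = new ScandinavianNormalizationFilter(tokenizer);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                // required before the first incrementToken()
    while (stream.incrementToken()) {
      System.out.println(termAtt); // the single normalized keyword token
    }
    stream.end();                  // finalizes end-of-stream attributes
    stream.close();
  }
}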
Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.
From class TestStemmerOverrideFilter, method testRandomRealisticKeyword:
public void testRandomRealisticKeyword() throws IOException {
  Map<String, String> map = new HashMap<>();
  int numTerms = atLeast(50);
  for (int i = 0; i < numTerms; i++) {
    String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
    if (randomRealisticUnicodeString.length() > 0) {
      String value = TestUtil.randomSimpleString(random());
      map.put(randomRealisticUnicodeString, value.isEmpty() ? "a" : value);
    }
  }
  if (map.isEmpty()) {
    map.put("booked", "books");
  }
  // This test might fail if ignoreCase were true, since the map might contain
  // the same key twice, once lowercased and once uppercased.
  StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
  Set<Entry<String, String>> entrySet = map.entrySet();
  for (Entry<String, String> entry : entrySet) {
    builder.add(entry.getKey(), entry.getValue());
  }
  StemmerOverrideMap build = builder.build();
  for (Entry<String, String> entry : entrySet) {
    if (random().nextBoolean()) {
      Tokenizer tokenizer = new KeywordTokenizer();
      tokenizer.setReader(new StringReader(entry.getKey()));
      TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, build));
      assertTokenStreamContents(stream, new String[] { entry.getValue() });
    }
  }
}
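The ignoreCase caveat in the comment above can be seen directly: when the builder is created with ignoreCase=true it lowercases keys, so two inputs differing only in case collide, and add(...) signals the duplicate by returning false. A small sketch of that behavior, assuming the Builder API of the lucene-solr version shown here; the class name is hypothetical:

import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;

public class IgnoreCaseCollisionSketch {
  public static void main(String[] args) {
    // ignoreCase=true lowercases keys before storing them
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
    boolean first = builder.add("Booked", "books");  // true: new key
    boolean second = builder.add("booked", "books"); // false: collides once lowercased
    System.out.println(first + " " + second);        // expected: true false
  }
}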
Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.
From class NGramTokenFilterTest, method testEmptyTerm:
public void testEmptyTerm() throws Exception {
  Random random = random();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new KeywordTokenizer();
      return new TokenStreamComponents(tokenizer, new NGramTokenFilter(tokenizer, 2, 15));
    }
  };
  checkAnalysisConsistency(random, a, random.nextBoolean(), "");
  a.close();
}
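For a non-empty keyword token the same chain emits every gram between minGram and maxGram characters long. A small sketch, assuming this version's emission order (by start position, then by gram size, the same order the supplementary-characters test below iterates in); the class name and sample input are hypothetical:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NGramKeywordSketch {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader("abcd")); // hypothetical sample input
    TokenStream stream = new NGramTokenFilter(tokenizer, 2, 3);
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(termAtt); // ab, abc, bc, bcd, cd
    }
    stream.end();
    stream.close();
  }
}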
Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.
From class NGramTokenFilterTest, method testSupplementaryCharacters:
public void testSupplementaryCharacters() throws IOException {
  final String s = TestUtil.randomUnicodeString(random(), 10);
  final int codePointCount = s.codePointCount(0, s.length());
  final int minGram = TestUtil.nextInt(random(), 1, 3);
  final int maxGram = TestUtil.nextInt(random(), minGram, 10);
  TokenStream tk = new KeywordTokenizer();
  ((Tokenizer) tk).setReader(new StringReader(s));
  tk = new NGramTokenFilter(tk, minGram, maxGram);
  final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
  tk.reset();
  for (int start = 0; start < codePointCount; ++start) {
    for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      final int startIndex = Character.offsetByCodePoints(s, 0, start);
      final int endIndex = Character.offsetByCodePoints(s, 0, end);
      assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
    }
  }
  assertFalse(tk.incrementToken());
}
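The test indexes by Unicode code point rather than by Java char because a supplementary character occupies two chars (a surrogate pair), so char indices and code-point indices diverge. A standalone illustration using only the JDK; the sample string is hypothetical:

public class CodePointSketch {
  public static void main(String[] args) {
    // "a" followed by U+1D11E MUSICAL SYMBOL G CLEF, encoded as a surrogate pair
    String s = "a\uD834\uDD1E";
    System.out.println(s.length());                      // 3 chars
    System.out.println(s.codePointCount(0, s.length())); // 2 code points
    // Translate a code-point index into a char index, as the test does:
    int charIndex = Character.offsetByCodePoints(s, 0, 1);
    System.out.println(charIndex);                       // 1: the clef starts at char index 1
    System.out.println(s.substring(charIndex).length()); // 2: the surrogate pair
  }
}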