Usage of org.apache.lucene.analysis.core.LowerCaseTokenizer in the apache/lucene-solr project:
method testReadSupplementaryChars of class TestCharTokenizers.
/*
 * Tests that a surrogate pair is read without losing the pairing,
 * even when the pair straddles the boundary of the tokenizer's
 * internal 1024-char I/O buffer.
 */
public void testReadSupplementaryChars() throws IOException {
  StringBuilder builder = new StringBuilder();
  // Build random-length input of tokens containing "\uD801\uDC1C"
  // (U+1041C, a supplementary code point that UTF-16 encodes as a
  // high/low surrogate pair). A BMP character here would defeat the test.
  int num = 1024 + random().nextInt(1024);
  num *= RANDOM_MULTIPLIER;
  for (int i = 1; i < num; i++) {
    builder.append("\uD801\uDC1Cabc");
    if ((i % 10) == 0) {
      builder.append(" ");
    }
  }
  // The tokenizer's internal buffer size is 1024; insert a surrogate pair
  // so it sits exactly at the buffer border.
  builder.insert(1023, "\uD801\uDC1C");
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
Usage of org.apache.lucene.analysis.core.LowerCaseTokenizer in the apache/lucene-solr project:
method testWithKeywordAttribute of class TestBrazilianAnalyzer.
/*
 * Tests that SetKeywordMarkerFilter protects a term from stemming:
 * the keyword-marked "Bras\u00EDlia" passes through only lowercased,
 * while the unmarked "Brasilia" is stemmed down to "brasil".
 */
public void testWithKeywordAttribute() throws IOException {
  // Case-insensitive keyword set (ignoreCase = true) holding the protected term.
  CharArraySet set = new CharArraySet(1, true);
  set.add("Bras\u00EDlia"); // "Brasília" — must survive stemming intact
  Tokenizer tokenizer = new LowerCaseTokenizer();
  tokenizer.setReader(new StringReader("Bras\u00EDlia Brasilia"));
  BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(tokenizer, set));
  // First token is keyword-protected (lowercased only); second is stemmed.
  assertTokenStreamContents(filter, new String[] { "bras\u00EDlia", "brasil" });
}
Aggregations