Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testMaxWordLength.
/*
 * tests the default max word length of 255 - the tokenizer will split at 255 chars no matter what
 */
public void testMaxWordLength() throws IOException {
  StringBuilder builder = new StringBuilder();
  for (int i = 0; i < 255; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
}
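For context, a minimal sketch (not part of the test) of how the same 255-character split could be observed by consuming the TokenStream directly instead of through the assertTokenStreamContents helper; it assumes the usual reset/incrementToken/end/close contract and CharTermAttribute from org.apache.lucene.analysis.tokenattributes:

Tokenizer t = new LowerCaseTokenizer(newAttributeFactory());
t.setReader(new StringReader(builder.toString() + builder.toString())); // 510 chars of input
CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
t.reset();
while (t.incrementToken()) {
  System.out.println(term.length()); // with the default limit, prints 255 twice
}
t.end();
t.close();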
Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testMaxWordLengthWithSupplementary.
/*
* tests the max word length of 255 with a surrogate pair at position 255
*/
public void testMaxWordLengthWithSupplementary() throws IOException {
  StringBuilder builder = new StringBuilder();
  for (int i = 0; i < 254; i++) {
    builder.append("A");
  }
  builder.append("𐐜");
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
}
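As a side note (illustrative only, not part of the test), the appended character is a supplementary codepoint, so it counts as one codepoint but two UTF-16 chars; appending it after 254 ASCII chars makes the surrogate pair straddle the 255-char boundary:

String supplementary = "𐐜";
assert supplementary.length() == 2;                                   // two UTF-16 chars (a surrogate pair)
assert supplementary.codePointCount(0, supplementary.length()) == 1;  // but a single codepoint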
Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in project lucene-solr by apache.
From the class TestGermanAnalyzer, method testWithKeywordAttribute.
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("fischen");
  final LowerCaseTokenizer in = new LowerCaseTokenizer();
  in.setReader(new StringReader("Fischen Trinken"));
  GermanStemFilter filter = new GermanStemFilter(new SetKeywordMarkerFilter(in, set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
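A minimal sketch (the anonymous Analyzer below is illustrative, not part of the test) of how the same keyword-marker plus stemmer chain could be packaged as a reusable Analyzer instead of being built inline:

Analyzer analyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new LowerCaseTokenizer();
    // terms in "set" are marked as keywords and skipped by the stemmer
    TokenStream result = new GermanStemFilter(new SetKeywordMarkerFilter(source, set));
    return new TokenStreamComponents(source, result);
  }
};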
Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testCustomMaxTokenLength.
/*
 * tests the max word length passed as a parameter - the tokenizer will split at that length no matter what
 */
public void testCustomMaxTokenLength() throws IOException {
  StringBuilder builder = new StringBuilder();
  for (int i = 0; i < 100; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
  // Tricky: the reader is given two concatenated copies of the string.
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
  Exception e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), -1));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
  tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString(), builder.toString() });
  // Check that a token longer than 255 chars can get through.
  builder.setLength(0);
  for (int i = 0; i < 500; i++) {
    builder.append("Z");
  }
  tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
  // Token lengths of zero make no sense; also try the edge case of a token longer than the I/O buffer (4096 chars).
  builder.setLength(0);
  for (int i = 0; i < 600; i++) {
    // 600 * 8 = 4800 chars.
    builder.append("aUrOkIjq");
  }
  e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
  tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
  e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 10_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
  tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
  e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 2_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
  tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
  e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
  tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
}
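A hedged sketch of the same limit configured declaratively rather than through the constructor; this assumes the tokenizer factories expose a maxTokenLen parameter mirroring the constructor argument tested above:

Analyzer analyzer = CustomAnalyzer.builder()
    .withTokenizer("lowercase", "maxTokenLen", "100") // assumes LowerCaseTokenizerFactory accepts maxTokenLen
    .build();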
Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testExtendCharBuffer.
/*
 * tests extending the TermAttribute's internal char buffer. If the algorithm
 * that grows the char array extended it by only one char and the next char to
 * be written were a supplementary codepoint (which occupies two chars), an
 * index-out-of-bounds exception would be triggered.
 */
public void testExtendCharBuffer() throws IOException {
  for (int i = 0; i < 40; i++) {
    StringBuilder builder = new StringBuilder();
    for (int j = 0; j < 1 + i; j++) {
      builder.append("a");
    }
    builder.append("𐐜abc");
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
  }
}
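Illustrative only: the supplementary character appended above needs two chars of term-buffer headroom, which is why growing the buffer one char at a time would overflow while writing the surrogate pair:

int cp = "𐐜".codePointAt(0);
assert Character.isSupplementaryCodePoint(cp); // outside the BMP
assert Character.toChars(cp).length == 2;      // needs two chars in the buffer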