Use of org.apache.lucene.analysis.core.LetterTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testCrossPlaneNormalization2.
// LUCENE-3642: normalize BMP->SMP and check that offsets are correct
public void testCrossPlaneNormalization2() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
        @Override
        protected int normalize(int c) {
          if (c <= 0xffff) {
            return 0x1043C;
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = TestUtil.randomUnicodeString(random());
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
    }
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
  analyzer.close();
}
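The test overrides CharTokenizer's normalize(int) hook, which this Lucene version applies to every accepted code point before it is written into the term, while the offsets keep tracking the original input. A minimal standalone sketch of the same hook (the class name, sample text, and the lower-casing rule are illustrative, not taken from the test):

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NormalizeSketch {
  public static void main(String[] args) throws Exception {
    // Lower-case every accepted code point via the normalize(int) hook (illustrative rule).
    Tokenizer tok = new LetterTokenizer() {
      @Override
      protected int normalize(int c) {
        return Character.toLowerCase(c);
      }
    };
    tok.setReader(new StringReader("Foo BAR42baz"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term.toString()); // prints: foo, bar, baz
    }
    tok.end();
    tok.close();
  }
}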
Use of org.apache.lucene.analysis.core.LetterTokenizer in project lucene-solr by apache.
From the class EdgeNGramTokenFilterTest, method testGraphs.
public void testGraphs() throws IOException {
  TokenStream tk = new LetterTokenizer();
  ((Tokenizer) tk).setReader(new StringReader("abc d efgh ij klmno p q"));
  tk = new ShingleFilter(tk);
  tk = new EdgeNGramTokenFilter(tk, 7, 10);
  assertTokenStreamContents(tk,
      new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" },
      new int[] { 6, 11, 11, 14 },
      new int[] { 13, 19, 19, 21 },
      new int[] { 3, 1, 0, 1 },
      new int[] { 2, 2, 2, 2 },
      23);
}
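A note on the expected tokens (reasoning from the default ShingleFilter settings, not spelled out in the test): LetterTokenizer emits the seven letter runs, ShingleFilter adds two-word shingles such as "abc d" and "efgh ij" alongside the unigrams, and EdgeNGramTokenFilter(tk, 7, 10) keeps only 7 to 10 character prefixes of each incoming token. Anything shorter than seven characters produces no output, which is why only "efgh ij", "ij klmn", "ij klmno" and "klmno p" survive, with the offsets still pointing at the full original shingles and the position increments accumulating over the dropped tokens.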
Use of org.apache.lucene.analysis.core.LetterTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testCustomMaxTokenLength.
/*
 * Tests the max token length passed as a parameter - the tokenizer must split
 * at that length no matter what.
 */
public void testCustomMaxTokenLength() throws IOException {
  StringBuilder builder = new StringBuilder();
  for (int i = 0; i < 100; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
  // Tricky, passing two copies of the string to the reader....
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });

  Exception e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), -1));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());

  tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString(), builder.toString() });

  // Let's test that we can get a token longer than 255 through.
  builder.setLength(0);
  for (int i = 0; i < 500; i++) {
    builder.append("Z");
  }
  tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });

  // Just to be sure what is happening here, token lengths of zero make no sense,
  // let's try the edge cases, token > I/O buffer (4096)
  builder.setLength(0);
  for (int i = 0; i < 600; i++) {
    // 600 * 8 = 4800 chars.
    builder.append("aUrOkIjq");
  }

  e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());

  tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });

  e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 10_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());

  tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });

  e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 2_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());

  tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });

  e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());

  tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
}
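A minimal standalone sketch of the maxTokenLen behaviour the test relies on, using the same two-argument LetterTokenizer constructor (the class name and sample string are illustrative): a cap of 5 splits a 12-letter run into tokens of 5, 5 and 2 characters rather than truncating it.

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class MaxTokenLenSketch {
  public static void main(String[] args) throws Exception {
    // LetterTokenizer(AttributeFactory, maxTokenLen): split letter runs every 5 chars.
    Tokenizer tok = new LetterTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 5);
    tok.setReader(new StringReader("abcdefghijkl mn"));
    CharTermAttribute term = tok.addAttribute(CharTermAttribute.class);
    tok.reset();
    while (tok.incrementToken()) {
      System.out.println(term.toString()); // prints: abcde, fghij, kl, mn
    }
    tok.end();
    tok.close();
  }
}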
Use of org.apache.lucene.analysis.core.LetterTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testCrossPlaneNormalization.
// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
public void testCrossPlaneNormalization() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {
        @Override
        protected int normalize(int c) {
          if (c > 0xffff) {
            return 'δ';
          } else {
            return c;
          }
        }
      };
      return new TokenStreamComponents(tokenizer, tokenizer);
    }
  };
  int num = 1000 * RANDOM_MULTIPLIER;
  for (int i = 0; i < num; i++) {
    String s = TestUtil.randomUnicodeString(random());
    try (TokenStream ts = analyzer.tokenStream("foo", s)) {
      ts.reset();
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
        for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
          cp = highlightedText.codePointAt(j);
          assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
        }
      }
      ts.end();
    }
  }
  // just for fun
  checkRandomData(random(), analyzer, num);
  analyzer.close();
}
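Taken together, the two LUCENE-3642 tests exercise normalization in both directions: the first maps every BMP code point to a supplementary-plane letter (0x1043C), this one maps every supplementary code point down to the BMP letter 'δ'. One way to read the shared assertion loop (this is an interpretation, not stated in the test comments): normalize(int) changes how many UTF-16 chars a code point occupies in the term, but the reported offsets must still delimit the original input, so the substring selected by startOffset/endOffset only consists of letters if the tokenizer kept its offsets aligned while crossing plane boundaries.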