Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project lucene-solr by apache.
From the class TestCharTokenizers, method testCustomMaxTokenLength.
/*
 * Tests the max token length passed as a parameter - the tokenizer splits at that length
 * no matter what the input looks like.
 */
public void testCustomMaxTokenLength() throws IOException {
  StringBuilder builder = new StringBuilder();
  for (int i = 0; i < 100; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
  // Tricky: two concatenated copies of the string are passed to the reader...
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
  Exception e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), -1));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
  tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString(), builder.toString() });
  // Check that a token longer than the 255-char default gets through intact.
  builder.setLength(0);
  for (int i = 0; i < 500; i++) {
    builder.append("Z");
  }
  tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
  // Edge cases: a token length of zero (or out of range) makes no sense and must be rejected,
  // and a token longer than the I/O buffer (4096 chars) must still come through whole.
  builder.setLength(0);
  for (int i = 0; i < 600; i++) {
    // 600 * 8 = 4800 chars.
    builder.append("aUrOkIjq");
  }
  e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
  tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
  e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 10_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
  tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
  e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 2_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
  tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
  e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 0));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
  e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
  assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
  tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
}
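For readers who want to try the maxTokenLen behavior outside the Lucene test framework, here is a minimal, hedged sketch (not part of the test above). It assumes the WhitespaceTokenizer(AttributeFactory, int) constructor exercised by the test and plain TokenStream consumption; the class name CustomMaxTokenLenDemo and the sample strings are made up for illustration.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class CustomMaxTokenLenDemo {
  public static void main(String[] args) throws IOException {
    // A 25-char run of 'a' to exceed the 10-char limit configured below.
    StringBuilder longWord = new StringBuilder();
    for (int i = 0; i < 25; i++) {
      longWord.append('a');
    }
    // maxTokenLen = 10: any whitespace-free run longer than 10 chars is split into 10-char chunks.
    WhitespaceTokenizer tokenizer =
        new WhitespaceTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 10);
    tokenizer.setReader(new StringReader("short " + longWord));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    List<String> tokens = new ArrayList<>();
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      tokens.add(term.toString());
    }
    tokenizer.end();
    tokenizer.close();
    System.out.println(tokens); // expected: [short, aaaaaaaaaa, aaaaaaaaaa, aaaaa]
  }
}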
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project lucene-solr by apache.
From the class TestDaitchMokotoffSoundexFilter, method assertAlgorithm.
static void assertAlgorithm(boolean inject, String input, String[] expected) throws Exception {
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input));
  DaitchMokotoffSoundexFilter filter = new DaitchMokotoffSoundexFilter(tokenizer, inject);
  assertTokenStreamContents(filter, expected);
}
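As a hedged illustration of what this helper wires together (not code from the test class), the same pipeline can also be consumed directly. The class name SoundexFilterDemo and the sample input are invented, and the emitted Daitch-Mokotoff codes are printed rather than asserted, since the exact codes come from the underlying encoder.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.DaitchMokotoffSoundexFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SoundexFilterDemo {
  public static void main(String[] args) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Peters Peterson"));
    // inject = true keeps the original tokens in the stream alongside the phonetic codes.
    DaitchMokotoffSoundexFilter filter = new DaitchMokotoffSoundexFilter(tokenizer, true);
    CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
    filter.reset();
    while (filter.incrementToken()) {
      System.out.println(term.toString());
    }
    filter.end();
    filter.close();
  }
}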
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project lucene-solr by apache.
From the class TestStemmerOverrideFilter, method testRandomRealisticWhiteSpace.
public void testRandomRealisticWhiteSpace() throws IOException {
  Map<String, String> map = new HashMap<>();
  Set<String> seen = new HashSet<>();
  int numTerms = atLeast(50);
  boolean ignoreCase = random().nextBoolean();
  for (int i = 0; i < numTerms; i++) {
    String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
    char[] charArray = randomRealisticUnicodeString.toCharArray();
    StringBuilder builder = new StringBuilder();
    for (int j = 0; j < charArray.length; ) {
      int cp = Character.codePointAt(charArray, j, charArray.length);
      if (!Character.isWhitespace(cp)) {
        builder.appendCodePoint(cp);
      }
      j += Character.charCount(cp);
    }
    if (builder.length() > 0) {
      String inputValue = builder.toString();
      // Make sure we don't try to add two inputs that vary only by case:
      String seenInputValue;
      if (ignoreCase) {
        // TODO: can we simply use inputValue.toLowerCase(Locale.ROOT)???
        char[] buffer = inputValue.toCharArray();
        CharacterUtils.toLowerCase(buffer, 0, buffer.length);
        // Use new String(buffer) rather than buffer.toString(): calling toString() on a
        // char[] yields the array's identity, not its contents, which would defeat the dedup.
        seenInputValue = new String(buffer);
      } else {
        seenInputValue = inputValue;
      }
      if (seen.contains(seenInputValue) == false) {
        seen.add(seenInputValue);
        String value = TestUtil.randomSimpleString(random());
        map.put(inputValue, value.isEmpty() ? "a" : value);
      }
    }
  }
  if (map.isEmpty()) {
    map.put("booked", "books");
  }
  StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(ignoreCase);
  Set<Entry<String, String>> entrySet = map.entrySet();
  StringBuilder input = new StringBuilder();
  List<String> output = new ArrayList<>();
  for (Entry<String, String> entry : entrySet) {
    builder.add(entry.getKey(), entry.getValue());
    if (random().nextBoolean() || output.isEmpty()) {
      input.append(entry.getKey()).append(" ");
      output.add(entry.getValue());
    }
  }
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(input.toString()));
  TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
  assertTokenStreamContents(stream, output.toArray(new String[0]));
}
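For a deterministic picture of what the randomized test exercises, here is a minimal sketch (not taken from the test class): a single override is registered and applied ahead of the Porter stemmer. The class name OverrideDemo and the sample words are illustrative, and the expected output noted in the comments assumes that StemmerOverrideFilter marks overridden tokens as keywords so the downstream stemmer leaves them alone.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class OverrideDemo {
  public static void main(String[] args) throws IOException {
    // ignoreCase = true, so "Booked" matches the "booked" entry below.
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true);
    builder.add("booked", "books"); // override: "booked" -> "books" instead of Porter's "book"
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Booked flights"));
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString()); // expected: books, then flight
    }
    stream.end();
    stream.close();
  }
}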
Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project Anserini by castorini.
From the class TRECAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer source = new WhitespaceTokenizer();
  TokenStream filter = new TweetLowerCaseEntityPreservingFilter(source);
  return new TokenStreamComponents(source, filter);
}
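To show how an Analyzer built from this createComponents pattern is typically consumed, here is a hedged standalone sketch: Lucene's stock LowerCaseFilter stands in for Anserini's TweetLowerCaseEntityPreservingFilter purely to keep the example self-contained (the LowerCaseFilter import path varies by Lucene version), and the field name, class name WhitespaceAnalyzerDemo, and sample text are made up.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();
        // Stand-in for TweetLowerCaseEntityPreservingFilter, which lives in Anserini.
        TokenStream filter = new LowerCaseFilter(source);
        return new TokenStreamComponents(source, filter);
      }
    };
    // Tokenize a tweet-like string; WhitespaceTokenizer keeps @mentions and URLs as single tokens.
    try (TokenStream stream = analyzer.tokenStream("text", "Hello @User http://example.com")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term.toString());
      }
      stream.end();
    }
    analyzer.close();
  }
}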