Use of org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer in project lucene-solr by apache.
From the class TestUAX29URLEmailTokenizer, method testHugeDoc.
/**
 * Tokenizes an input made of 4094 leading spaces followed by {@code "testing 1234"} and
 * asserts that exactly the terms {@code "testing"} and {@code "1234"} come out.
 *
 * @throws IOException declared for the reader set-up and the token stream assertion helper
 */
public void testHugeDoc() throws IOException {
  // Long run of blanks placed in front of the two real terms.
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new StringBuilder().append(padding).append("testing 1234").toString();

  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(input));

  String[] expectedTerms = { "testing", "1234" };
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, expectedTerms);
}
Use of org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer in project lucene-solr by apache.
From the class TestUAX29URLEmailTokenizer, method setUp.
/**
 * Creates the three analyzers used by the tests, each backed by a fresh
 * {@link UAX29URLEmailTokenizer}:
 * <ul>
 *   <li>{@code a} - the bare tokenizer;</li>
 *   <li>{@code urlAnalyzer} - the tokenizer with its max token length raised to
 *       {@code MAX_TOKEN_LENGTH_LIMIT}, wrapped in a {@code URLFilter};</li>
 *   <li>{@code emailAnalyzer} - the tokenizer wrapped in an {@code EmailFilter}.</li>
 * </ul>
 */
@Override
public void setUp() throws Exception {
  super.setUp();

  // Plain analyzer: tokenizer output is used as-is.
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new UAX29URLEmailTokenizer(newAttributeFactory());
      return new TokenStreamComponents(source);
    }
  };

  // URL analyzer: tokenizer output is passed through URLFilter.
  urlAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(newAttributeFactory());
      // Lift the token length cap to its limit so arbitrary length URLs are tokenized.
      source.setMaxTokenLength(UAX29URLEmailTokenizer.MAX_TOKEN_LENGTH_LIMIT);
      return new TokenStreamComponents(source, new URLFilter(source));
    }
  };

  // Email analyzer: tokenizer output is passed through EmailFilter.
  emailAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      UAX29URLEmailTokenizer source = new UAX29URLEmailTokenizer(newAttributeFactory());
      return new TokenStreamComponents(source, new EmailFilter(source));
    }
  };
}
Use of org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer in project lucene-solr by apache.
From the class TestUAX29URLEmailTokenizer, method testLongEMAILatomText.
// LUCENE-5440: extremely slow tokenization of text matching email <local-part> (before the '@')
/**
 * Feeds the tokenizer a long random string built only from email atom-text characters,
 * first with the tokenizer's default settings and then again after setting a random
 * max token length, and checks that each pass yields at least one token.
 */
@Slow
public void testLongEMAILatomText() throws Exception {
  // EMAILatomText = [A-Za-z0-9!#$%&'*+-/=?\^_`{|}~]
  // The "+-/" part is a range, which is why ',' and '.' appear in the alphabet below.
  final char[] atomChars = "!#$%&'*+,-./0123456789=?ABCDEFGHIJKLMNOPQRSTUVWXYZ^_`abcdefghijklmnopqrstuvwxyz{|}~".toCharArray();

  // Random length between 100 KiB and 3 MiB characters.
  final int length = TestUtil.nextInt(random(), 100 * 1024, 3 * 1024 * 1024);
  StringBuilder sb = new StringBuilder();
  for (int remaining = length; remaining > 0; --remaining) {
    sb.append(atomChars[random().nextInt(atomChars.length)]);
  }
  final String text = sb.toString();

  UAX29URLEmailTokenizer ts = new UAX29URLEmailTokenizer();

  // Pass 1: tokenizer as constructed.
  ts.setReader(new StringReader(text));
  ts.reset();
  int firstPassTokens = 0;
  while (ts.incrementToken()) {
    firstPassTokens += 1;
  }
  ts.end();
  ts.close();
  assertTrue(firstPassTokens > 0);

  // Pass 2: same tokenizer instance, reused with a randomly chosen max token length.
  final int newBufferSize = TestUtil.nextInt(random(), 200, 8192);
  ts.setMaxTokenLength(newBufferSize);
  ts.setReader(new StringReader(text));
  ts.reset();
  int secondPassTokens = 0;
  while (ts.incrementToken()) {
    secondPassTokens += 1;
  }
  ts.end();
  ts.close();
  assertTrue(secondPassTokens > 0);
}
Use of org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer in project elasticsearch by elastic.
From the class UAX29URLEmailTokenizerFactory, method create.
/**
 * Builds a new {@link UAX29URLEmailTokenizer} and applies this factory's
 * {@code maxTokenLength} to it.
 *
 * @return the newly created tokenizer
 */
@Override
public Tokenizer create() {
  final UAX29URLEmailTokenizer urlEmailTokenizer = new UAX29URLEmailTokenizer();
  urlEmailTokenizer.setMaxTokenLength(maxTokenLength);
  return urlEmailTokenizer;
}
Use of org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer in project crate by crate.
From the class UAX29URLEmailTokenizerFactory, method create.
/**
 * Creates a fresh {@link UAX29URLEmailTokenizer} with its max token length set from
 * the {@code maxTokenLength} value held by this factory.
 *
 * @return a new tokenizer instance
 */
@Override
public Tokenizer create() {
  final UAX29URLEmailTokenizer result = new UAX29URLEmailTokenizer();
  result.setMaxTokenLength(maxTokenLength);
  return result;
}
Aggregations