Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache: class TestWordDelimiterGraphFilter, method testEmptyString.
public void testEmptyString() throws Exception {
  WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(
      new CannedTokenStream(new Token("", 0, 0)),
      DEFAULT_WORD_DELIM_TABLE,
      GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL,
      null);
  wdf.reset();
  assertTrue(wdf.incrementToken());  // PRESERVE_ORIGINAL keeps the (empty) original token
  assertFalse(wdf.incrementToken()); // nothing else to emit
  wdf.end();
  wdf.close();
}
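The reset()/incrementToken()/end()/close() calls above are the standard TokenStream consumer lifecycle. As a minimal, self-contained sketch of that loop (my own example, not part of the test; the input text "Wi-Fi" is an arbitrary illustration), running a hyphenated word through the filter and printing each emitted term:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WdgfConsumeSketch {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Wi-Fi"));
    int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS
        | WordDelimiterGraphFilter.CATENATE_ALL
        | WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
    try (TokenStream ts = new WordDelimiterGraphFilter(tokenizer, flags, null)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();                   // required before the first incrementToken()
      while (ts.incrementToken()) { // false once the stream is exhausted
        System.out.println(term.toString());
      }
      ts.end();                     // records the final offset state
    }                               // close() via try-with-resources
  }
}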
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache: class TestWordDelimiterGraphFilter, method testRandomHugeStrings.
/** blast some enormous random strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
  int numIterations = atLeast(5);
  for (int i = 0; i < numIterations; i++) {
    final int flags = random().nextInt(512);
    final CharArraySet protectedWords;
    if (random().nextBoolean()) {
      protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        TokenStream wdgf = new WordDelimiterGraphFilter(tokenizer, flags, protectedWords);
        return new TokenStreamComponents(tokenizer, wdgf);
      }
    };
    // TODO: properly support positionLengthAttribute
    checkRandomData(random(), a, 20 * RANDOM_MULTIPLIER, 8192, false, false);
    a.close();
  }
}
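random().nextInt(512) works because the filter's configuration is a bitmask of nine single-bit flags, and 512 = 2^9, so every combination gets sampled over enough iterations. As a reference sketch, the full mask written out explicitly; the per-flag behaviors in the comments follow the class's Javadoc examples:

int allFlags = GENERATE_WORD_PARTS  // emit word subwords: "PowerShot" => "Power", "Shot"
    | GENERATE_NUMBER_PARTS         // emit number subwords: "500-42" => "500", "42"
    | CATENATE_WORDS                // also join word parts: "wi-fi" => "wifi"
    | CATENATE_NUMBERS              // also join number parts: "500-42" => "50042"
    | CATENATE_ALL                  // also join everything: "wi-fi-4000" => "wifi4000"
    | PRESERVE_ORIGINAL             // also keep the untouched input token
    | SPLIT_ON_CASE_CHANGE          // "fooBar" splits at the case change
    | SPLIT_ON_NUMERICS             // "j2se" => "j", "2", "se"
    | STEM_ENGLISH_POSSESSIVE;      // "O'Neil's" => "O", "Neil"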
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache: class TestWordDelimiterGraphFilter, method testOffsets.
public void testOffsets() throws IOException {
  int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  // test that subwords and catenated subwords have the correct offsets
  WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(
      new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
  assertTokenStreamContents(wdf, new String[] { "foobar", "foo", "bar" },
      new int[] { 5, 5, 9 }, new int[] { 12, 8, 12 });
  // with illegal offsets (the token's offset span is shorter than its text,
  // so every subword falls back to the original token's offsets):
  wdf = new WordDelimiterGraphFilter(
      new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
  assertTokenStreamContents(wdf, new String[] { "foobar", "foo", "bar" },
      new int[] { 5, 5, 5 }, new int[] { 6, 6, 6 });
}
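For a by-hand version of the first assertion, the same offsets can be read directly from OffsetAttribute. A sketch, assuming the same test scope as above (CannedTokenStream, Token, DEFAULT_WORD_DELIM_TABLE, and flags as defined in the method):

TokenStream ts = new WordDelimiterGraphFilter(
    new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  // expected, per the assertion above: foobar [5,12)  foo [5,8)  bar [9,12)
  System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
}
ts.end();
ts.close();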
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache: class TestWordDelimiterGraphFilter, method testLotsOfConcatenating.
/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
  final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS
      | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
    }
  };
  assertAnalyzesTo(a, "abc-def-123-456",
      new String[] { "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" },
      new int[] { 0, 0, 0, 4, 8, 8, 12 },    // start offsets
      new int[] { 15, 7, 3, 7, 15, 11, 15 }, // end offsets
      null,                                  // types (not checked)
      new int[] { 1, 0, 0, 1, 1, 0, 1 },     // position increments
      null,                                  // position lengths (not checked)
      false);                                // graph offsets not fully verified
  a.close();
}
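The { 1, 0, 0, 1, 1, 0, 1 } array holds position increments: an increment of 0 stacks a token at the same position as the previous one, which is how the catenated forms overlay the subwords they cover. A sketch that prints each term with its increment, reusing the analyzer a from this test (the field name "field" is arbitrary; MockTokenizer ignores it):

try (TokenStream ts = a.tokenStream("field", "abc-def-123-456")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    // expected: abcdef123456 +1, abcdef +0, abc +0, def +1, 123456 +1, 123 +0, 456 +1
    System.out.println(term + " +" + posInc.getPositionIncrement());
  }
  ts.end();
}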
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache: class TestWordDelimiterGraphFilter, method testOffsetChange4.
public void testOffsetChange4() throws Exception {
  int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(
      new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
  assertTokenStreamContents(wdf, new String[] { "foobar", "foo", "bar" },
      new int[] { 8, 8, 12 }, new int[] { 15, 11, 15 });
}
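The asserted numbers follow from simple arithmetic (my own working, consistent with the assertion above). The canned token "(foo,bar)" is reported at external offsets [7,16), and each subword's offsets are the token's start offset plus the subword's index range inside the token text:

"foo" sits at indices [1,4) of "(foo,bar)", so its offsets are [7+1, 7+4) = [8,11)
"bar" sits at indices [5,8), so its offsets are [7+5, 7+8) = [12,15)
"foobar" (from CATENATE_ALL) spans the first subword's start to the last subword's end: [8,15)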