Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by Apache.
Class TestWordDelimiterGraphFilter, method testEmptyTerm.
public void testEmptyTerm() throws IOException {
  Random random = random();
  // 512 = 2^9: exercise every combination of the filter's nine configuration flag bits
  for (int i = 0; i < 512; i++) {
    final int flags = i;
    final CharArraySet protectedWords;
    if (random.nextBoolean()) {
      protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
      }
    };
    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    a.close();
  }
}
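For reference, a sketch of the nine flag bits the 0..511 loop enumerates. The constant names are defined on WordDelimiterGraphFilter; the inline examples paraphrase the flags' Javadoc, and the assumption that these nine bits are exactly what the loop covers follows from the 512 bound:

int allFlags = GENERATE_WORD_PARTS        // "PowerShot" -> "Power", "Shot"
    | GENERATE_NUMBER_PARTS               // "500-42" -> "500", "42"
    | CATENATE_WORDS                      // "wi-fi" -> "wifi"
    | CATENATE_NUMBERS                    // "500-42" -> "50042"
    | CATENATE_ALL                        // "wi-fi-4000" -> "wifi4000"
    | PRESERVE_ORIGINAL                   // also emit the unsplit original token
    | SPLIT_ON_CASE_CHANGE                // "camelCase" -> "camel", "Case"
    | SPLIT_ON_NUMERICS                   // "j2se" -> "j", "2", "se"
    | STEM_ENGLISH_POSSESSIVE;            // "O'Neil's" -> "O", "Neil"
// allFlags == 511, i.e. the last iteration of the loop above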
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by Apache.
Class TestWordDelimiterGraphFilter, method testRandomStrings.
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  int numIterations = atLeast(5);
  for (int i = 0; i < numIterations; i++) {
    final int flags = random().nextInt(512);
    final CharArraySet protectedWords;
    if (random().nextBoolean()) {
      protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
      }
    };
    // TODO: properly support positionLengthAttribute
    checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER, 20, false, false);
    a.close();
  }
}
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by Apache.
Class TestWordDelimiterGraphFilter, method testProtectedWords.
public void testProtectedWords() throws Exception {
  TokenStream tokens = new CannedTokenStream(new Token("foo17-bar", 0, 9), new Token("foo-bar", 0, 7));
  CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
  WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, DEFAULT_WORD_DELIM_TABLE,
      GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
  assertGraphStrings(wdf, "foo17-bar foo bar", "foo17-bar foo-bar", "foo17-bar foobar");
}
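Outside the canned-token setup, a minimal sketch (the analyzer wiring and the "wi-fi" entry are hypothetical) of the same mechanism: entries in the protected set, matched case-insensitively here because the constructor's second argument is true, pass through the filter unsplit:

CharArraySet prot = new CharArraySet(Arrays.asList("wi-fi"), true); // true = ignore case
Analyzer a = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tok = new WhitespaceTokenizer();
    return new TokenStreamComponents(tok,
        new WordDelimiterGraphFilter(tok, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL, prot));
  }
};
// "Wi-Fi e-mail" -> "Wi-Fi" (protected, kept whole), then "e-mail", "e", "mail"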
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by Apache.
Class TestWordDelimiterGraphFilter, method testLotsOfConcatenating2.
/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
  final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS
      | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
    }
  };
  assertAnalyzesTo(a, "abc-def-123-456",
      new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" }, // tokens
      new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },     // start offsets
      new int[] { 15, 15, 7, 3, 7, 15, 11, 15 }, // end offsets
      null,                                      // types (unchecked)
      new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },      // position increments
      null,                                      // position lengths (unchecked)
      false);                                    // offsetsAreCorrect
  a.close();
}
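To see where those offsets and position increments come from, a small sketch (same flags, same input) that walks the stream directly; a position increment of 0 means the token is stacked on the previous position, which is how the graph filter layers the catenated and preserved-original tokens over the parts:

Tokenizer tok = new WhitespaceTokenizer();
tok.setReader(new StringReader("abc-def-123-456"));
TokenStream ts = new WordDelimiterGraphFilter(tok, flags, null);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  // e.g. "abcdef123456 [0,15] +1" followed by "abc-def-123-456 [0,15] +0"
  System.out.printf("%s [%d,%d] +%d%n", term, offset.startOffset(), offset.endOffset(),
      posInc.getPositionIncrement());
}
ts.end();
ts.close();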
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by Apache.
Class TestWordDelimiterGraphFilter, method doSplit.
public void doSplit(final String input, String... output) throws Exception {
  int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input),
      WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
  assertTokenStreamContents(wdf, output);
}
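A few illustrative calls (hypothetical inputs; the expected splits follow from the flags set above, matching the behaviors documented for each flag):

doSplit("basic-split", "basic", "split");  // split on the delimiter character
doSplit("camelCase", "camel", "Case");     // SPLIT_ON_CASE_CHANGE
doSplit("j2se", "j", "2", "se");           // SPLIT_ON_NUMERICS
doSplit("O'Neil's", "O", "Neil");          // STEM_ENGLISH_POSSESSIVE strips the trailing 's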