Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by Apache.
The class TestWordDelimiterFilter, method testLotsOfConcatenating2.
/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
  final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS
      | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;

  /* analyzer that uses whitespace + wdf */
  Analyzer a = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };

  assertAnalyzesTo(a, "abc-def-123-456",
      new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" },
      new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
      new int[] { 15, 3, 7, 15, 7, 11, 15, 15 },
      null,
      new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
      null,
      false);
  a.close();
}
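For running the same pipeline outside the test framework, here is a minimal standalone sketch. It assumes Lucene's analyzers-common module is on the classpath and substitutes WhitespaceTokenizer for the test-only MockTokenizer; the WdfDemo class name is illustrative, not part of lucene-solr. (Recent Lucene deprecates WordDelimiterFilter in favor of WordDelimiterGraphFilter, which accepts the same flags.)

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.*;

public class WdfDemo {
  public static void main(String[] args) throws Exception {
    int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS
        | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL
        | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("abc-def-123-456"));
    try (TokenStream ts = new WordDelimiterFilter(tokenizer, flags, null)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // print each term with its [start,end) offsets and position increment
        System.out.printf("%s [%d,%d] +%d%n",
            term, offset.startOffset(), offset.endOffset(), posInc.getPositionIncrement());
      }
      ts.end();
    }
  }
}

The printed terms, offsets, and increments should match the arrays asserted in the test above.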
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by Apache.
The class TestWordDelimiterFilter, method testOffsets.
/*
public void testPerformance() throws IOException {
  String s = "now is the time-for all good men to come to-the aid of their country.";
  Token tok = new Token();
  long start = System.currentTimeMillis();
  int ret = 0;
  for (int i = 0; i < 1000000; i++) {
    StringReader r = new StringReader(s);
    TokenStream ts = new WhitespaceTokenizer(r);
    ts = new WordDelimiterFilter(ts, 1, 1, 1, 1, 0);
    while (ts.next(tok) != null) ret++;
  }
  System.out.println("ret=" + ret + " time=" + (System.currentTimeMillis() - start));
}
***/
@Test
public void testOffsets() throws IOException {
  int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  // test that subwords and catenated subwords have
  // the correct offsets.
  WordDelimiterFilter wdf = new WordDelimiterFilter(
      new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
  assertTokenStreamContents(wdf,
      new String[] { "foo", "foobar", "bar" },
      new int[] { 5, 5, 9 },
      new int[] { 8, 12, 12 });

  // with illegal offsets (the 5..6 span is shorter than the 7-char term),
  // the filter falls back to the original token's offsets for every part:
  wdf = new WordDelimiterFilter(
      new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
  assertTokenStreamContents(wdf,
      new String[] { "foo", "bar", "foobar" },
      new int[] { 5, 5, 5 },
      new int[] { 6, 6, 6 });
}
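CannedTokenStream and Token here come from lucene-test-framework. Outside the tests, a hand-rolled single-token stream can stand in for them; OneTokenStream below is a hypothetical sketch of such a stream, not a Lucene class:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Emits exactly one token with caller-supplied text and offsets, then exhausts.
final class OneTokenStream extends TokenStream {
  private final CharTermAttribute term = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offset = addAttribute(OffsetAttribute.class);
  private final String text;
  private final int start, end;
  private boolean done;

  OneTokenStream(String text, int start, int end) {
    this.text = text;
    this.start = start;
    this.end = end;
  }

  @Override
  public boolean incrementToken() {
    if (done) {
      return false;
    }
    done = true;
    clearAttributes();
    term.setEmpty().append(text);
    offset.setOffset(start, end);
    return true;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    done = false;
  }
}

Feeding new OneTokenStream("foo-bar", 5, 12) through the same WordDelimiterFilter constructor reproduces the offsets asserted above.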
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by Apache.
The class TestWordDelimiterFilter, method testEmptyTerm.
public void testEmptyTerm() throws IOException {
  Random random = random();
  for (int i = 0; i < 512; i++) {
    final int flags = i;
    final CharArraySet protectedWords;
    if (random.nextBoolean()) {
      protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
    } else {
      protectedWords = null;
    }
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new KeywordTokenizer();
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
      }
    };
    // depending upon options, this thing may or may not preserve the empty term
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    a.close();
  }
}
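The fuzz loop above only checks consistency across all 512 flag combinations; to see the protected-words set actually change the output, here is a small sketch. The example words are hypothetical, and note that the CharArraySet import is org.apache.lucene.analysis.CharArraySet in recent Lucene but org.apache.lucene.analysis.util.CharArraySet in older releases:

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;

public class ProtectedWordsDemo {
  public static void main(String[] args) throws Exception {
    // "wi-fi" is protected, so it passes through intact; "foo-bar" is split.
    CharArraySet protWords = new CharArraySet(Arrays.asList("wi-fi"), true);
    Tokenizer tok = new WhitespaceTokenizer();
    tok.setReader(new StringReader("wi-fi foo-bar"));
    try (TokenStream ts = new WordDelimiterFilter(tok, GENERATE_WORD_PARTS, protWords)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term);  // wi-fi, foo, bar
      }
      ts.end();
    }
  }
}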
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by Apache.
The class TestWordDelimiterFilter, method testNumberPunct.
public void testNumberPunct() throws Exception {
  final int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };
  // the trailing "-" is a pure delimiter and is dropped; only the number part survives
  assertAnalyzesTo(a, "6-", new String[] { "6" }, new int[] { 0 }, new int[] { 1 }, null, new int[] { 1 }, null, false);
  a.close();
}
Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by Apache.
The class TestWordDelimiterFilter, method testOffsetChange4.
@Test
public void testOffsetChange4() throws Exception {
  int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  WordDelimiterFilter wdf = new WordDelimiterFilter(
      new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
  assertTokenStreamContents(wdf,
      new String[] { "foo", "foobar", "bar" },
      new int[] { 8, 8, 12 },
      new int[] { 11, 15, 15 });
}
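The asserted numbers follow from simple arithmetic: each subword's offsets are the original token's start offset plus the subword's character positions inside the token text. A hypothetical helper (not part of Lucene) makes the calculation explicit:

// For "(foo,bar)" starting at offset 7:
//   "foo" spans chars [1,4) of the token -> offsets [8,11)
//   "bar" spans chars [5,8) of the token -> offsets [12,15)
//   "foobar" (catenated) spans first through last subword -> offsets [8,15)
static int[] subwordOffsets(int tokenStart, int begin, int end) {
  return new int[] { tokenStart + begin, tokenStart + end };
}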