use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache.
the class TestWordDelimiterGraphFilter method testPositionIncrements.
public void testPositionIncrements() throws Exception {
final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
/* analyzer that uses whitespace + wdf */
Analyzer a = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protWords));
}
};
/* in this case, works as expected. */
assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 2 }, null, false);
/* only in this case, posInc of 2 ?! */
assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 13, 12, 13 }, null, new int[] { 1, 2, 0, 1 }, null, false);
assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 2, 1 }, null, false);
/* analyzer that will consume tokens with large position increments */
Analyzer a2 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(new LargePosIncTokenFilter(tokenizer), flags, protWords));
}
};
/* increment of "largegap" is preserved */
assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, null, new int[] { 1, 10, 1 }, null, false);
/* the "/" had a position increment of 10, where did it go?!?!! */
assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 11 }, null, false);
/* in this case, the increment of 10 from the "/" is carried over */
assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "solR", "sol", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 13, 12, 13 }, null, new int[] { 1, 11, 0, 1 }, null, false);
assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 11, 1 }, null, false);
Analyzer a3 = new Analyzer() {
@Override
public TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(filter, flags, protWords));
}
};
assertAnalyzesTo(a3, "lucene.solr", new String[] { "lucenesolr", "lucene", "solr" }, new int[] { 0, 0, 7 }, new int[] { 11, 6, 11 }, null, new int[] { 1, 0, 1 }, null, false);
/* the stopword should add a gap here */
assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucenesolr", "lucene", "solr" }, new int[] { 4, 4, 11 }, new int[] { 15, 10, 15 }, null, new int[] { 2, 0, 1 }, null, false);
IOUtils.close(a, a2, a3);
}
Aggregations