Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.
Class TestKeepWordFilter, method testStopAndGo:
public void testStopAndGo() throws Exception {
  Set<String> words = new HashSet<>();
  words.add("aaa");
  words.add("bbb");
  String input = "xxx yyy aaa zzz BBB ccc ddd EEE";

  // ignoreCase=true: "BBB" matches the lower-case entry "bbb"
  TokenStream stream = whitespaceMockTokenizer(input);
  stream = new KeepWordFilter(stream, new CharArraySet(words, true));
  // the int[] holds the expected position increments across the dropped tokens
  assertTokenStreamContents(stream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });

  // ignoreCase=false: only exact-case matches are kept, so "BBB" is dropped
  stream = whitespaceMockTokenizer(input);
  stream = new KeepWordFilter(stream, new CharArraySet(words, false));
  assertTokenStreamContents(stream, new String[] { "aaa" }, new int[] { 3 });
}
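The second CharArraySet constructor argument is the ignoreCase flag, which is what lets "BBB" survive the first assertion above. A minimal standalone sketch of that behavior in isolation (the class name and printed output here are illustrative, not part of the test):

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetCaseDemo {
  public static void main(String[] args) {
    // ignoreCase=true: lookups are matched case-insensitively
    CharArraySet ignoringCase = new CharArraySet(Arrays.asList("aaa", "bbb"), true);
    // ignoreCase=false: only the stored forms match exactly
    CharArraySet exactCase = new CharArraySet(Arrays.asList("aaa", "bbb"), false);

    System.out.println(ignoringCase.contains("BBB")); // true
    System.out.println(exactCase.contains("BBB"));    // false
  }
}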
Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.
Class TestKeepWordFilter, method testRandomStrings:
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Set<String> words = new HashSet<>();
  words.add("a");
  words.add("b");
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // case-insensitive keep set: only "a"/"b" tokens (any case) survive
      TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet(words, true));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
  a.close();
}
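Outside the test harness, the same anonymous Analyzer can be consumed through the standard TokenStream contract (reset, incrementToken, end, close). A hedged sketch reusing the analyzer a built above, before a.close() runs; the field name "field" is arbitrary:

// requires org.apache.lucene.analysis.tokenattributes.CharTermAttribute
try (TokenStream ts = a.tokenStream("field", "a B c")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // prints "a" then "B"; "c" is not in the keep set
  }
  ts.end();
}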
Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.
Class TestCapitalizationFilter, method testCapitalization:
public void testCapitalization() throws Exception {
  CharArraySet keep = new CharArraySet(Arrays.asList("and", "the", "it", "BIG"), false);
  assertCapitalizesTo("kiTTEN", new String[] { "Kitten" }, true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // keep words pass through, but forceFirstLetter=true still capitalizes the first token
  assertCapitalizesTo("and", new String[] { "And" }, true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  assertCapitalizesTo("AnD", new String[] { "And" }, true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // first is not forced, but "AnD" is not a keep word either (the set is case-sensitive)
  assertCapitalizesTo("AnD", new String[] { "And" }, true, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  assertCapitalizesTo("big", new String[] { "Big" }, true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // "BIG" is in the keep set, so it is preserved verbatim
  assertCapitalizesTo("BIG", new String[] { "BIG" }, true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // keyword variant: the whole input is one token; onlyFirstWord capitalizes just the lead word
  assertCapitalizesToKeyword("Hello thEre my Name is Ryan", "Hello there my name is ryan", true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // now each token
  assertCapitalizesTo("Hello thEre my Name is Ryan", new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }, false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // now only the long words: minWordLength=3 leaves "my" and "is" untouched
  assertCapitalizesTo("Hello thEre my Name is Ryan", new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }, false, keep, true, null, 3, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // without an okPrefix list, intra-word capitals are flattened
  assertCapitalizesTo("McKinley", new String[] { "Mckinley" }, true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // now try some prefixes
  List<char[]> okPrefix = new ArrayList<>();
  okPrefix.add("McK".toCharArray());
  assertCapitalizesTo("McKinley", new String[] { "McKinley" }, true, keep, true, okPrefix, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  // now try some stuff with numbers
  assertCapitalizesTo("1st 2nd third", new String[] { "1st", "2nd", "Third" }, false, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
  assertCapitalizesToKeyword("the The the", "The The the", false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
}
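The helper's positional arguments mirror CapitalizationFilter's full constructor (onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength). A hedged sketch of wiring the filter directly onto a real tokenizer; the method name and input text are illustrative:

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;

static TokenStream capitalizingStream(String text) {
  Tokenizer tok = new WhitespaceTokenizer();
  tok.setReader(new StringReader(text));
  CharArraySet keep = new CharArraySet(Arrays.asList("and", "the", "it", "BIG"), false);
  // onlyFirstWord=false: capitalize every token, not just the first;
  // forceFirstLetter=true: capitalize the leading token even if it is a keep word;
  // okPrefix=null, minWordLength=0: no protected prefixes, no length floor
  return new CapitalizationFilter(tok, false, keep, true, null, 0,
      CapitalizationFilter.DEFAULT_MAX_WORD_COUNT,
      CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH);
}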
Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.
Class TestTurkishAnalyzer, method testExclude:
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("ağacı"), false);
  Analyzer a = new TurkishAnalyzer(TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "ağacı", "ağacı"); // the excluded word is left unstemmed
  checkOneTerm(a, "ağaç", "ağaç");
  a.close();
}
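For contrast, a default TurkishAnalyzer with no exclusion set does stem this word: the Snowball Turkish stemmer reduces "ağacı" to "ağaç", which is exactly what the exclusion set above prevents. A minimal sketch, assuming the same checkOneTerm test helper is in scope:

// Without an exclusion set, stemming applies normally:
Analyzer plain = new TurkishAnalyzer();
checkOneTerm(plain, "ağacı", "ağaç"); // stemmed to the base form
plain.close();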
Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.
Class TestGermanStemFilter, method testKeyword:
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // mark exclusion-set tokens as keywords so the stemmer leaves them alone
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new GermanStemFilter(sink));
    }
  };
  checkOneTerm(a, "sängerinnen", "sängerinnen"); // unstemmed thanks to the keyword marker
  a.close();
}
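The protection works through KeywordAttribute: SetKeywordMarkerFilter flags tokens found in its set, and KeywordAttribute-aware stemmers skip flagged tokens. A simplified sketch of the pattern such a stemming filter follows in incrementToken (the attribute field names are the conventional ones, and stemTermInPlace is a placeholder, not quoted from GermanStemFilter):

@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  if (!keywordAttr.isKeyword()) {
    // only unmarked tokens reach the stemmer; tokens flagged by
    // SetKeywordMarkerFilter pass through unchanged
    stemTermInPlace(); // placeholder for the filter's actual stemming step
  }
  return true;
}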