Example usage of org.apache.lucene.analysis.CharArraySet from the Apache lucene-solr project.
From the class TestCompoundWordTokenFilter, method testRandomStrings:
/** Blast some random strings through both compound-word token filters. */
public void testRandomStrings() throws Exception {
  // Dictionary-based decompounding over a tiny fixed dictionary.
  final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
  Analyzer dictAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new DictionaryCompoundWordTokenFilter(source, dict);
      return new TokenStreamComponents(source, sink);
    }
  };
  checkRandomData(random(), dictAnalyzer, 1000 * RANDOM_MULTIPLIER);
  dictAnalyzer.close();

  // Hyphenation-based decompounding driven by the Danish hyphenation rules.
  InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
  Analyzer hyphAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new HyphenationCompoundWordTokenFilter(source, hyphenator);
      return new TokenStreamComponents(source, sink);
    }
  };
  checkRandomData(random(), hyphAnalyzer, 1000 * RANDOM_MULTIPLIER);
  hyphAnalyzer.close();
}
Example usage of org.apache.lucene.analysis.CharArraySet from the Apache lucene-solr project.
From the class TestSoraniAnalyzer, method testWithStemExclusionSet:
/** Verify that a term placed in the stem-exclusion set passes through unstemmed. */
public void testWithStemExclusionSet() throws IOException {
  CharArraySet exclusions = new CharArraySet(1, true);
  exclusions.add("پیاوە");
  Analyzer analyzer = new SoraniAnalyzer(CharArraySet.EMPTY_SET, exclusions);
  // The excluded word must come out exactly as it went in.
  assertAnalyzesTo(analyzer, "پیاوە", new String[] { "پیاوە" });
  analyzer.close();
}
Example usage of org.apache.lucene.analysis.CharArraySet from the Apache lucene-solr project.
From the class TestNorwegianLightStemFilter, method testKeyword:
/** A word marked as a keyword via SetKeywordMarkerFilter must not be stemmed. */
public void testKeyword() throws IOException {
  final CharArraySet keywords = new CharArraySet(asSet("sekretæren"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Mark the protected word before the stemmer sees it.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, keywords);
      TokenStream stemmed = new NorwegianLightStemFilter(marked);
      return new TokenStreamComponents(tokenizer, stemmed);
    }
  };
  checkOneTerm(analyzer, "sekretæren", "sekretæren");
  analyzer.close();
}
Example usage of org.apache.lucene.analysis.CharArraySet from the Apache jena project.
From the class StandardAnalyzerAssembler, method analyzerWithStopWords:
/**
 * Builds a StandardAnalyzer configured with the stop-word list attached to the
 * given assembler resource via the text:stopWords property.
 *
 * @param root the assembler resource carrying the text:stopWords property
 * @return a StandardAnalyzer using the configured stop words
 * @throws TextIndexException if the property value is not an RDF list resource
 */
private Analyzer analyzerWithStopWords(Resource root) {
  RDFNode node = root.getProperty(TextVocab.pStopWords).getObject();
  if (!node.isResource()) {
    throw new TextIndexException("text:stopWords property takes a list as a value : " + node);
  }
  CharArraySet stopWords = toCharArraySet((Resource) node);
  return new StandardAnalyzer(stopWords);
}
Example usage of org.apache.lucene.analysis.CharArraySet from the Apache lucene-solr project.
From the class TestGermanAnalyzer, method testWithKeywordAttribute:
/** A keyword-marked term survives GermanStemFilter while its neighbor is stemmed. */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("fischen");
  final LowerCaseTokenizer tokenizer = new LowerCaseTokenizer();
  tokenizer.setReader(new StringReader("Fischen Trinken"));
  // Mark "fischen" as a keyword so the stemmer leaves it alone; "trinken" is stemmed.
  TokenStream marked = new SetKeywordMarkerFilter(tokenizer, keywords);
  GermanStemFilter stemmer = new GermanStemFilter(marked);
  assertTokenStreamContents(stemmer, new String[] { "fischen", "trink" });
}
Aggregations