Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
The class TestSynonymMapFilter, method testRecursion2.
public void testRecursion2() throws Exception {
  b = new SynonymMap.Builder(true);
  final boolean keepOrig = false;
  // The second rule is recursive: "zoo" rewrites to "zoo zoo", so the output
  // contains the input term again.
  add("zoo", "zoo", keepOrig);
  add("zoo", "zoo zoo", keepOrig);
  final SynonymMap map = b.build();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
    }
  };
  // verify("zoo zoo $ zoo", "zoo/zoo zoo/zoo/zoo $/zoo zoo/zoo zoo");
  assertAnalyzesTo(a, "zoo zoo $ zoo",
      new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo", "zoo" },
      new int[] { 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 });
  a.close();
}
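The test relies on a private add() helper of its own. Building the same kind of map directly with the public API looks roughly like the sketch below, assuming a single-word rule and a WhitespaceTokenizer; multi-word outputs such as "zoo zoo" would additionally need SynonymMap.Builder#join to encode the word separator. The rule and sample text here are illustrative, not from the test.

SynonymMap.Builder builder = new SynonymMap.Builder(true); // true = drop duplicate rules
// Map "zoo" to "menagerie", discarding the original token (includeOrig = false).
builder.add(new CharsRef("zoo"), new CharsRef("menagerie"), false);
final SynonymMap map = builder.build();
Analyzer analyzer = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    return new TokenStreamComponents(tokenizer, new SynonymFilter(tokenizer, map, true));
  }
};
try (TokenStream ts = analyzer.tokenStream("body", "the zoo is open")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // "zoo" comes out as "menagerie"
  }
  ts.end();
}
analyzer.close();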
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
The class TestDocument, method testInvalidFields.
// LUCENE-3616
public void testInvalidFields() {
  expectThrows(IllegalArgumentException.class, () -> {
    Tokenizer tok = new MockTokenizer();
    tok.setReader(new StringReader(""));
    // A field fed by a TokenStream cannot use a stored field type.
    new Field("foo", tok, StringField.TYPE_STORED);
  });
}
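The constructor rejects this combination because a field whose value is a TokenStream cannot also be stored, and StringField.TYPE_STORED sets stored=true. For contrast, a minimal sketch of a construction that is accepted, using the indexed, tokenized, non-stored TextField.TYPE_NOT_STORED (the field name and text are illustrative):

Tokenizer tok = new MockTokenizer();
tok.setReader(new StringReader("some pre-analyzed text"));
// Indexed, tokenized, and not stored: compatible with a TokenStream value.
Field ok = new Field("foo", tok, TextField.TYPE_NOT_STORED);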
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
The class TestIndexWriter, method testStopwordsPosIncHole.
// LUCENE-3849
public void testStopwordsPosIncHole() throws Exception {
  Directory dir = newDirectory();
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer();
      TokenStream stream = new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir, a);
  Document doc = new Document();
  // The stopword "a" is removed but still occupies a position, leaving a hole
  // between "just" and the second field value's "test".
  doc.add(new TextField("body", "just a", Field.Store.NO));
  doc.add(new TextField("body", "test of gaps", Field.Store.NO));
  iw.addDocument(doc);
  IndexReader ir = iw.getReader();
  iw.close();
  IndexSearcher is = newSearcher(ir);
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.add(new Term("body", "just"), 0);
  builder.add(new Term("body", "test"), 2);
  PhraseQuery pq = builder.build();
  // body:"just ? test"
  assertEquals(1, is.search(pq, 5).totalHits);
  ir.close();
  dir.close();
}
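The phrase matches because the analyzer's default getPositionIncrementGap() of 0 lets the second field value start immediately after the stopword hole, so "test" lands at position 2. As a contrasting sketch (hypothetical, not part of the test), overriding the gap would push the two values apart and the same phrase query would no longer match:

Analyzer gapped = new Analyzer() {
  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = new MockTokenizer();
    return new TokenStreamComponents(tokenizer,
        new MockTokenFilter(tokenizer, MockTokenFilter.ENGLISH_STOPSET));
  }
  @Override
  public int getPositionIncrementGap(String fieldName) {
    // Insert a large positional gap between the values of a multi-valued field.
    return 100;
  }
};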
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
The class CJKAnalyzer, method createComponents.
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  // Run the width filter before bigramming: it sometimes combines characters.
  TokenStream result = new CJKWidthFilter(source);
  result = new LowerCaseFilter(result);
  result = new CJKBigramFilter(result);
  return new TokenStreamComponents(source, new StopFilter(result, stopwords));
}
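A minimal usage sketch for the resulting analyzer; the field name and sample text are illustrative assumptions:

try (Analyzer analyzer = new CJKAnalyzer();
     TokenStream ts = analyzer.tokenStream("body", "日本語のテキスト")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // prints overlapping bigrams such as 日本, 本語, ...
  }
  ts.end();
}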
Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.
The class ArabicAnalyzer, method createComponents.
/**
 * Creates
 * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 * used to tokenize all the text in the provided {@link Reader}.
 *
 * @return {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents}
 *         built from a {@link StandardTokenizer} filtered with
 *         {@link LowerCaseFilter}, {@link DecimalDigitFilter}, {@link StopFilter},
 *         {@link ArabicNormalizationFilter}, {@link SetKeywordMarkerFilter}
 *         (if a stem exclusion set is provided), and {@link ArabicStemFilter}.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new LowerCaseFilter(source);
  result = new DecimalDigitFilter(result);
  // The order here is important: the stopword list is not normalized!
  result = new StopFilter(result, stopwords);
  // TODO: maybe we should make ArabicNormalizationFilter also KeywordAttribute-aware?!
  result = new ArabicNormalizationFilter(result);
  if (!stemExclusionSet.isEmpty()) {
    result = new SetKeywordMarkerFilter(result, stemExclusionSet);
  }
  return new TokenStreamComponents(source, new ArabicStemFilter(result));
}
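A minimal usage sketch: terms in the stem exclusion set are marked as keywords by SetKeywordMarkerFilter, so ArabicStemFilter leaves them untouched. The sample word here is an illustrative assumption:

CharArraySet stemExclusion = new CharArraySet(Arrays.asList("كتاب"), false);
try (Analyzer analyzer = new ArabicAnalyzer(ArabicAnalyzer.getDefaultStopSet(), stemExclusion);
     TokenStream ts = analyzer.tokenStream("body", "كتاب")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term); // the excluded term passes through unstemmed
  }
  ts.end();
}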