Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class TestCJKAnalyzer, method testSingleChar2.

public void testSingleChar2() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new FakeStandardTokenizer(tokenizer);
      filter = new StopFilter(filter, CharArraySet.EMPTY_SET);
      filter = new CJKBigramFilter(filter);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  // expected terms, start offsets, end offsets, token types, position increments
  assertAnalyzesTo(analyzer, "一",
      new String[] { "一" },
      new int[] { 0 },
      new int[] { 1 },
      new String[] { "<SINGLE>" },
      new int[] { 1 });
  analyzer.close();
}
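FakeStandardTokenizer is a helper defined elsewhere in TestCJKAnalyzer, and its body is not shown on this page. As a rough illustration of the idea, the sketch below is a hypothetical TokenFilter that forwards tokens unchanged but stamps them with StandardTokenizer's ideographic type, which is the type CJKBigramFilter keys on when deciding whether a token is CJK; the class name and exact behavior are assumptions, not the project's code.

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

// Hypothetical sketch, not the actual FakeStandardTokenizer: pass each token
// through untouched and rewrite its type attribute to StandardTokenizer's
// IDEOGRAPHIC type so that CJKBigramFilter treats it as CJK input.
final class IdeographicTypeFilter extends TokenFilter {
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  IdeographicTypeFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
    return true;
  }
}

With only a single Han character in the input there is no neighbor to pair it with, so CJKBigramFilter emits it unchanged with the <SINGLE> type, which is exactly what the assertion above checks.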
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class ShingleAnalyzerWrapperTest, method testAltFillerToken.

public void testAltFillerToken() throws Exception {
  Analyzer delegate = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet stopSet = StopFilter.makeStopSet("into");
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new StopFilter(tokenizer, stopSet);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  // outputUnigrams=true, outputUnigramsIfNoShingles=false, fillerToken="--"
  ShingleAnalyzerWrapper analyzer = new ShingleAnalyzerWrapper(delegate,
      ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, false, "--");
  assertAnalyzesTo(analyzer, "please divide into shingles",
      new String[] { "please", "please divide", "divide", "divide --", "-- shingles", "shingles" },
      new int[] { 0, 0, 7, 7, 19, 19 },
      new int[] { 6, 13, 13, 19, 27, 27 },
      new int[] { 1, 0, 1, 0, 1, 1 });
  analyzer.close();

  delegate = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet stopSet = StopFilter.makeStopSet("into");
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new StopFilter(tokenizer, stopSet);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  // outputUnigrams=false; null fillerToken (compare with the empty-string filler below)
  analyzer = new ShingleAnalyzerWrapper(delegate,
      ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, null);
  assertAnalyzesTo(analyzer, "please divide into shingles",
      new String[] { "please divide", "divide ", " shingles" },
      new int[] { 0, 7, 19 },
      new int[] { 13, 19, 27 },
      new int[] { 1, 1, 1 });
  analyzer.close();

  delegate = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet stopSet = StopFilter.makeStopSet("into");
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter filter = new StopFilter(tokenizer, stopSet);
      return new TokenStreamComponents(tokenizer, filter);
    }
  };
  // outputUnigrams=false; empty-string fillerToken
  analyzer = new ShingleAnalyzerWrapper(delegate,
      ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE,
      ShingleFilter.DEFAULT_TOKEN_SEPARATOR, false, false, "");
  assertAnalyzesTo(analyzer, "please divide into shingles",
      new String[] { "please divide", "divide ", " shingles" },
      new int[] { 0, 7, 19 },
      new int[] { 13, 19, 27 },
      new int[] { 1, 1, 1 });
  analyzer.close();
}
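Outside of assertAnalyzesTo, the same filler behavior can be observed by consuming the wrapped analyzer's TokenStream directly. The sketch below is illustrative only (the field name "f" and the printed attributes are placeholders, not part of the test); it follows the standard reset/incrementToken/end/close contract for Lucene token streams.

// Illustrative sketch: print each shingle and its position increment from a
// ShingleAnalyzerWrapper configured like the first case above (filler token "--").
try (TokenStream ts = analyzer.tokenStream("f", "please divide into shingles")) {
  CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(termAtt + " (posInc=" + posIncAtt.getPositionIncrement() + ")");
  }
  ts.end();
}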
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class TestIndicNormalizer, method check.

private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenFilter tf = new IndicNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
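The same chain can also be packaged in a throwaway Analyzer and driven through assertAnalyzesTo, mirroring the pattern used by the other snippets on this page. This is a hypothetical rewrite of check, not code from TestIndicNormalizer.

// Hypothetical alternative to check(...): wrap MockTokenizer + IndicNormalizationFilter
// in an Analyzer so the assertion can go through assertAnalyzesTo instead.
private void check(String input, String output) throws IOException {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new IndicNormalizationFilter(tokenizer));
    }
  };
  assertAnalyzesTo(a, input, new String[] { output });
  a.close();
}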
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class TestIndexWriterExceptions, method testExceptionFromTokenStream.

// LUCENE-1072
public void testExceptionFromTokenStream() throws IOException {
  Directory dir = newDirectory();
  IndexWriterConfig conf = newIndexWriterConfig(new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
      // disable workflow checking as we forcefully close() in exceptional cases.
      tokenizer.setEnableChecks(false);
      return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {
        private int count = 0;

        @Override
        public boolean incrementToken() throws IOException {
          if (count++ == 5) {
            throw new IOException();
          }
          return input.incrementToken();
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          this.count = 0;
        }
      });
    }
  });
  conf.setMaxBufferedDocs(Math.max(3, conf.getMaxBufferedDocs()));
  IndexWriter writer = new IndexWriter(dir, conf);

  Document brokenDoc = new Document();
  String contents = "aa bb cc dd ee ff gg hh ii jj kk";
  brokenDoc.add(newTextField("content", contents, Field.Store.NO));
  expectThrows(Exception.class, () -> {
    writer.addDocument(brokenDoc);
  });

  // Make sure we can add another normal document
  Document doc = new Document();
  doc.add(newTextField("content", "aa bb cc dd", Field.Store.NO));
  writer.addDocument(doc);

  // Make sure we can add another normal document
  doc = new Document();
  doc.add(newTextField("content", "aa bb cc dd", Field.Store.NO));
  writer.addDocument(doc);
  writer.close();

  IndexReader reader = DirectoryReader.open(dir);
  final Term t = new Term("content", "aa");
  assertEquals(3, reader.docFreq(t));

  // Make sure the doc that hit the exception was marked
  // as deleted:
  PostingsEnum tdocs = TestUtil.docs(random(), reader, t.field(), new BytesRef(t.text()), null, 0);
  final Bits liveDocs = MultiFields.getLiveDocs(reader);
  int count = 0;
  while (tdocs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    if (liveDocs == null || liveDocs.get(tdocs.docID())) {
      count++;
    }
  }
  assertEquals(2, count);

  assertEquals(reader.docFreq(new Term("content", "gg")), 0);
  reader.close();
  dir.close();
}
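As an aside that is not part of the original test, the same deletion could be confirmed through a search before the reader.close() call above: docFreq still reports the broken document's terms until segments are merged, but a query should not match it. The sketch assumes the same reader opened in the test and uses IndexSearcher.count.

// Illustrative follow-up, placed before reader.close() above: the two surviving
// documents match "aa", and nothing matches "gg" because the document whose
// token stream threw was marked deleted.
IndexSearcher searcher = new IndexSearcher(reader);
assertEquals(2, searcher.count(new TermQuery(new Term("content", "aa"))));
assertEquals(0, searcher.count(new TermQuery(new Term("content", "gg"))));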
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class TestHindiNormalizer, method check.

private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter tf = new HindiNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
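whitespaceMockTokenizer is a convenience helper from Lucene's test framework (BaseTokenStreamTestCase). The sketch below shows roughly what it amounts to, mirroring the manual MockTokenizer setup in the TestIndicNormalizer snippet earlier on this page; the exact body is an assumption, not the framework's code.

// Rough equivalent of the whitespaceMockTokenizer(...) helper used above:
// a whitespace MockTokenizer with lowercasing disabled, wired to the input string.
static Tokenizer whitespaceMockTokenizer(String input) throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  return tokenizer;
}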