Use of org.apache.lucene.analysis.TokenFilter in project elasticsearch by elastic.
The class AbstractTermVectorsTestCase, method indexDocsWithLucene:
protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
    Map<String, Analyzer> mapping = new HashMap<>();
    for (TestFieldSetting field : testDocs[0].fieldSettings) {
        if (field.storedPayloads) {
            mapping.put(field.name, new Analyzer() {
                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new StandardTokenizer();
                    TokenFilter filter = new LowerCaseFilter(tokenizer);
                    filter = new TypeAsPayloadTokenFilter(filter);
                    return new TokenStreamComponents(tokenizer, filter);
                }
            });
        }
    }
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), mapping);
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, conf);
    for (TestDoc doc : testDocs) {
        Document d = new Document();
        d.add(new Field("id", doc.id, StringField.TYPE_STORED));
        for (int i = 0; i < doc.fieldContent.length; i++) {
            FieldType type = new FieldType(TextField.TYPE_STORED);
            TestFieldSetting fieldSetting = doc.fieldSettings[i];
            type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
            type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
            type.setStoreTermVectorPositions(fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
            type.setStoreTermVectors(true);
            type.freeze();
            d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
        }
        writer.updateDocument(new Term("id", doc.id), d);
        writer.commit();
    }
    writer.close();
    return DirectoryReader.open(dir);
}
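For reference, here is a minimal sketch (not part of the Elasticsearch test) of how the term vectors and payloads written by indexDocsWithLucene could be read back from the returned DirectoryReader. The helper name dumpTermVector and the printing are assumptions chosen for illustration; the Lucene calls themselves (IndexReader.getTermVectors, TermsEnum.postings with PostingsEnum.ALL) are the standard term-vector API.

import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Prints every term of one document's term vector with its positions, offsets and payloads
// (the payload is the token type attached by TypeAsPayloadTokenFilter above).
static void dumpTermVector(DirectoryReader reader, int docId, String field) throws IOException {
    Fields vectors = reader.getTermVectors(docId);
    Terms terms = vectors.terms(field);
    if (terms == null) {
        return; // no term vector stored for this field
    }
    TermsEnum termsEnum = terms.iterator();
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
        postings.nextDoc(); // a term vector behaves like a single-document index
        for (int i = 0; i < postings.freq(); i++) {
            int position = postings.nextPosition();
            BytesRef payload = postings.getPayload();
            System.out.println(term.utf8ToString()
                + " pos=" + position
                + " offsets=[" + postings.startOffset() + "," + postings.endOffset() + "]"
                + " payload=" + (payload == null ? "none" : payload.utf8ToString()));
        }
    }
}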
Use of org.apache.lucene.analysis.TokenFilter in project zm-mailbox by Zimbra.
The class ContactTokenFilterTest, method contactDataFilter:
@Test
public void contactDataFilter() throws Exception {
    AddrCharTokenizer tokenizer = new AddrCharTokenizer(new StringReader("all-snv"));
    TokenFilter filter = new ContactTokenFilter(tokenizer);
    Assert.assertEquals(Collections.singletonList("all-snv"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("."));
    Assert.assertEquals(Collections.EMPTY_LIST, ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader(".. ."));
    Assert.assertEquals(Collections.singletonList(".."), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader(".abc"));
    Assert.assertEquals(Collections.singletonList(".abc"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("a"));
    Assert.assertEquals(Collections.singletonList("a"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("test.com"));
    Assert.assertEquals(Collections.singletonList("test.com"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("user1@zim"));
    Assert.assertEquals(Collections.singletonList("user1@zim"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("user1@zimbra.com"));
    Assert.assertEquals(Collections.singletonList("user1@zimbra.com"), ZimbraAnalyzerTest.toTokens(filter));
}
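The ZimbraAnalyzerTest.toTokens helper used above is not shown on this page. A plausible equivalent simply drains the TokenStream and collects the term text of each token; the sketch below is an assumption about its shape, not the actual Zimbra code.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Drains a TokenStream and returns the text of every token it emits. The stream is not
// closed here, which is what would let the test above reuse the filter after tokenizer.reset(...).
public static List<String> toTokens(TokenStream stream) throws IOException {
    List<String> tokens = new ArrayList<>();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        tokens.add(termAttr.toString());
    }
    stream.end();
    return tokens;
}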
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class NGramTokenFilterTest, method testInvalidOffsets:
// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
        new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
        new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
        new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
    analyzer.close();
}
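The same point can be checked by hand without BaseTokenStreamTestCase: feed text through the analyzer and read the OffsetAttribute directly. The sketch below is illustrative only (the field name "field" is arbitrary); with the ASCIIFoldingFilter + NGramTokenFilter chain above, every bigram of "mosfellsbær" reports the offsets of the whole original token, 0 to 11.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

// Prints each emitted n-gram together with the offsets it reports into the original text.
static void printOffsets(Analyzer analyzer, String text) throws IOException {
    try (TokenStream stream = analyzer.tokenStream("field", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsets = stream.addAttribute(OffsetAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString() + " [" + offsets.startOffset() + "," + offsets.endOffset() + "]");
        }
        stream.end();
    }
}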
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class TestCompoundWordTokenFilter, method testRandomStrings:
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
    final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
        }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
    final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    Analyzer b = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
    b.close();
}
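Because checkRandomData only pushes random strings through the filters, a small deterministic example may make the decompounding behavior easier to see. The sketch below is not part of the test: the dictionary entries and the input word "fussballpumpe" are assumptions chosen for illustration, and the filter emits the original token followed by every dictionary sub-word it finds inside it.

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
// In older Lucene versions CharArraySet lives in org.apache.lucene.analysis.util instead.
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DictionaryDecompoundDemo {
    public static void main(String[] args) throws IOException {
        // Hypothetical dictionary of sub-words used to split compounds.
        CharArraySet dict = new CharArraySet(Arrays.asList("fuss", "ball", "pumpe"), true);
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("fussballpumpe"));
        try (TokenStream stream = new DictionaryCompoundWordTokenFilter(tokenizer, dict)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                // Expected output: fussballpumpe, fuss, ball, pumpe
                System.out.println(term.toString());
            }
            stream.end();
        }
    }
}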
Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.
The class TestICUTokenizer, method setUp:
@Override
public void setUp() throws Exception {
    super.setUp();
    a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
            TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
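For illustration, a hypothetical test (not part of TestICUTokenizer) that could exercise the analyzer field a built above. ICUNormalizer2Filter applies NFKC case folding by default, so the expected tokens are an assumption based on that behavior; assertAnalyzesTo comes from BaseTokenStreamTestCase, which TestICUTokenizer extends.

// Hypothetical test method: the ICUTokenizer splits on UAX#29 word boundaries, then the
// ICUNormalizer2Filter folds full-width "ＡＢＣ" to "abc" and case-folds "Résumé" to "résumé".
public void testNfkcCaseFolding() throws Exception {
    assertAnalyzesTo(a, "Résumé ＡＢＣ", new String[] { "résumé", "abc" });
}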