Example 1 with TokenFilter

Use of org.apache.lucene.analysis.TokenFilter in project elasticsearch by elastic.

From the class AbstractTermVectorsTestCase, method indexDocsWithLucene:

protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
    Map<String, Analyzer> mapping = new HashMap<>();
    for (TestFieldSetting field : testDocs[0].fieldSettings) {
        if (field.storedPayloads) {
            mapping.put(field.name, new Analyzer() {

                @Override
                protected TokenStreamComponents createComponents(String fieldName) {
                    Tokenizer tokenizer = new StandardTokenizer();
                    TokenFilter filter = new LowerCaseFilter(tokenizer);
                    filter = new TypeAsPayloadTokenFilter(filter);
                    return new TokenStreamComponents(tokenizer, filter);
                }
            });
        }
    }
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), mapping);
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, conf);
    for (TestDoc doc : testDocs) {
        Document d = new Document();
        d.add(new Field("id", doc.id, StringField.TYPE_STORED));
        for (int i = 0; i < doc.fieldContent.length; i++) {
            FieldType type = new FieldType(TextField.TYPE_STORED);
            TestFieldSetting fieldSetting = doc.fieldSettings[i];
            type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
            type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
            type.setStoreTermVectorPositions(fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
            type.setStoreTermVectors(true);
            type.freeze();
            d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
        }
        writer.updateDocument(new Term("id", doc.id), d);
        writer.commit();
    }
    writer.close();
    return DirectoryReader.open(dir);
}
Also used: HashMap(java.util.HashMap) TypeAsPayloadTokenFilter(org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter) Term(org.apache.lucene.index.Term) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) FieldType(org.apache.lucene.document.FieldType) StringField(org.apache.lucene.document.StringField) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) TokenFilter(org.apache.lucene.analysis.TokenFilter) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)
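
To make the payload wiring concrete, here is a hedged usage sketch (not part of the Elasticsearch test): it pushes text for one of the payload-enabled fields registered in mapping through the wrapper analyzer and prints each token's payload. The method name printPayloads and the sample arguments are illustrative only.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;

static void printPayloads(Analyzer wrapper, String payloadField, String text) throws IOException {
    try (TokenStream ts = wrapper.tokenStream(payloadField, text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PayloadAttribute payload = ts.addAttribute(PayloadAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // TypeAsPayloadTokenFilter stores each token's type (e.g. "<ALPHANUM>") as its payload,
            // which is why the test enables term vector payloads for these fields.
            System.out.println(term + " -> " + payload.getPayload());
        }
        ts.end();
    }
}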

Example 2 with TokenFilter

Use of org.apache.lucene.analysis.TokenFilter in project zm-mailbox by Zimbra.

From the class ContactTokenFilterTest, method contactDataFilter:

@Test
public void contactDataFilter() throws Exception {
    AddrCharTokenizer tokenizer = new AddrCharTokenizer(new StringReader("all-snv"));
    TokenFilter filter = new ContactTokenFilter(tokenizer);
    Assert.assertEquals(Collections.singletonList("all-snv"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("."));
    Assert.assertEquals(Collections.EMPTY_LIST, ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader(".. ."));
    Assert.assertEquals(Collections.singletonList(".."), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader(".abc"));
    Assert.assertEquals(Collections.singletonList(".abc"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("a"));
    Assert.assertEquals(Collections.singletonList("a"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("test.com"));
    Assert.assertEquals(Collections.singletonList("test.com"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("user1@zim"));
    Assert.assertEquals(Collections.singletonList("user1@zim"), ZimbraAnalyzerTest.toTokens(filter));
    tokenizer.reset(new StringReader("user1@zimbra.com"));
    Assert.assertEquals(Collections.singletonList("user1@zimbra.com"), ZimbraAnalyzerTest.toTokens(filter));
}
Also used: StringReader(java.io.StringReader) TokenFilter(org.apache.lucene.analysis.TokenFilter) Test(org.junit.Test) ZimbraAnalyzerTest(com.zimbra.cs.index.ZimbraAnalyzerTest)

Example 3 with TokenFilter

Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.

From the class NGramTokenFilterTest, method testInvalidOffsets:

// LUCENE-3642
// EdgeNgram blindly adds term length to offset, but this can take things out of bounds
// wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
// so in this case we behave like WDF, and preserve any modified offsets
public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
            filters = new NGramTokenFilter(filters, 2, 2);
            return new TokenStreamComponents(tokenizer, filters);
        }
    };
    assertAnalyzesTo(analyzer, "mosfellsbær",
            new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
            new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
            new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
            new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
    analyzer.close();
}
Also used: MockTokenizer(org.apache.lucene.analysis.MockTokenizer) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Analyzer(org.apache.lucene.analysis.Analyzer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) TokenFilter(org.apache.lucene.analysis.TokenFilter)
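
The behaviour described in the comment is easy to see directly. Below is a hedged sketch (not from the Lucene test) that rebuilds the same ASCIIFoldingFilter + NGramTokenFilter chain with a plain WhitespaceTokenizer (MockTokenizer is a test-only class) and prints each bigram with its offsets; as the assertions above show, every bigram of the folded word keeps the offsets of the original "mosfellsbær", 0 and 11.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

static void printBigramOffsets() throws IOException {
    Analyzer bigrams = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer,
                    new NGramTokenFilter(new ASCIIFoldingFilter(tokenizer), 2, 2));
        }
    };
    try (TokenStream ts = bigrams.tokenStream("field", "mosfellsbær")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Prints e.g. "mo [0,11]", "os [0,11]", ... "er [0,11]".
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        ts.end();
    }
    bigrams.close();
}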

Example 4 with TokenFilter

Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.

From the class TestCompoundWordTokenFilter, method testRandomStrings:

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
    final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
        }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
    final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    Analyzer b = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
    b.close();
}
Also used: MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) InputSource(org.xml.sax.InputSource) HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) TokenFilter(org.apache.lucene.analysis.TokenFilter)
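
checkRandomData only verifies stream consistency, so the test does not show what the decompounder actually emits. Here is a hedged sketch (not part of the Lucene test) that runs a single compound built from the same toy dictionary through DictionaryCompoundWordTokenFilter; with default settings the filter is expected to keep the original token and add the dictionary subwords it finds (for "bcdef" roughly "bcdef", "bc", "def"), though the exact output depends on the minimum word and subword sizes of the Lucene version in use.

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

static void printDecompounded() throws IOException {
    CharArraySet dict = new CharArraySet(Arrays.asList("a", "e", "i", "o", "u", "y", "bc", "def"), false);
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // WhitespaceTokenizer stands in for the test-only MockTokenizer.
            Tokenizer tokenizer = new WhitespaceTokenizer();
            return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
        }
    };
    try (TokenStream ts = analyzer.tokenStream("field", "bcdef")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term);
        }
        ts.end();
    }
    analyzer.close();
}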

Example 5 with TokenFilter

Use of org.apache.lucene.analysis.TokenFilter in project lucene-solr by apache.

From the class TestICUTokenizer, method setUp:

@Override
public void setUp() throws Exception {
    super.setUp();
    a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new ICUTokenizer(newAttributeFactory(), new DefaultICUTokenizerConfig(false, true));
            TokenFilter filter = new ICUNormalizer2Filter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
Also used: ICUNormalizer2Filter(org.apache.lucene.analysis.icu.ICUNormalizer2Filter) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) TokenFilter(org.apache.lucene.analysis.TokenFilter)
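
All of the examples above share the same wiring: a Tokenizer is wrapped by one or more TokenFilters, and the pair is returned as TokenStreamComponents. For completeness, here is a minimal sketch of a custom TokenFilter itself (the class below is hypothetical, not taken from any of the projects above); the contract is to delegate to the wrapped stream in incrementToken() and rewrite attributes in place.

import java.io.IOException;
import java.util.Locale;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Illustrative filter that upper-cases every term; real filters such as LowerCaseFilter,
// ASCIIFoldingFilter or NGramTokenFilter follow the same structure.
final class UpperCaseExampleFilter extends TokenFilter {

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    UpperCaseExampleFilter(TokenStream input) {
        super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            // The wrapped stream is exhausted.
            return false;
        }
        // Rewrite the term attribute in place instead of creating new tokens.
        String upper = termAtt.toString().toUpperCase(Locale.ROOT);
        termAtt.setEmpty().append(upper);
        return true;
    }
}

Such a filter would be wired exactly like the ones in the examples: new TokenStreamComponents(tokenizer, new UpperCaseExampleFilter(tokenizer)).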

Aggregations

TokenFilter (org.apache.lucene.analysis.TokenFilter): 23 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 19 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 17 uses
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 12 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 9 uses
StringReader (java.io.StringReader): 8 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 6 uses
Document (org.apache.lucene.document.Document): 6 uses
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 5 uses
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5 uses
HashMap (java.util.HashMap): 4 uses
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 4 uses
Field (org.apache.lucene.document.Field): 4 uses
TextField (org.apache.lucene.document.TextField): 4 uses
IndexWriter (org.apache.lucene.index.IndexWriter): 4 uses
Directory (org.apache.lucene.store.Directory): 4 uses
RAMDirectory (org.apache.lucene.store.RAMDirectory): 4 uses
BytesRef (org.apache.lucene.util.BytesRef): 4 uses
IOException (java.io.IOException): 3 uses
MockTokenFilter (org.apache.lucene.analysis.MockTokenFilter): 3 uses