Search in sources:

Example 1 with TokenizerChain

use of org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain in project jackrabbit-oak by apache.

In the class NodeStateAnalyzerFactory, the method composeAnalyzer:

/**
 * Assembles a Lucene {@code Analyzer} from the analyzer configuration node:
 * the tokenizer child node plus the optional char-filter and token-filter
 * child nodes are loaded into factories and wired into a {@code TokenizerChain}.
 */
private Analyzer composeAnalyzer(NodeState state) {
    // Load each component factory from its dedicated child node of the config.
    TokenizerFactory tokenizerFactory = loadTokenizer(state.getChildNode(LuceneIndexConstants.ANL_TOKENIZER));
    CharFilterFactory[] charFilterFactories = loadCharFilterFactories(state.getChildNode(LuceneIndexConstants.ANL_CHAR_FILTERS));
    TokenFilterFactory[] tokenFilterFactories = loadTokenFilterFactories(state.getChildNode(LuceneIndexConstants.ANL_FILTERS));
    // Char filters run first, then the tokenizer, then the token filters.
    return new TokenizerChain(charFilterFactories, tokenizerFactory, tokenFilterFactories);
}
Also used : TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) TokenizerChain(org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain) CharFilterFactory(org.apache.lucene.analysis.util.CharFilterFactory) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory)

Example 2 with TokenizerChain

use of org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain in project jackrabbit-oak by apache.

In the class IndexDefinition, the method createAnalyzer:

// ~---------------------------------------------------< Analyzer >
/**
 * Creates the analyzer used by this index definition.
 * <p>
 * Starts from the configured default analyzer (falling back to
 * {@code LuceneIndexConstants.ANALYZER}). When path restrictions are
 * evaluated, the {@code ANCESTORS} field gets a dedicated
 * path-hierarchy analyzer via a {@code PerFieldAnalyzerWrapper} so that
 * ancestor-path queries tokenize on path separators. Finally, the result
 * is wrapped in a {@code LimitTokenCountAnalyzer} unless
 * {@code maxFieldLength} is negative, which disables the limit.
 */
private Analyzer createAnalyzer() {
    // containsKey + get collapsed into a single getOrDefault lookup.
    Analyzer defaultAnalyzer = analyzers.getOrDefault(LuceneIndexConstants.ANL_DEFAULT, LuceneIndexConstants.ANALYZER);
    Analyzer result;
    if (!evaluatePathRestrictions()) {
        result = defaultAnalyzer;
    } else {
        // Single-entry map: ImmutableMap.of is simpler than a builder here.
        Map<String, Analyzer> analyzerMap = ImmutableMap.<String, Analyzer>of(
                FieldNames.ANCESTORS,
                new TokenizerChain(new PathHierarchyTokenizerFactory(Collections.<String, String>emptyMap())));
        result = new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzerMap);
    }
    // In case of negative value no limits would be applied
    if (maxFieldLength < 0) {
        return result;
    }
    return new LimitTokenCountAnalyzer(result, maxFieldLength);
}
Also used : LimitTokenCountAnalyzer(org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer) PathHierarchyTokenizerFactory(org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory) TokenizerChain(org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain) LimitTokenCountAnalyzer(org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)

Example 3 with TokenizerChain

use of org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain in project jackrabbit-oak by apache.

In the class NodeStateAnalyzerFactoryTest, the method analyzerByComposition_TokenFilter:

@Test
public void analyzerByComposition_TokenFilter() throws Exception {
    // Analyzer config: whitespace tokenizer plus two token filters.
    NodeBuilder config = EMPTY_NODE.builder();
    config.child(ANL_TOKENIZER).setProperty(ANL_NAME, "whitespace");
    NodeBuilder filterNodes = config.child(ANL_FILTERS);
    filterNodes.setProperty(OAK_CHILD_ORDER, ImmutableList.of("stop", "LowerCase"), NAMES);
    NodeBuilder lowerCase = filterNodes.child("LowerCase");
    lowerCase.setProperty(ANL_NAME, "LowerCase");
    lowerCase.setProperty(JCR_PRIMARYTYPE, "nt:unstructured");
    // name is optional. Derived from nodeName
    filterNodes.child("stop").setProperty(ANL_LUCENE_MATCH_VERSION, Version.LUCENE_31.toString());

    TokenizerChain chain = (TokenizerChain) factory.createInstance(config.getNodeState());

    assertEquals(2, chain.getFilters().length);
    // Filters must appear in the declared child order: stop before LowerCase.
    assertEquals(StopFilterFactory.class.getName(), chain.getFilters()[0].getClassArg());
    assertEquals(LowerCaseFilterFactory.class.getName(), chain.getFilters()[1].getClassArg());
    // The explicit luceneMatchVersion on "stop" must be honoured.
    assertTrue(chain.getFilters()[0].isExplicitLuceneMatchVersion());
}
Also used : StopFilterFactory(org.apache.lucene.analysis.core.StopFilterFactory) TokenizerChain(org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain) LowerCaseFilterFactory(org.apache.lucene.analysis.core.LowerCaseFilterFactory) NodeBuilder(org.apache.jackrabbit.oak.spi.state.NodeBuilder) Test(org.junit.Test)

Example 4 with TokenizerChain

use of org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain in project jackrabbit-oak by apache.

In the class NodeStateAnalyzerFactoryTest, the method analyzerByComposition_FileResource:

@Test
public void analyzerByComposition_FileResource() throws Exception {
    // Analyzer config: whitespace tokenizer and a stop filter whose word
    // lists are supplied as two file-resource child nodes.
    NodeBuilder config = EMPTY_NODE.builder();
    config.child(ANL_TOKENIZER).setProperty(ANL_NAME, "whitespace");
    NodeBuilder filterNodes = config.child(ANL_FILTERS);
    // name is optional. Derived from nodeName
    NodeBuilder stopFilter = filterNodes.child("stop");
    stopFilter.setProperty("words", "set1.txt, set2.txt");
    createFileNode(stopFilter, "set1.txt", newCharArraySet("foo", "bar"));
    createFileNode(stopFilter, "set2.txt", newCharArraySet("foo1", "bar1"));

    TokenizerChain chain = (TokenizerChain) factory.createInstance(config.getNodeState());

    assertEquals(1, chain.getFilters().length);
    assertEquals(StopFilterFactory.class.getName(), chain.getFilters()[0].getClassArg());
    // Stop words from both resource files must be merged into one set.
    StopFilterFactory stopFactory = (StopFilterFactory) chain.getFilters()[0];
    assertTrue(stopFactory.getStopWords().contains("foo"));
    assertTrue(stopFactory.getStopWords().contains("foo1"));
}
Also used : StopFilterFactory(org.apache.lucene.analysis.core.StopFilterFactory) TokenizerChain(org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain) NodeBuilder(org.apache.jackrabbit.oak.spi.state.NodeBuilder) Test(org.junit.Test)

Example 5 with TokenizerChain

use of org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain in project jackrabbit-oak by apache.

In the class NodeStateAnalyzerFactoryTest, the method analyzerByComposition_Tokenizer:

@Test
public void analyzerByComposition_Tokenizer() throws Exception {
    // A "whitespace" tokenizer name resolves to WhitespaceTokenizerFactory.
    NodeBuilder config = EMPTY_NODE.builder();
    config.child(ANL_TOKENIZER).setProperty(ANL_NAME, "whitespace");
    TokenizerChain chain = (TokenizerChain) factory.createInstance(config.getNodeState());
    assertEquals(WhitespaceTokenizerFactory.class.getName(), chain.getTokenizer().getClassArg());

    // Reconfiguring the same node to "pathhierarchy" with a custom delimiter
    // resolves to PathHierarchyTokenizerFactory and carries the argument through.
    NodeBuilder tokenizerNode = config.child(ANL_TOKENIZER);
    tokenizerNode.setProperty(ANL_NAME, "pathhierarchy");
    tokenizerNode.setProperty("delimiter", "#");
    chain = (TokenizerChain) factory.createInstance(config.getNodeState());
    assertEquals(PathHierarchyTokenizerFactory.class.getName(), chain.getTokenizer().getClassArg());
    assertEquals('#', getValue(chain.getTokenizer(), "delimiter"));
}
Also used : PathHierarchyTokenizerFactory(org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory) TokenizerChain(org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain) NodeBuilder(org.apache.jackrabbit.oak.spi.state.NodeBuilder) WhitespaceTokenizerFactory(org.apache.lucene.analysis.core.WhitespaceTokenizerFactory) Test(org.junit.Test)

Aggregations

TokenizerChain (org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain)6 NodeBuilder (org.apache.jackrabbit.oak.spi.state.NodeBuilder)4 Test (org.junit.Test)4 StopFilterFactory (org.apache.lucene.analysis.core.StopFilterFactory)2 PathHierarchyTokenizerFactory (org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory)2 Analyzer (org.apache.lucene.analysis.Analyzer)1 HTMLStripCharFilterFactory (org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory)1 MappingCharFilterFactory (org.apache.lucene.analysis.charfilter.MappingCharFilterFactory)1 LowerCaseFilterFactory (org.apache.lucene.analysis.core.LowerCaseFilterFactory)1 WhitespaceTokenizerFactory (org.apache.lucene.analysis.core.WhitespaceTokenizerFactory)1 LimitTokenCountAnalyzer (org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer)1 PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)1 CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory)1 TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)1 TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory)1