Search in sources :

Example 66 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project jackrabbit-oak by apache.

The class IndexWriterUtils, method getIndexWriterConfig.

/**
 * Builds the {@link IndexWriterConfig} for the given index definition.
 * <p>
 * The definition's analyzer is wrapped so that the spellcheck field always
 * uses a shingle analyzer, and (unless the definition says suggestions are
 * already analyzed) the suggest field uses the suggestion analyzer.
 *
 * @param definition the index definition supplying analyzer and codec
 * @param remoteDir  {@code true} when the index directory is remote; forces a serial merge scheduler
 * @return a fully configured {@code IndexWriterConfig}
 */
public static IndexWriterConfig getIndexWriterConfig(IndexDefinition definition, boolean remoteDir) {
    // FIXME: Hack needed to make Lucene work in an OSGi environment
    Thread currentThread = Thread.currentThread();
    ClassLoader originalLoader = currentThread.getContextClassLoader();
    currentThread.setContextClassLoader(IndexWriterConfig.class.getClassLoader());
    try {
        Analyzer defaultAnalyzer = definition.getAnalyzer();
        // Per-field overrides layered on top of the definition's analyzer.
        Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
        fieldAnalyzers.put(FieldNames.SPELLCHECK,
                new ShingleAnalyzerWrapper(LuceneIndexConstants.ANALYZER, 3));
        if (!definition.isSuggestAnalyzed()) {
            fieldAnalyzers.put(FieldNames.SUGGEST, SuggestHelper.getAnalyzer());
        }
        Analyzer wrapped = new PerFieldAnalyzerWrapper(defaultAnalyzer, fieldAnalyzers);
        IndexWriterConfig config = new IndexWriterConfig(VERSION, wrapped);
        if (remoteDir) {
            config.setMergeScheduler(new SerialMergeScheduler());
        }
        if (definition.getCodec() != null) {
            config.setCodec(definition.getCodec());
        }
        return config;
    } finally {
        // Always restore the caller's context classloader.
        currentThread.setContextClassLoader(originalLoader);
    }
}
Also used : SerialMergeScheduler(org.apache.lucene.index.SerialMergeScheduler) HashMap(java.util.HashMap) Analyzer(org.apache.lucene.analysis.Analyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) ShingleAnalyzerWrapper(org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)

Example 67 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project jena by apache.

The class TextIndexLuceneAssembler, method open.

/*
    <#index> a :TextIndexLucene ;
        #text:directory "mem" ;
        #text:directory "DIR" ;
        text:directory <file:DIR> ;
        text:entityMap <#endMap> ;
        .
    */
/**
 * Assembles a Lucene {@link TextIndex} from the RDF description rooted at {@code root}.
 * <p>
 * Requires exactly one {@code text:directory} property ("mem" literal for an
 * in-memory index, any other literal or a file: IRI for an on-disk index) and a
 * {@code text:entityMap}. Analyzer, query analyzer, query parser, multilingual
 * support and value storing are optional.
 *
 * @throws TextIndexException if a required property is missing or a property
 *         has the wrong node type
 */
@SuppressWarnings("resource")
@Override
public TextIndex open(Assembler a, Resource root, Mode mode) {
    try {
        if (!GraphUtils.exactlyOneProperty(root, pDirectory))
            throw new TextIndexException("No 'text:directory' property on " + root);
        // Resolve the Lucene Directory: "mem" literal -> RAM directory,
        // other literal -> filesystem path, resource -> file: IRI converted to a path.
        Directory directory;
        RDFNode n = root.getProperty(pDirectory).getObject();
        if (n.isLiteral()) {
            String literalValue = n.asLiteral().getLexicalForm();
            if (literalValue.equals("mem")) {
                directory = new RAMDirectory();
            } else {
                File dir = new File(literalValue);
                directory = FSDirectory.open(dir.toPath());
            }
        } else {
            Resource x = n.asResource();
            String path = IRILib.IRIToFilename(x.getURI());
            File dir = new File(path);
            directory = FSDirectory.open(dir.toPath());
        }
        // Optional analyzers: each is assembled from the resource the property points at.
        Analyzer analyzer = null;
        Statement analyzerStatement = root.getProperty(pAnalyzer);
        if (null != analyzerStatement) {
            Resource analyzerResource = requireResource(analyzerStatement.getObject(),
                    "Text analyzer property is not a resource : ");
            analyzer = (Analyzer) a.open(analyzerResource);
        }
        Analyzer queryAnalyzer = null;
        Statement queryAnalyzerStatement = root.getProperty(pQueryAnalyzer);
        if (null != queryAnalyzerStatement) {
            Resource analyzerResource = requireResource(queryAnalyzerStatement.getObject(),
                    "Text query analyzer property is not a resource : ");
            queryAnalyzer = (Analyzer) a.open(analyzerResource);
        }
        // Optional query parser: identified by the local name of the referenced resource.
        String queryParser = null;
        Statement queryParserStatement = root.getProperty(pQueryParser);
        if (null != queryParserStatement) {
            Resource parserResource = requireResource(queryParserStatement.getObject(),
                    "Text query parser property is not a resource : ");
            queryParser = parserResource.getLocalName();
        }
        // Optional boolean flags; both default to false when absent.
        boolean isMultilingualSupport =
                booleanValue(root.getProperty(pMultilingualSupport), "text:multilingualSupport", false);
        boolean storeValues =
                booleanValue(root.getProperty(pStoreValues), "text:storeValues", false);
        Resource r = GraphUtils.getResourceValue(root, pEntityMap);
        EntityDefinition docDef = (EntityDefinition) a.open(r);
        TextIndexConfig config = new TextIndexConfig(docDef);
        config.setAnalyzer(analyzer);
        config.setQueryAnalyzer(queryAnalyzer);
        config.setQueryParser(queryParser);
        config.setMultilingualSupport(isMultilingualSupport);
        config.setValueStored(storeValues);
        return TextDatasetFactory.createLuceneIndex(directory, config);
    } catch (IOException e) {
        IO.exception(e);
        return null;
    }
}

/**
 * Returns {@code node} as a Resource, or throws a {@link TextIndexException}
 * whose message is {@code errorPrefix} followed by the offending node.
 */
private static Resource requireResource(RDFNode node, String errorPrefix) {
    if (!node.isResource()) {
        throw new TextIndexException(errorPrefix + node);
    }
    return node.asResource();
}

/**
 * Reads a boolean literal from an optional statement.
 *
 * @param stmt         the statement, or {@code null} when the property is absent
 * @param propertyName property name used in the error message
 * @param dflt         value returned when {@code stmt} is {@code null}
 * @throws TextIndexException if the statement's object is not a literal
 */
private static boolean booleanValue(Statement stmt, String propertyName, boolean dflt) {
    if (stmt == null) {
        return dflt;
    }
    RDFNode node = stmt.getObject();
    if (!node.isLiteral()) {
        throw new TextIndexException(propertyName + " property must be a string : " + node);
    }
    return node.asLiteral().getBoolean();
}
Also used : Statement(org.apache.jena.rdf.model.Statement) Resource(org.apache.jena.rdf.model.Resource) IOException(java.io.IOException) Analyzer(org.apache.lucene.analysis.Analyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) File(java.io.File) RDFNode(org.apache.jena.rdf.model.RDFNode) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory)

Example 68 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project jena by apache.

The class Util, method getLocalizedAnalyzer.

/**
 * Returns the Analyzer registered for the given language tag, instantiating it
 * reflectively (no-arg constructor) and caching it on first use.
 *
 * @param lang language tag, may be {@code null}
 * @return the cached or freshly created Analyzer, or {@code null} when
 *         {@code lang} is {@code null}, unknown, or instantiation fails
 */
public static Analyzer getLocalizedAnalyzer(String lang) {
    if (lang == null)
        return null;
    // Single lookup instead of containsKey+get; the cache never stores nulls,
    // so a null result means "not cached yet".
    Analyzer cached = cache.get(lang);
    if (cached != null)
        return cached;
    try {
        Class<?> analyzerClass = analyzersClasses.get(lang);
        if (analyzerClass == null)
            // No analyzer registered for this language.
            return null;
        Constructor<?> constructor = analyzerClass.getConstructor();
        Analyzer analyzer = (Analyzer) constructor.newInstance();
        cache.put(lang, analyzer);
        return analyzer;
    } catch (Exception e) {
        // NOTE(review): reflection failures are swallowed and null is returned;
        // a logger would be preferable to printStackTrace, but none is visible here.
        e.printStackTrace();
        return null;
    }
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer)

Example 69 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project jackrabbit-oak by apache.

The class IndexDefinition, method createAnalyzer.

//~---------------------------------------------------< Analyzer >
/**
 * Builds the analyzer for this index definition.
 * <p>
 * Starts from the configured default analyzer (falling back to the global
 * one), adds a path-hierarchy analyzer for the ancestors field when path
 * restrictions are evaluated, and finally caps the token count unless
 * {@code maxFieldLength} is negative.
 */
private Analyzer createAnalyzer() {
    // Configured default analyzer, or the global fallback.
    Analyzer base = LuceneIndexConstants.ANALYZER;
    if (analyzers.containsKey(LuceneIndexConstants.ANL_DEFAULT)) {
        base = analyzers.get(LuceneIndexConstants.ANL_DEFAULT);
    }
    Analyzer result = base;
    if (evaluatePathRestrictions()) {
        // The ancestors field needs path-hierarchy tokenization for path lookups.
        Map<String, Analyzer> perField = ImmutableMap.<String, Analyzer>builder()
                .put(FieldNames.ANCESTORS,
                        new TokenizerChain(new PathHierarchyTokenizerFactory(Collections.<String, String>emptyMap())))
                .build();
        result = new PerFieldAnalyzerWrapper(base, perField);
    }
    // A negative value means "no limit".
    if (maxFieldLength < 0) {
        return result;
    }
    return new LimitTokenCountAnalyzer(result, maxFieldLength);
}
Also used : LimitTokenCountAnalyzer(org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer) PathHierarchyTokenizerFactory(org.apache.lucene.analysis.path.PathHierarchyTokenizerFactory) TokenizerChain(org.apache.jackrabbit.oak.plugins.index.lucene.util.TokenizerChain) LimitTokenCountAnalyzer(org.apache.lucene.analysis.miscellaneous.LimitTokenCountAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)

Example 70 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project jackrabbit-oak by apache.

The class DefaultAnalyzersConfigurationTest, method setUp.

@Before
public void setUp() throws Exception {
    // Four of the analyzers are plain keyword analyzers (whole input = one token);
    // build them through a shared factory instead of repeating the anonymous class.
    this.exactPathAnalyzer = keywordAnalyzer();
    this.parentPathIndexingAnalyzer = keywordAnalyzer();
    this.parentPathSearchingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Reverse the path, strip what is now the leading segment, reverse back:
            // yields the parent path of the input.
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new PatternReplaceFilter(filter, Pattern.compile("[^\\/]+\\/"), "", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.directChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            TokenStream filter = new ReverseStringFilter(Version.LUCENE_47, source);
            filter = new LengthFilter(Version.LUCENE_47, filter, 2, Integer.MAX_VALUE);
            filter = new PatternReplaceFilter(filter, Pattern.compile("([^\\/]+)(\\/)"), "$2", false);
            filter = new PatternReplaceFilter(filter, Pattern.compile("(\\/)(.+)"), "$2", false);
            filter = new ReverseStringFilter(Version.LUCENE_47, filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.directChildrenPathSearchingAnalyzer = keywordAnalyzer();
    this.allChildrenPathIndexingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // Emit every ancestor prefix of the path, de-duplicated.
            Tokenizer source = new PathHierarchyTokenizer(reader);
            TokenStream filter = new PatternCaptureGroupTokenFilter(source, false, Pattern.compile("((\\/).*)"));
            filter = new RemoveDuplicatesTokenFilter(filter);
            return new TokenStreamComponents(source, filter);
        }
    };
    this.allChildrenPathSearchingAnalyzer = keywordAnalyzer();
}

/**
 * Builds a fresh analyzer that emits the whole input as a single keyword token.
 */
private static Analyzer keywordAnalyzer() {
    return new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            Tokenizer source = new KeywordTokenizer(reader);
            return new TokenStreamComponents(source);
        }
    };
}
Also used : RemoveDuplicatesTokenFilter(org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter) TokenStream(org.apache.lucene.analysis.TokenStream) PathHierarchyTokenizer(org.apache.lucene.analysis.path.PathHierarchyTokenizer) Reader(java.io.Reader) StringReader(java.io.StringReader) Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) PatternCaptureGroupTokenFilter(org.apache.lucene.analysis.pattern.PatternCaptureGroupTokenFilter) LengthFilter(org.apache.lucene.analysis.miscellaneous.LengthFilter) ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) PathHierarchyTokenizer(org.apache.lucene.analysis.path.PathHierarchyTokenizer) PatternReplaceFilter(org.apache.lucene.analysis.pattern.PatternReplaceFilter) Before(org.junit.Before)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer)1020 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)396 Tokenizer (org.apache.lucene.analysis.Tokenizer)265 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)228 Document (org.apache.lucene.document.Document)207 Directory (org.apache.lucene.store.Directory)192 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)176 BytesRef (org.apache.lucene.util.BytesRef)122 Test (org.junit.Test)119 TokenStream (org.apache.lucene.analysis.TokenStream)107 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)92 Term (org.apache.lucene.index.Term)92 IndexReader (org.apache.lucene.index.IndexReader)67 InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator)65 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)64 Input (org.apache.lucene.search.suggest.Input)63 CharArraySet (org.apache.lucene.analysis.CharArraySet)58 ArrayList (java.util.ArrayList)57 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)57 TextField (org.apache.lucene.document.TextField)55