Example 36 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in project lucene-solr by apache.

The class PayloadUtils, method getPayloadEncoder.

public static String getPayloadEncoder(FieldType fieldType) {
    // TODO: support custom payload encoding fields too somehow - maybe someone has a custom component that encodes payloads as floats
    String encoder = null;
    Analyzer a = fieldType.getIndexAnalyzer();
    if (a instanceof TokenizerChain) {
        // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory or NumericPayloadTokenFilterFactory
        TokenizerChain tc = (TokenizerChain) a;
        TokenFilterFactory[] factories = tc.getTokenFilterFactories();
        for (TokenFilterFactory factory : factories) {
            if (factory instanceof DelimitedPayloadTokenFilterFactory) {
                encoder = factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR);
                break;
            }
            if (factory instanceof NumericPayloadTokenFilterFactory) {
                // encodes using `PayloadHelper.encodeFloat(payload)`
                encoder = "float";
                break;
            }
        }
    }
    return encoder;
}
Also used: TokenizerChain (org.apache.solr.analysis.TokenizerChain), DelimitedPayloadTokenFilterFactory (org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory), NumericPayloadTokenFilterFactory (org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory), Analyzer (org.apache.lucene.analysis.Analyzer), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)
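
The string returned by getPayloadEncoder is the raw value of the factory's "encoder" argument ("float", "integer", or "identity"), not an encoder instance. Below is a minimal sketch of how a caller might map that name to a concrete PayloadEncoder; the PayloadEncoders helper is hypothetical, but FloatEncoder, IntegerEncoder, and IdentityEncoder are the real implementations in org.apache.lucene.analysis.payloads.

import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.IdentityEncoder;
import org.apache.lucene.analysis.payloads.IntegerEncoder;
import org.apache.lucene.analysis.payloads.PayloadEncoder;

public final class PayloadEncoders {

    // Hypothetical helper (not part of Solr): resolve the encoder name returned by
    // PayloadUtils.getPayloadEncoder(..) to a concrete PayloadEncoder. A null name
    // means no payload-producing filter was found in the analysis chain.
    public static PayloadEncoder forName(String encoder) {
        if (encoder == null) {
            return null;
        }
        switch (encoder) {
            case "float":
                // also the encoding implied by NumericPayloadTokenFilterFactory
                return new FloatEncoder();
            case "integer":
                return new IntegerEncoder();
            case "identity":
                return new IdentityEncoder();
            default:
                throw new IllegalArgumentException("Unknown payload encoder: " + encoder);
        }
    }
}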

Example 37 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in project lucene-solr by apache.

The class AnalyzerFactory, method toString.

@Override
public String toString() {
    StringBuilder sb = new StringBuilder("AnalyzerFactory(");
    if (null != name) {
        sb.append("name:");
        sb.append(name);
        sb.append(", ");
    }
    if (null != positionIncrementGap) {
        sb.append("positionIncrementGap:");
        sb.append(positionIncrementGap);
        sb.append(", ");
    }
    if (null != offsetGap) {
        sb.append("offsetGap:");
        sb.append(offsetGap);
        sb.append(", ");
    }
    for (CharFilterFactory charFilterFactory : charFilterFactories) {
        sb.append(charFilterFactory);
        sb.append(", ");
    }
    sb.append(tokenizerFactory);
    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
        sb.append(", ");
        sb.append(tokenFilterFactory);
    }
    sb.append(')');
    return sb.toString();
}
Also used: CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)
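
Illustrative only: the method renders the optional name and gaps first, then the char filters, the tokenizer, and the token filters in analysis order, producing output along these lines (each component renders via its own toString(), so factories that do not override it will print as className@hash):

AnalyzerFactory(name:text_shingles, positionIncrementGap:100, <char filter>, <tokenizer>, <token filter>, <token filter>)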

Example 38 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in project lucene-solr by apache.

The class TestAllAnalyzersHaveFactories, method test.

public void test() throws Exception {
    List<Class<?>> analysisClasses = TestRandomChains.getClassesForPackage("org.apache.lucene.analysis");
    for (final Class<?> c : analysisClasses) {
        final int modifiers = c.getModifiers();
        // don't waste time with abstract classes
        if (Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers)
                || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface()
                || testComponents.contains(c) || crazyComponents.contains(c)
                || oddlyNamedComponents.contains(c) || tokenFiltersWithoutFactory.contains(c)
                // deprecated ones are typically back compat hacks
                || c.isAnnotationPresent(Deprecated.class)
                || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c)
                        || CharFilter.class.isAssignableFrom(c))) {
            continue;
        }
        Map<String, String> args = new HashMap<>();
        args.put("luceneMatchVersion", Version.LATEST.toString());
        if (Tokenizer.class.isAssignableFrom(c)) {
            String clazzName = c.getSimpleName();
            assertTrue(clazzName.endsWith("Tokenizer"));
            String simpleName = clazzName.substring(0, clazzName.length() - 9);
            assertNotNull(TokenizerFactory.lookupClass(simpleName));
            TokenizerFactory instance = null;
            try {
                instance = TokenizerFactory.forName(simpleName, args);
                assertNotNull(instance);
                if (instance instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) instance).inform(loader);
                }
                assertSame(c, instance.create().getClass());
            } catch (IllegalArgumentException e) {
                // TODO: pass for now, because some factories do not yet have a default config that always works
            }
        } else if (TokenFilter.class.isAssignableFrom(c)) {
            String clazzName = c.getSimpleName();
            assertTrue(clazzName.endsWith("Filter"));
            String simpleName = clazzName.substring(0, clazzName.length() - (clazzName.endsWith("TokenFilter") ? 11 : 6));
            assertNotNull(TokenFilterFactory.lookupClass(simpleName));
            TokenFilterFactory instance = null;
            try {
                instance = TokenFilterFactory.forName(simpleName, args);
                assertNotNull(instance);
                if (instance instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) instance).inform(loader);
                }
                Class<? extends TokenStream> createdClazz = instance.create(new KeywordTokenizer()).getClass();
                // only check the created instance if the factory actually wrapped the input at all
                if (KeywordTokenizer.class != createdClazz) {
                    assertSame(c, createdClazz);
                }
            } catch (IllegalArgumentException e) {
                // TODO: pass for now, because some factories do not yet have a default config that always works
            }
        } else if (CharFilter.class.isAssignableFrom(c)) {
            String clazzName = c.getSimpleName();
            assertTrue(clazzName.endsWith("CharFilter"));
            String simpleName = clazzName.substring(0, clazzName.length() - 10);
            assertNotNull(CharFilterFactory.lookupClass(simpleName));
            CharFilterFactory instance = null;
            try {
                instance = CharFilterFactory.forName(simpleName, args);
                assertNotNull(instance);
                if (instance instanceof ResourceLoaderAware) {
                    ((ResourceLoaderAware) instance).inform(loader);
                }
                Class<? extends Reader> createdClazz = instance.create(new StringReader("")).getClass();
                // only check the created instance if the factory actually wrapped the input at all
                if (StringReader.class != createdClazz) {
                    assertSame(c, createdClazz);
                }
            } catch (IllegalArgumentException e) {
                // TODO: pass for now, because some factories do not yet have a default config that always works
            }
        }
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory), HashMap (java.util.HashMap), IdentityHashMap (java.util.IdentityHashMap), CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory), StringReader (java.io.StringReader), ResourceLoaderAware (org.apache.lucene.analysis.util.ResourceLoaderAware), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), ReversePathHierarchyTokenizer (org.apache.lucene.analysis.path.ReversePathHierarchyTokenizer), TeeSinkTokenFilter (org.apache.lucene.analysis.sinks.TeeSinkTokenFilter), CachingTokenFilter (org.apache.lucene.analysis.CachingTokenFilter), MockGraphTokenFilter (org.apache.lucene.analysis.MockGraphTokenFilter), ValidatingTokenFilter (org.apache.lucene.analysis.ValidatingTokenFilter), MockRandomLookaheadTokenFilter (org.apache.lucene.analysis.MockRandomLookaheadTokenFilter), CrankyTokenFilter (org.apache.lucene.analysis.CrankyTokenFilter), MockTokenFilter (org.apache.lucene.analysis.MockTokenFilter), TokenFilter (org.apache.lucene.analysis.TokenFilter), MockHoleInjectingTokenFilter (org.apache.lucene.analysis.MockHoleInjectingTokenFilter)
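
The test encodes the analysis SPI naming convention: a component class FooBarFilter is expected to have a factory registered under the name "FooBar" (lookup is case-insensitive), constructible via forName with a luceneMatchVersion argument. A minimal standalone sketch of that lookup, assuming the stock LowerCaseFilter from the common analyzers module:

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.util.Version;

public class SpiLookupSketch {

    public static void main(String[] argv) throws Exception {
        Map<String, String> args = new HashMap<>();
        args.put("luceneMatchVersion", Version.LATEST.toString());
        // "LowerCase" is LowerCaseFilter with the "Filter" suffix stripped, exactly
        // the derivation the test above performs on each discovered class
        TokenFilterFactory factory = TokenFilterFactory.forName("LowerCase", args);
        // wrap a trivial source stream, as the test does with KeywordTokenizer
        TokenStream stream = factory.create(new KeywordTokenizer());
        System.out.println(factory.getClass().getSimpleName() + " -> " + stream.getClass().getSimpleName());
    }
}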

Example 39 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in project tika by apache.

The class AnalyzerDeserializer, method buildAnalyzer.

public static Analyzer buildAnalyzer(String analyzerName, JsonElement value, int maxTokens) throws IOException {
    if (!value.isJsonObject()) {
        throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters");
    }
    JsonObject aRoot = (JsonObject) value;
    CharFilterFactory[] charFilters = new CharFilterFactory[0];
    TokenizerFactory tokenizerFactory = null;
    TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0];
    for (Map.Entry<String, JsonElement> e : aRoot.entrySet()) {
        String k = e.getKey();
        if (k.equals(CHAR_FILTERS)) {
            charFilters = buildCharFilters(e.getValue(), analyzerName);
        } else if (k.equals(TOKEN_FILTERS)) {
            tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName, maxTokens);
        } else if (k.equals(TOKENIZER)) {
            tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName);
        } else if (!k.equals(COMMENT)) {
            throw new IllegalArgumentException("Should have one of three values here:" + CHAR_FILTERS + ", " + TOKENIZER + ", " + TOKEN_FILTERS + ". I don't recognize: " + k);
        }
    }
    if (tokenizerFactory == null) {
        throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!");
    }
    return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories);
}
Also used: TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory), JsonElement (com.google.gson.JsonElement), CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory), JsonObject (com.google.gson.JsonObject), HashMap (java.util.HashMap), Map (java.util.Map), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory)
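
A sketch of driving buildAnalyzer directly. The lowercase key names and the per-factory "factory"/"params" object format are assumptions about what the CHAR_FILTERS/TOKENIZER/TOKEN_FILTERS constants and the build* helpers expect; check AnalyzerDeserializer for the actual values:

import com.google.gson.JsonElement;
import com.google.gson.JsonParser;
import org.apache.lucene.analysis.Analyzer;

public class BuildAnalyzerSketch {

    public static void main(String[] argv) throws Exception {
        // the tokenizer entry is mandatory; char filters and token filters default to empty arrays
        String json = "{"
                + " \"tokenizer\":    { \"factory\": \"standard\", \"params\": {} },"
                + " \"tokenfilters\": [ { \"factory\": \"lowercase\", \"params\": {} } ]"
                + "}";
        JsonElement root = new JsonParser().parse(json);
        Analyzer analyzer = AnalyzerDeserializer.buildAnalyzer("my_analyzer", root, 10000);
        System.out.println(analyzer);
    }
}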

Example 40 with TokenFilterFactory

Use of org.apache.lucene.analysis.util.TokenFilterFactory in project stanbol by apache.

The class KuromojiNlpEngine, method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    //start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    //build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    //Try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    //NER data
    List<NerData> nerList = new ArrayList<NerData>();
    //the next index where the NerData.context need to be set
    int nerSentIndex = 0;
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        //required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            //Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            //Sentence detection by POS tag
            if (sentStartOffset < 0) {
                //the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                //add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            //POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            //NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                //write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                //NOTE that the fise:TextAnnotations are written later, based on the nerList
                //clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                //and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
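            // NOTE: the inflection form/type are read here but their values are currently
            // discarded; only the base form (above) feeds the MorphoFeatures annotation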
            if (morpho != null) {
                //if present add the morpho
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        //we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        //and set the context of the remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                //no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            /* ignore */
        }
    }
    //finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used: NerTag (org.apache.stanbol.enhancer.nlp.ner.NerTag), IRI (org.apache.clerezza.commons.rdf.IRI), TokenStream (org.apache.lucene.analysis.TokenStream), ArrayList (java.util.ArrayList), EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), Token (org.apache.stanbol.enhancer.nlp.model.Token), NlpEngineHelper.initAnalysedText (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText), AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText), CharSequenceReader (org.apache.commons.io.input.CharSequenceReader), PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag), Language (org.apache.clerezza.commons.rdf.Language), NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage), BaseFormAttribute (org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl), MorphoFeatures (org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures), Sentence (org.apache.stanbol.enhancer.nlp.model.Sentence), InflectionAttribute (org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute), PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl), PartOfSpeechAttribute (org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute), IOException (java.io.IOException), Chunk (org.apache.stanbol.enhancer.nlp.model.Chunk), TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory), Graph (org.apache.clerezza.commons.rdf.Graph), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
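
For context, a minimal sketch of how a Kuromoji chain like the one consumed above could be assembled from Lucene's SPI factories (Lucene 4.x-style create(Reader), matching the code above). The specific factory names and filter selection are illustrative assumptions; the engine actually receives tokenizerFactory and filterFactories through its component configuration:

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.Version;

public class KuromojiChainSketch {

    public static void main(String[] argv) throws Exception {
        Map<String, String> base = new HashMap<>();
        base.put("luceneMatchVersion", Version.LATEST.toString());
        // "japanese" is the SPI name of JapaneseTokenizerFactory (kuromoji module);
        // factories implementing ResourceLoaderAware may additionally need inform(..)
        TokenizerFactory tokenizerFactory = TokenizerFactory.forName("japanese", new HashMap<>(base));
        List<TokenFilterFactory> filterFactories = Arrays.asList(
                TokenFilterFactory.forName("japaneseBaseForm", new HashMap<>(base)),
                TokenFilterFactory.forName("cjkWidth", new HashMap<>(base)));
        TokenStream tokenStream = tokenizerFactory.create(new StringReader("今日は良い天気です。"));
        for (TokenFilterFactory filterFactory : filterFactories) {
            // same wrapping loop as in computeEnhancements(..)
            tokenStream = filterFactory.create(tokenStream);
        }
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            // attributes (offsets, POS, base form, ...) would be read here, as above
        }
        tokenStream.close();
    }
}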

Aggregations

TokenFilterFactory (org.apache.lucene.analysis.util.TokenFilterFactory): 40 usages
CharFilterFactory (org.apache.lucene.analysis.util.CharFilterFactory): 16 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 12 usages
TokenizerChain (org.apache.solr.analysis.TokenizerChain): 12 usages
TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory): 11 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 10 usages
ArrayList (java.util.ArrayList): 7 usages
HashMap (java.util.HashMap): 7 usages
Tokenizer (org.apache.lucene.analysis.Tokenizer): 6 usages
MultiTermAwareComponent (org.apache.lucene.analysis.util.MultiTermAwareComponent): 5 usages
IOException (java.io.IOException): 4 usages
StringReader (java.io.StringReader): 4 usages
Test (org.junit.Test): 4 usages
Reader (java.io.Reader): 3 usages
Map (java.util.Map): 3 usages
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 3 usages
KeywordTokenizerFactory (org.apache.lucene.analysis.core.KeywordTokenizerFactory): 3 usages
ResourceLoaderAware (org.apache.lucene.analysis.util.ResourceLoaderAware): 3 usages
SolrException (org.apache.solr.common.SolrException): 3 usages
JsonElement (com.google.gson.JsonElement): 2 usages