Search in sources :

Example 6 with InvalidContentException

use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.

the class CeliNamedEntityExtractionEnhancementEngine method computeEnhancements.

/**
 * Extracts named entities from the plain text of the content item through the
 * CELI client and records each of them as a text annotation in the metadata
 * graph of the item.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text cannot be read from the content part or
 *         the CELI service call fails with an I/O or SOAP error
 * @throws IllegalStateException if no supported content part or no language is
 *         found for the content item
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (textPart == null) {
        final String problem = "No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!";
        throw new IllegalStateException(problem);
    }

    final String text;
    try {
        text = ContentItemHelper.getText(textPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().isEmpty()) {
        log.info("No text contained in ContentPart {" + textPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }

    final String language = EnhancementEngineHelper.getLanguage(ci);
    if (language == null) {
        final String problem = "Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!";
        throw new IllegalStateException(problem);
    }
    // language tag attached to the plain literals of the text annotations
    final Language literalLanguage = new Language(language);

    try {
        final List<NamedEntity> entities = this.client.extractEntities(text, language);
        final LiteralFactory typedLiterals = LiteralFactory.getInstance();
        final Graph metadata = ci.getMetadata();
        for (final NamedEntity entity : entities) {
            try {
                final IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // the selected text is stored as a plain literal in the language of the text
                metadata.add(new TripleImpl(annotation, ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(entity.getFormKind(), literalLanguage)));
                metadata.add(new TripleImpl(annotation, DC_TYPE, getEntityRefForType(entity.type)));
                if (entity.getFrom() == null || entity.getTo() == null) {
                    // without both offsets no position information is written
                    continue;
                }
                final int start = entity.getFrom().intValue();
                metadata.add(new TripleImpl(annotation, ENHANCER_START,
                        typedLiterals.createTypedLiteral(start)));
                metadata.add(new TripleImpl(annotation, ENHANCER_END,
                        typedLiterals.createTypedLiteral(entity.getTo().intValue())));
                final String context = getSelectionContext(text, entity.getFormKind(), start);
                metadata.add(new TripleImpl(annotation, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(context, literalLanguage)));
            } catch (NoConvertorException e) {
                // a literal conversion problem only affects this entity; continue with the next
                log.error(e.getMessage(), e);
            }
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI NER (Named Entity Recognition)" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI NER (Named Entity Recognition) service!", e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NoConvertorException(org.apache.clerezza.rdf.core.NoConvertorException) SOAPException(javax.xml.soap.SOAPException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 7 with InvalidContentException

use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.

the class CeliLanguageIdentifierEnhancementEngine method computeEnhancements.

/**
 * Asks the CELI language identifier for the language of the content item's
 * plain text and writes the first returned guess (language, confidence and
 * type) as a text enhancement into the metadata graph of the item.
 * <p>
 * If the service returns no guess at all, no enhancement is written.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text cannot be read from the content part or
 *         the CELI service call fails with an I/O or SOAP error
 * @throws IllegalStateException if no supported content part is found for the
 *         content item
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {" + contentPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }
    try {
        // texts with more than five space separated tokens are sent to
        // guessLanguage, shorter ones to guessQueryLanguage
        String[] tmps = text.split(" ");
        List<GuessedLanguage> lista;
        if (tmps.length > 5) {
            lista = this.client.guessLanguage(text);
        } else {
            lista = this.client.guessQueryLanguage(text);
        }
        // guard against a missing/empty result: get(0) below would otherwise
        // fail with a NullPointerException or IndexOutOfBoundsException
        if (lista == null || lista.isEmpty()) {
            log.info("No language guessed for ContentItem " + ci.getUri() + ": no language enhancement is added");
            return;
        }
        Graph g = ci.getMetadata();
        // in ENHANCE_ASYNC we need to use read/write locks on the ContentItem
        ci.getLock().writeLock().lock();
        try {
            // only the first entry of the returned list is written
            GuessedLanguage gl = lista.get(0);
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(gl.getLang())));
            g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(gl.getConfidence())));
            g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI language" + " identifier service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI language identifier service!", e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) SOAPException(javax.xml.soap.SOAPException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 8 with InvalidContentException

use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.

the class ZemantaEnhancementEngine method computeEnhancements.

/**
 * Sends the text of the content item to the Zemanta service and converts the
 * returned results into enhancements in the metadata graph of the item.
 * <p>
 * The remote results are first collected in a temporary graph; the metadata
 * graph is only modified while the write lock of the content item is held.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text cannot be read from the content part or
 *         the call to the Zemanta service fails with an I/O error
 * @throws IllegalStateException if no content part with a supported mime type
 *         is found for the content item
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        // message parts are separated by spaces so the concatenation stays readable
        throw new IllegalStateException("No ContentPart with a supported Mime Type " + "found for ContentItem " + ci.getUri() + " (supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was " + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.warn("ContentPart {} of ContentItem {} does not contain any text to enhance", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph graph = ci.getMetadata();
    IRI ciId = ci.getUri();
    // we need to store the results of Zemanta in an temp graph
    Graph results = new SimpleGraph();
    ZemantaAPIWrapper zemanta = new ZemantaAPIWrapper(key);
    try {
        results.addAll(zemanta.enhance(text));
    } catch (IOException e) {
        throw new EngineException("Unable to get Enhancement from remote Zemanta Service", e);
    }
    // now we need to process the results and convert them into the Enhancer
    // annotation structure
    ci.getLock().writeLock().lock();
    try {
        processRecognition(results, graph, text, ciId);
        processCategories(results, graph, ciId);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException)

Example 9 with InvalidContentException

use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.

the class NEREngineCore method computeEnhancements.

/**
 * Runs the configured name finder models for the language of the content item
 * and hands each model to {@code findNamedEntities}.
 * <p>
 * If an AnalysedText content part with tokens is present it is used as input
 * and the plain text is left {@code null}; otherwise the text is read from a
 * content part with a supported mime type. Default model types are only
 * processed when the configuration reports the language as processed;
 * language specific additional models are always tried, and a failure of one
 * additional model is logged and does not stop the remaining ones.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text cannot be read from the content part or
 *         a checked exception occurs while processing the models
 * @throws IllegalStateException if no language can be extracted, no NER model
 *         is configured for the language, or no supported content part exists
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // first check the language before processing the content (text)
    String language = extractLanguage(ci);
    if (language == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    if (!isNerModel(language)) {
        throw new IllegalStateException("For the language '" + language + "' of ContentItem " + ci.getUri() + " no NER model is configured: This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    // validate data in the AnalysedText
    final String text;
    if (at != null && at.getTokens().hasNext()) {
        // if the AnalysedText is present and tokens are present
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}", ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
        }
        // the AnalysedText is the input in this case; no plain text is passed on
        text = null;
    } else {
        // no AnalysedText with tokens ...
        // fallback to processing the plain text is still supported
        Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
        if (contentPart == null) {
            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
        }
        try {
            text = ContentItemHelper.getText(contentPart.getValue());
        } catch (IOException e) {
            throw new InvalidContentException(this, ci, e);
        }
        if (text.trim().length() == 0) {
            // TODO: make the length of the data a field of the ContentItem
            // interface to be able to filter out empty items in the canEnhance
            // method
            // two placeholders, two arguments: content part key and content item URI
            log.warn("ContentPart {} of ContentItem {} does not contain any text " + "to extract knowledge from", contentPart.getKey(), ci.getUri());
            return;
        }
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}", new Object[] { contentPart.getKey(), ci.getUri().getUnicodeString(), StringUtils.abbreviate(text, 100) });
        }
    }
    try {
        if (config.isProcessedLangage(language)) {
            for (String defaultModelType : config.getDefaultModelTypes()) {
                TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
                if (nameFinderModel == null) {
                    log.info("No NER Model for {} and language {} available!", defaultModelType, language);
                } else {
                    findNamedEntities(ci, at, text, language, nameFinderModel);
                }
            }
        }
        // process for additional models
        for (String additionalModel : config.getSpecificNerModles(language)) {
            TokenNameFinderModel nameFinderModel;
            try {
                nameFinderModel = openNLP.getModel(TokenNameFinderModel.class, additionalModel, null);
                findNamedEntities(ci, at, text, language, nameFinderModel);
            } catch (IOException e) {
                log.warn("Unable to load TokenNameFinderModel model for language '" + language + "' (model: " + additionalModel + ")", e);
            } catch (RuntimeException e) {
                // covers both loading the model and applying it to the content
                log.warn("Error while loading or applying TokenNameFinderModel for language '" + language + "' (model: " + additionalModel + ")", e);
            }
        }
    } catch (Exception e) {
        // runtime exceptions pass through unchanged; everything else is
        // reported as an EngineException for this engine and content item
        if (e instanceof RuntimeException) {
            throw (RuntimeException) e;
        } else {
            throw new EngineException(this, ci, e);
        }
    }
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) IRI(org.apache.clerezza.commons.rdf.IRI) TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) InvalidFormatException(opennlp.tools.util.InvalidFormatException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) IOException(java.io.IOException)

Example 10 with InvalidContentException

use of org.apache.stanbol.enhancer.servicesapi.InvalidContentException in project stanbol by apache.

the class CeliLemmatizerEnhancementEngine method computeEnhancements.

/**
 * Reads the plain text of the content item and adds either a complete
 * morphological analysis or a lemmatization enhancement to its metadata
 * graph, depending on the {@code completeMorphoAnalysis} setting.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text cannot be read from the content part
 * @throws IllegalStateException if the language of the content item is not
 *         supported or no supported content part is found
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final String language = EnhancementEngineHelper.getLanguage(ci);
    if (!isLangSupported(language)) {
        final String problem = "Call to computeEnhancement with unsupported language '" + language + " for ContentItem " + ci.getUri() + ": This is also checked " + "in the canEnhance method! -> This indicated an Bug in the " + "implementation of the " + "EnhancementJobManager!";
        throw new IllegalStateException(problem);
    }

    final Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (textPart == null) {
        final String problem = "No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!";
        throw new IllegalStateException(problem);
    }

    final String text;
    try {
        text = ContentItemHelper.getText(textPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().isEmpty()) {
        log.info("No text contained in ContentPart {" + textPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }

    final Graph metadata = ci.getMetadata();
    // the configuration flag selects which of the two enhancement kinds is written
    if (!this.completeMorphoAnalysis) {
        this.addLemmatizationEnhancement(ci, text, language, metadata);
    } else {
        this.addMorphoAnalysisEnhancement(ci, text, language, metadata);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) IOException(java.io.IOException)

Aggregations

IOException (java.io.IOException)14 IRI (org.apache.clerezza.commons.rdf.IRI)14 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)14 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)14 Graph (org.apache.clerezza.commons.rdf.Graph)10 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)10 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)7 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)7 SOAPException (javax.xml.soap.SOAPException)4 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)3 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)2 ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph)2 Language (org.apache.clerezza.commons.rdf.Language)2 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)2 NoConvertorException (org.apache.clerezza.rdf.core.NoConvertorException)2 FeatureStructure (org.apache.stanbol.commons.caslight.FeatureStructure)2 FeatureStructureListHolder (org.apache.stanbol.commons.caslight.FeatureStructureListHolder)2 NoSuchPartException (org.apache.stanbol.enhancer.servicesapi.NoSuchPartException)2 LangDetectException (com.cybozu.labs.langdetect.LangDetectException)1 Language (com.cybozu.labs.langdetect.Language)1