Search in sources :

Example 11 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class CeliClassificationEnhancementEngineTest method tesetEngine.

/**
 * Runs the classification engine on the test text and validates the
 * resulting enhancements: exactly one text annotation and at least one
 * topic annotation are expected.
 * <p>
 * An {@link EngineException} is handed to
 * {@code RemoteServiceHelper.checkServiceUnavailable(..)} instead of being
 * propagated directly.
 */
@Test
public void tesetEngine() throws Exception {
    ContentItem item = wrapAsContentItem(TEXT);
    try {
        // Declare the language of the test content statically ("fr") so
        // this test does not rely on the output of another engine / test
        // (e.g. CeliLanguageIdentifierEnhancementEngineTest).
        item.getMetadata().add(
                new TripleImpl(item.getUri(), DC_LANGUAGE, new PlainLiteralImpl("fr")));

        classificationEngine.computeEnhancements(item);
        TestUtils.logEnhancements(item);

        // property values every created annotation has to carry
        HashMap<IRI, RDFTerm> expected = new HashMap<IRI, RDFTerm>();
        expected.put(Properties.ENHANCER_EXTRACTED_FROM, item.getUri());
        RDFTerm creator = LiteralFactory.getInstance()
                .createTypedLiteral(classificationEngine.getClass().getName());
        expected.put(Properties.DC_CREATOR, creator);

        int textAnnotationCount = EnhancementStructureHelper
                .validateAllTextAnnotations(item.getMetadata(), TEXT, expected);
        assertEquals("Only a single fise:TextAnnotation is expeted", 1, textAnnotationCount);

        int topicAnnotationCount = validateAllTopicAnnotations(item.getMetadata(), expected);
        assertTrue("No TpocisAnnotations found", topicAnnotationCount > 0);
    } catch (EngineException e) {
        // let the helper decide whether this failure is acceptable
        RemoteServiceHelper.checkServiceUnavailable(e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) HashMap(java.util.HashMap) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 12 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class CeliAnalyzedTextLemmatizerEngine method computeEnhancements.

/**
 * Adds morphological annotations to the tokens of the {@code AnalysedText}
 * of the parsed content item, based on the lexical entries returned by the
 * CELI lemmatizer client.
 * <p>
 * For every lexical entry with at least one reading the token covering the
 * entry is looked up (or created) and one
 * {@code NlpAnnotations.MORPHO_ANNOTATION} is added for each reading that
 * can be parsed to {@code MorphoFeatures}. The probability of such an
 * annotation is the highest probability recorded for a mapped POS
 * annotation of <b>the same token</b> that shares a lexical category with
 * the reading, or {@code Value.UNKNOWN_PROBABILITY} if there is none.
 *
 * @param ci the content item to process
 * @throws EngineException if calling the CELI service fails with an
 *         {@code IOException} or a {@code SOAPException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // NOTE(review): the helpers are called with 'true'; presumably they
    // throw if the AnalysedText / language is missing or not configured
    // - confirm against NlpEngineHelper.
    AnalysedText at = getAnalysedText(this, ci, true);
    String language = getLanguage(this, ci, true);
    isLangaugeConfigured(this, languageConfig, language, true);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(at.getSpan(), language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    // Lexical categories (with probability) of the POS annotations of the
    // token currently processed. The map is allocated once and reused.
    Map<LexicalCategory, Double> tokenLexCats = new EnumMap<LexicalCategory, Double>(LexicalCategory.class);
    for (LexicalEntry term : terms) {
        if (term.getTermReadings().isEmpty()) {
            // ignore terms without readings
            continue;
        }
        // Add the LexicalEntry as Token to the Text. NOTE that if a
        // Token with the same start/end positions already exist this
        // Method returns the existing instance
        Token token = at.addToken(term.getFrom(), term.getTo());
        // Reset the per-token state: without this, categories (and their
        // probabilities) collected for previous tokens would be applied to
        // this token and would block its own values from being stored.
        tokenLexCats.clear();
        // Now try to get POS annotations for the Token
        for (Value<PosTag> posAnno : token.getAnnotations(NlpAnnotations.POS_ANNOTATION)) {
            if (posAnno.value().isMapped()) {
                for (LexicalCategory cat : posAnno.value().getCategories()) {
                    if (!tokenLexCats.containsKey(cat)) {
                        // do not override with lower prob
                        tokenLexCats.put(cat, posAnno.probability());
                    }
                }
            }
        }
        for (Reading reading : term.getTermReadings()) {
            MorphoFeatures mf = CeliMorphoFeatures.parseFrom(reading, language);
            // add the readings (MorphoFeatures)
            if (mf != null) {
                // use the POS tags of the morpho analysis and compare it
                // with existing POS tags.
                double posProbability = -1;
                Set<LexicalCategory> mfCats = EnumSet.noneOf(LexicalCategory.class);
                for (PosTag mfPos : mf.getPosList()) {
                    mfCats.addAll(mfPos.getCategories());
                }
                for (LexicalCategory mfCat : mfCats) {
                    Double prob = tokenLexCats.get(mfCat);
                    if (prob != null && posProbability < prob) {
                        posProbability = prob;
                    }
                }
                // add the morpho features with the posProbabiliy; a negative
                // value means no matching POS category was found
                Value<MorphoFeatures> value = Value.value(mf, posProbability < 0 ? Value.UNKNOWN_PROBABILITY : posProbability);
                token.addAnnotation(NlpAnnotations.MORPHO_ANNOTATION, value);
            }
        }
    }
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) IOException(java.io.IOException) LexicalCategory(org.apache.stanbol.enhancer.nlp.pos.LexicalCategory) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) SOAPException(javax.xml.soap.SOAPException) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) CeliMorphoFeatures(org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures) EnumMap(java.util.EnumMap)

Example 13 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class CeliNamedEntityExtractionEnhancementEngine method computeEnhancements.

/**
 * Sends the plain text of the content item to the CELI NER client and adds
 * a text enhancement to the metadata of the content item for every named
 * entity that is returned.
 *
 * @param ci the content item to enhance
 * @throws EngineException if the text can not be read from the content
 *         part or if the call to the CELI service fails
 * @throws IllegalStateException if no supported content part or no
 *         language is available for the content item
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> part = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (part == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
                + "' found for ContentItem " + ci.getUri()
                + ": This is also checked in the canEnhance method! -> This "
                + "indicated an Bug in the implementation of the "
                + "EnhancementJobManager!");
    }

    final String text;
    try {
        text = ContentItemHelper.getText(part.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().isEmpty()) {
        // nothing to analyse
        log.info("No text contained in ContentPart {" + part.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }

    String langCode = EnhancementEngineHelper.getLanguage(ci);
    if (langCode == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri()
                + ": This is also checked in the canEnhance "
                + "method! -> This indicated an Bug in the implementation of the "
                + "EnhancementJobManager!");
    }
    // language tag for the plain literals written to the text enhancements
    Language literalLanguage = new Language(langCode);

    try {
        List<NamedEntity> entities = this.client.extractEntities(text, langCode);
        LiteralFactory literals = LiteralFactory.getInstance();
        Graph metadata = ci.getMetadata();
        for (NamedEntity entity : entities) {
            try {
                IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // selected text, tagged with the language of the content
                metadata.add(new TripleImpl(annotation, ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(entity.getFormKind(), literalLanguage)));
                metadata.add(new TripleImpl(annotation, DC_TYPE, getEntityRefForType(entity.type)));
                if (entity.getFrom() == null || entity.getTo() == null) {
                    // no position information: start/end/context are omitted
                    continue;
                }
                metadata.add(new TripleImpl(annotation, ENHANCER_START,
                        literals.createTypedLiteral(entity.getFrom().intValue())));
                metadata.add(new TripleImpl(annotation, ENHANCER_END,
                        literals.createTypedLiteral(entity.getTo().intValue())));
                metadata.add(new TripleImpl(annotation, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(
                                getSelectionContext(text, entity.getFormKind(), entity.getFrom().intValue()),
                                literalLanguage)));
            } catch (NoConvertorException e) {
                // only logged; processing continues with the next entity
                log.error(e.getMessage(), e);
            }
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI NER (Named Entity Recognition)" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI NER (Named Entity Recognition) service!", e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NoConvertorException(org.apache.clerezza.rdf.core.NoConvertorException) SOAPException(javax.xml.soap.SOAPException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 14 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class NEREngineCore method computeEnhancements.

/**
 * Runs named entity recognition for the parsed content item.
 * <p>
 * If an {@code AnalysedText} with tokens is available it is used as input;
 * otherwise the plain text content part is read and processed instead. The
 * default model types (if the language is configured as processed) and all
 * language specific models of the configuration are applied via
 * {@code findNamedEntities(..)}.
 *
 * @param ci the content item to process
 * @throws EngineException if the text can not be read from the content
 *         part or if a checked exception occurs while extracting entities
 * @throws IllegalStateException if no language, no NER model for the
 *         language or no supported content part is available
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // first check the langauge before processing the content (text)
    String language = extractLanguage(ci);
    if (language == null) {
        throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    if (!isNerModel(language)) {
        throw new IllegalStateException("For the language '" + language + "' of ContentItem " + ci.getUri() + " no NER model is configured: This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
    }
    final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    // validate data in the AnalysedText
    final String text;
    if (at != null && at.getTokens().hasNext()) {
        // if the AnalysedText is present and tokens are present
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}", ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
        }
        // the AnalysedText is used as input, so no plain text is needed
        text = null;
    } else {
        // no AnalysedText with tokens ...
        // fallback to processing the plain text is still supported
        Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
        if (contentPart == null) {
            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
        }
        try {
            text = ContentItemHelper.getText(contentPart.getValue());
        } catch (IOException e) {
            throw new InvalidContentException(this, ci, e);
        }
        if (text.trim().length() == 0) {
            // TODO: make the length of the data a field of the ContentItem
            // interface to be able to filter out empty items in the canEnhance
            // method
            // (message had 3 placeholders for 2 arguments and a missing space)
            log.warn("ContentPart {} of ContentItem {} does not contain any text " + "to extract knowledge from", contentPart.getKey(), ci.getUri());
            return;
        }
        if (log.isDebugEnabled()) {
            log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}", new Object[] { contentPart.getKey(), ci.getUri().getUnicodeString(), StringUtils.abbreviate(text, 100) });
        }
    }
    try {
        if (config.isProcessedLangage(language)) {
            for (String defaultModelType : config.getDefaultModelTypes()) {
                TokenNameFinderModel nameFinderModel = openNLP.getNameModel(defaultModelType, language);
                if (nameFinderModel == null) {
                    log.info("No NER Model for {} and language {} available!", defaultModelType, language);
                } else {
                    findNamedEntities(ci, at, text, language, nameFinderModel);
                }
            }
        }
        // process for additional models
        for (String additionalModel : config.getSpecificNerModles(language)) {
            TokenNameFinderModel nameFinderModel;
            try {
                nameFinderModel = openNLP.getModel(TokenNameFinderModel.class, additionalModel, null);
                findNamedEntities(ci, at, text, language, nameFinderModel);
            } catch (IOException e) {
                log.warn("Unable to load TokenNameFinderModel model for language '" + language + "' (model: " + additionalModel + ")", e);
            } catch (RuntimeException e) {
                // a failure of one additional model must not prevent the
                // remaining models from being applied
                log.warn("Error while loading or applying TokenNameFinderModel for language '" + language + "' (model: " + additionalModel + ")", e);
            }
        }
    } catch (Exception e) {
        // unchecked exceptions are propagated unchanged; everything else
        // is reported as EngineException
        if (e instanceof RuntimeException) {
            throw (RuntimeException) e;
        } else {
            throw new EngineException(this, ci, e);
        }
    }
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) IRI(org.apache.clerezza.commons.rdf.IRI) TokenNameFinderModel(opennlp.tools.namefind.TokenNameFinderModel) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) InvalidFormatException(opennlp.tools.util.InvalidFormatException) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) IOException(java.io.IOException)

Example 15 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class RestfulLangidentEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method posts the plain text Blob of the content item to the
 * configured RESTful Language Identification Service. For every language
 * suggestion returned, a text enhancement with the language, the
 * linguistic-system type and (if available) the probability as confidence
 * is added to the metadata while holding the write lock of the content item.
 *
 * @param ci the content item to process
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if executing the request on the language identification
 *          service fails with an {@link IOException}
 */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    // get the plain text Blob
    // NOTE(review): getPlainText is called with 'false'; if that means "do
    // not throw when no plain text part exists" textBlob may be null here
    // - confirm against NlpEngineHelper.
    Map.Entry<IRI, Blob> textBlob = getPlainText(this, ci, false);
    Blob blob = textBlob.getValue();
    // send the text to the server
    final HttpPost request = new HttpPost(serviceUrl);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    // execute the request
    List<LangSuggestion> detected;
    try {
        detected = AccessController.doPrivileged(new PrivilegedExceptionAction<List<LangSuggestion>>() {

            public List<LangSuggestion> run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new LangIdentResponseHandler(ci, objectMapper));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        // ClientProtocolException is a subclass of IOException, so a single
        // check covers both checked exceptions declared by run()
        if (e instanceof IOException) {
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
        } else {
            throw RuntimeException.class.cast(e);
        }
    }
    Graph metadata = ci.getMetadata();
    log.debug("Detected Languages for ContentItem {} and Blob {}", ci.getUri(), textBlob.getKey());
    ci.getLock().writeLock().lock();
    try {
        // write TextAnnotations for the detected languages
        for (LangSuggestion suggestion : detected) {
            // add a hypothesis
            log.debug(" > {}@{}", suggestion.getLanguage(), suggestion.hasProbability() ? suggestion.getProbability() : "-,--");
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(suggestion.getLanguage())));
            metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            if (suggestion.hasProbability()) {
                metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getProbability())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) HttpPost(org.apache.http.client.methods.HttpPost) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PrivilegedActionException(java.security.PrivilegedActionException) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) URISyntaxException(java.net.URISyntaxException) ConfigurationException(org.osgi.service.cm.ConfigurationException) HttpException(org.apache.http.HttpException) ClientProtocolException(org.apache.http.client.ClientProtocolException) PrivilegedActionException(java.security.PrivilegedActionException) HttpResponseException(org.apache.http.client.HttpResponseException) IOException(java.io.IOException) InputStreamEntity(org.apache.http.entity.InputStreamEntity) ClientProtocolException(org.apache.http.client.ClientProtocolException) Graph(org.apache.clerezza.commons.rdf.Graph) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)55 IRI (org.apache.clerezza.commons.rdf.IRI)37 IOException (java.io.IOException)33 Graph (org.apache.clerezza.commons.rdf.Graph)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)23 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)20 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)15 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)15 HashMap (java.util.HashMap)13 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)13 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)12 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)10 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)10 Test (org.junit.Test)10 Triple (org.apache.clerezza.commons.rdf.Triple)9 InputStream (java.io.InputStream)8 SOAPException (javax.xml.soap.SOAPException)8 Token (org.apache.stanbol.enhancer.nlp.model.Token)8 Language (org.apache.clerezza.commons.rdf.Language)7 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)7