Search in sources:

Example 16 with Language

Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.

the class EntityLinkingEngine method writeEnhancements.

/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem. For every {@link Occurrence} a
 * fise:TextAnnotation is created (or an existing one with the same
 * start/end offsets is reused) and for every {@link Suggestion} a
 * fise:EntityAnnotation linked to those TextAnnotations is added.
 * @param ci the ContentItem whose metadata graph receives the enhancements
 * @param linkedEntities the entities linked while processing the content
 * @param language the language of the analysed text; may be null or empty
 * @param writeRankings if true, entity rankings are written as
 *        fise:entity-ranking triples for the suggestions
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language, boolean writeRankings) {
    // Clerezza Language used for PlainLiterals (null if no language is known)
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    // tracks entity URIs whose RDF data was already dereferenced, so each
    // entity's data is added to the metadata graph at most once
    Set<IRI> dereferencedEntities = new HashSet<IRI>();
    Graph metadata = ci.getMetadata();
    for (LinkedEntity linkedEntity : linkedEntities) {
        Collection<IRI> textAnnotations = new ArrayList<IRI>(linkedEntity.getOccurrences().size());
        // first create the TextAnnotations for the Occurrences
        for (Occurrence occurrence : linkedEntity.getOccurrences()) {
            Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
            Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
            // search for an existing TextAnnotation with the same start/end offsets
            Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
            IRI textAnnotation = null;
            while (it.hasNext()) {
                Triple t = it.next();
                if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                    textAnnotation = (IRI) t.getSubject();
                    break;
                }
            }
            if (textAnnotation == null) {
                // not found ... create a new one
                textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.getContext(), languageObject)));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occurrence.getSelectedText(), languageObject)));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(linkedEntity.getScore())));
            } else {
                // if existing add this engine as contributor
                metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
            }
            // add dc:types (even to existing)
            for (IRI dcType : linkedEntity.getTypes()) {
                metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
            }
            textAnnotations.add(textAnnotation);
        }
        // now the EntityAnnotations for the Suggestions
        for (Suggestion suggestion : linkedEntity.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // should we use the label used for the match, or search the
            // representation for the best label ... currently its the matched one
            Literal label = suggestion.getBestLabel(linkerConfig.getNameField(), language);
            Entity entity = suggestion.getEntity();
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, label));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entity.getUri()));
            Iterator<IRI> suggestionTypes = entity.getReferences(linkerConfig.getTypeField());
            while (suggestionTypes.hasNext()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, suggestionTypes.next()));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
            // relate this EntityAnnotation to all TextAnnotations of the occurrences
            for (IRI textAnnotation : textAnnotations) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            }
            // add origin information of the EntitySearcher
            for (Entry<IRI, Collection<RDFTerm>> originInfo : entitySearcher.getOriginInformation().entrySet()) {
                for (RDFTerm value : originInfo.getValue()) {
                    metadata.add(new TripleImpl(entityAnnotation, originInfo.getKey(), value));
                }
            }
            if (writeRankings) {
                Float ranking = suggestion.getEntity().getEntityRanking();
                if (ranking != null) {
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, // write the float as double
                    new TypedLiteralImpl(ranking.toString(), XSD_DOUBLE)));
                }
            }
            // add the RDF data for entities (only once per entity URI)
            if (linkerConfig.isDereferenceEntitiesEnabled() && dereferencedEntities.add(entity.getUri())) {
                // NOTE: do not add all triples as there might be other data in the graph
                Iterator<Triple> triples = entity.getData().filter(entity.getUri(), null, null);
                while (triples.hasNext()) {
                    metadata.add(triples.next());
                }
            }
        }
    }
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) IRI(org.apache.clerezza.commons.rdf.IRI) LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TypedLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.TypedLiteralImpl) Triple(org.apache.clerezza.commons.rdf.Triple) Suggestion(org.apache.stanbol.enhancer.engines.entitylinking.impl.Suggestion) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) Collection(java.util.Collection) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Occurrence(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity.Occurrence) HashSet(java.util.HashSet)

Example 17 with Language

Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.

the class DBPSpotlightCandidatesEnhancementEngine method computeEnhancements.

/**
 * Calculate the enhancements by doing a POST request to the DBpedia
 * Spotlight endpoint and processing the results.
 *
 * @param ci
 *            the {@link ContentItem}
 * @throws EngineException
 *            if the request to the DBpedia Spotlight service fails
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Language language = SpotlightEngineUtils.getContentLanguage(ci);
    String text = SpotlightEngineUtils.getPlainContent(ci);
    Collection<SurfaceForm> dbpslGraph = doPostRequest(text, ci.getUri());
    // a null result indicates that no enhancements need to be written
    if (dbpslGraph != null) {
        // Acquire a write lock on the ContentItem when adding the
        // enhancements
        ci.getLock().writeLock().lock();
        try {
            createEnhancements(dbpslGraph, ci, text, language);
            if (log.isDebugEnabled()) {
                Serializer serializer = Serializer.getInstance();
                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
                try {
                    log.debug("DBpedia Spotlight Spot Enhancements:\n{}", debugStream.toString("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    // UTF-8 is guaranteed to be supported by the JVM; log via
                    // the engine logger instead of printing to stderr
                    log.warn("Unable to encode debug output as UTF-8", e);
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : Language(org.apache.clerezza.commons.rdf.Language) SurfaceForm(org.apache.stanbol.enhancer.engines.dbpspotlight.model.SurfaceForm) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Serializer(org.apache.clerezza.rdf.core.serializedform.Serializer)

Example 18 with Language

Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.

the class CeliLemmatizerEnhancementEngine method addLemmatizationEnhancement.

/**
 * Calls the remote CELI lemmatizer service for the parsed text and writes
 * the lemmatized form as a fise:TextAnnotation with a hasLemmaForm literal.
 *
 * @param ci the ContentItem the enhancement is written to
 * @param text the plain text content to lemmatize
 * @param language the language of the text (also used as literal language tag)
 * @param g the graph the enhancement triples are added to
 * @throws EngineException if the call to the CELI service fails
 */
private void addLemmatizationEnhancement(ContentItem ci, String text, String language, Graph g) throws EngineException {
    // clerezza language for PlainLiterals
    Language lang = new Language(language);
    String lemmatizedContents;
    try {
        lemmatizedContents = this.client.lemmatizeContents(text, language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        // fixed typo in the error message ("wile" -> "while")
        throw new EngineException("Error while encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    // get a write lock before writing the enhancements
    ci.getLock().writeLock().lock();
    try {
        IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
        g.add(new TripleImpl(textEnhancement, CeliLemmatizerEnhancementEngine.hasLemmaForm, new PlainLiteralImpl(lemmatizedContents, lang)));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) SOAPException(javax.xml.soap.SOAPException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 19 with Language

Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.

the class CeliLemmatizerEnhancementEngine method addMorphoAnalysisEnhancement.

/**
 * Calls the remote CELI morphological analyzer for the parsed text and
 * writes one fise:TextAnnotation per interpretation of each analyzed term,
 * including the selected text, offsets, selection context and all
 * morphological features.
 *
 * @param ci the ContentItem the enhancements are written to
 * @param text the plain text content to analyze
 * @param language the language of the text (also used as literal language tag)
 * @param g the graph the enhancement triples are added to
 * @throws EngineException if the call to the CELI service fails
 */
private void addMorphoAnalysisEnhancement(ContentItem ci, String text, String language, Graph g) throws EngineException {
    // clerezza language for PlainLiterals
    Language lang = new Language(language);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(text, language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        // fixed typo in the error message ("wile" -> "while")
        throw new EngineException("Error while encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
    }
    // get a write lock before writing the enhancements
    ci.getLock().writeLock().lock();
    try {
        LiteralFactory literalFactory = LiteralFactory.getInstance();
        for (LexicalEntry le : terms) {
            List<CeliMorphoFeatures> mFeatures = this.convertLexicalEntryToMorphFeatures(le, language);
            for (CeliMorphoFeatures feat : mFeatures) {
                // Create a text annotation for each interpretation produced by the morphological analyzer
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(le.getWordForm(), lang)));
                // only write offsets and context if the term was located in the text
                if (le.from >= 0 && le.to > 0) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(le.from)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(le.to)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, le.getWordForm(), le.from), lang)));
                }
                g.addAll(feat.featuresAsTriples(textAnnotation, lang));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : CeliMorphoFeatures(org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures) IRI(org.apache.clerezza.commons.rdf.IRI) Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) SOAPException(javax.xml.soap.SOAPException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory)

Example 20 with Language

Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.

the class RestfulNlpAnalysisEngine method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This implementation sends the plain text content to the configured
 * RESTful NLP Analysis Service and stores the parsed analysis results in
 * the {@link AnalysedText} content part. If {@code writeTextAnnotations}
 * is enabled it additionally creates fise:TextAnnotations for Named
 * Entities and Sentiment annotations found in the analysis results.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    // validate that the service is active
    checkRESTfulNlpAnalysisService();
    // get/create the AnalysedText
    final AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
    final Blob blob = at.getBlob();
    // send the text to the server
    final String language = getLanguage(this, ci, true);
    final HttpPost request = new HttpPost(analysisServiceUrl);
    request.addHeader(HttpHeaders.CONTENT_LANGUAGE, language);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    // execute the request
    try {
        AccessController.doPrivileged(new PrivilegedExceptionAction<AnalysedText>() {

            public AnalysedText run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new AnalysisResponseHandler(at));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        // NOTE: ClientProtocolException extends IOException, so a single
        // instanceof check covers both (previously duplicated branches)
        if (e instanceof IOException) {
            // force re-initialisation upon error
            setRESTfulNlpAnalysisServiceUnavailable();
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful NLP Analysis Service at " + analysisServiceUrl, e);
        } else {
            // run() only declares checked ClientProtocolException/IOException,
            // so anything else must be a RuntimeException
            throw RuntimeException.class.cast(e);
        }
    }
    if (writeTextAnnotations) {
        // if enabled fise:TextAnnotations are created for Named Entities and Sentiments
        double positiveSent = 0.0;
        int positiveCount = 0;
        double negativeSent = 0.0;
        int negativeCount = 0;
        int sentimentCount = 0;
        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
        // the sentence containing the currently processed span (used as
        // selection context for TextAnnotations)
        Sentence context = null;
        Graph metadata = ci.getMetadata();
        Language lang = new Language(language);
        LiteralFactory lf = LiteralFactory.getInstance();
        ci.getLock().writeLock().lock();
        try {
            // write TextAnnotations for Named Entities
            while (spans.hasNext()) {
                Span span = spans.next();
                switch(span.getType()) {
                    case Sentence:
                        context = (Sentence) span;
                    // FALLThrough intended!!
                    default:
                        Value<NerTag> nerAnno = span.getAnnotation(NER_ANNOTATION);
                        if (nerAnno != null) {
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            // add span related data
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(span.getSpan(), lang)));
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(context == null ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart()) : context.getSpan(), lang)));
                            // add the NER type
                            if (nerAnno.value().getType() != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, nerAnno.value().getType()));
                            }
                            if (nerAnno.probability() >= 0) {
                                metadata.add(new TripleImpl(ta, ENHANCER_CONFIDENCE, lf.createTypedLiteral(nerAnno.probability())));
                            }
                        }
                        Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                        if (sentimentAnnotation != null) {
                            // this span has a sentiment assigned
                            Double sentiment = sentimentAnnotation.value();
                            // Create a fise:TextAnnotation for the sentiment
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment)));
                            // add the generic dc:type used for all Sentiment annotation
                            metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                            // determine the specific dc:type for the sentiment annotation
                            IRI ssoType = NIFHelper.SPAN_TYPE_TO_SSO_TYPE.get(span.getType());
                            if (ssoType != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, ssoType));
                            }
                            // keep statistics for the overall sentiment for the Document
                            // (neutral sentiments count towards sentimentCount only)
                            sentimentCount++;
                            if (sentiment > 0) {
                                positiveSent += sentiment;
                                positiveCount++;
                            } else if (sentiment < 0) {
                                negativeSent += sentiment;
                                negativeCount++;
                            }
                        }
                        break;
                }
            }
            // Add the annotation for the overall sentiment of the document
            if (sentimentCount > 0) {
                IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // calculate the average sentiment for a document
                // TODO: Think on a better way to calculate a general sentiment value for a document.
                metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY, lf.createTypedLiteral((positiveSent + negativeSent) / sentimentCount)));
                if (positiveCount > 0) {
                    // average positive sentiment calculation for the document
                    metadata.add(new TripleImpl(ta, POSITIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(positiveSent / positiveCount)));
                }
                if (negativeCount > 0) {
                    // average negative sentiment calculation for the document
                    metadata.add(new TripleImpl(ta, NEGATIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(negativeSent / negativeCount)));
                }
                metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                metadata.add(new TripleImpl(ta, DC_TYPE, DOCUMENT_SENTIMENT_TYPE));
            }
        // no sentiment annotation present ... nothing to do
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
// else do not write fise:TextAnnotations
}
Also used : HttpPost(org.apache.http.client.methods.HttpPost) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IRI(org.apache.clerezza.commons.rdf.IRI) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Span(org.apache.stanbol.enhancer.nlp.model.Span) ClientProtocolException(org.apache.http.client.ClientProtocolException) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PrivilegedActionException(java.security.PrivilegedActionException) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) IOException(java.io.IOException) HttpException(org.apache.http.HttpException) ClientProtocolException(org.apache.http.client.ClientProtocolException) PrivilegedActionException(java.security.PrivilegedActionException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) URISyntaxException(java.net.URISyntaxException) ConfigurationException(org.osgi.service.cm.ConfigurationException) HttpResponseException(org.apache.http.client.HttpResponseException) InputStreamEntity(org.apache.http.entity.InputStreamEntity) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) Graph(org.apache.clerezza.commons.rdf.Graph)

Aggregations

Language (org.apache.clerezza.commons.rdf.Language)32 IRI (org.apache.clerezza.commons.rdf.IRI)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)20 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)19 Graph (org.apache.clerezza.commons.rdf.Graph)17 Literal (org.apache.clerezza.commons.rdf.Literal)12 ArrayList (java.util.ArrayList)8 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)8 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)8 IOException (java.io.IOException)7 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)7 HashSet (java.util.HashSet)5 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)5 NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage)5 ByteArrayOutputStream (java.io.ByteArrayOutputStream)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 HashMap (java.util.HashMap)4 SOAPException (javax.xml.soap.SOAPException)4 Triple (org.apache.clerezza.commons.rdf.Triple)4