Search in sources :

Example 1 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class FstLinkingEngine method writeEnhancements.

/**
 * Writes the Enhancements for the parsed {@link Tag}s to the metadata of
 * the parsed ContentItem.
 * <p>
 * For every tag the metadata is searched for an existing TextAnnotation
 * with the same start and end offset. If one is found this engine is added
 * as contributor, otherwise a new TextAnnotation is created. After that an
 * EntityAnnotation is written for every suggestion of the tag and linked
 * to the TextAnnotation via dc:relation.
 *
 * @param ci the ContentItem whose metadata graph is extended
 * @param text the text used to compute the selection context of newly
 * created TextAnnotations
 * @param tags the tags (with their suggestions) to write
 * @param language the language used for the plain literals of newly
 * created TextAnnotations. If <code>null</code> or empty the literals are
 * created without a language.
 * @param writeRankings if <code>true</code> the ranking of a suggestion is
 * written whenever it is not <code>null</code>
 */
private void writeEnhancements(ContentItem ci, String text, Collection<Tag> tags, String language, boolean writeRankings) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    for (Tag tag : tags) {
        // first create the TextAnnotations for the Occurrences
        Literal startLiteral = literalFactory.createTypedLiteral(tag.getStart());
        Literal endLiteral = literalFactory.createTypedLiteral(tag.getEnd());
        // search for existing text annotation: candidates are all subjects
        // with the same start; they also need the same end and the
        // TextAnnotation type
        Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
        IRI textAnnotation = null;
        while (it.hasNext()) {
            Triple t = it.next();
            if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext() && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                textAnnotation = (IRI) t.getSubject();
                break;
            }
        }
        if (textAnnotation == null) {
            // not found ... create a new one
            textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, tag.getAnchor(), tag.getStart()), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(tag.getAnchor(), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(tag.getScore())));
        } else {
            // if existing add this engine as contributor
            metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
        }
        // add dc:types (even to existing)
        for (IRI dcType : getDcTypes(tag.getSuggestions())) {
            metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
        }
        // now the EntityAnnotations for the Suggestions
        for (Match match : tag.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // should we use the label used for the match, or search the
            // representation for the best label ... currently its the matched one
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, match.getMatchLabel()));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, new IRI(match.getUri())));
            for (IRI type : match.getTypes()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, type));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(match.getScore())));
            // add the relation to the fise:TextAnnotation (the tag)
            metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            // write origin information
            if (indexConfig.getOrigin() != null) {
                metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
            }
            if (writeRankings) {
                Double ranking = match.getRanking();
                if (ranking != null) {
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, literalFactory.createTypedLiteral(ranking)));
                }
            }
            // TODO: dereferencing - add the outgoing triples of entities that
            // were not yet dereferenced (not implemented)
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) ArrayList(java.util.ArrayList) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) Literal(org.apache.clerezza.commons.rdf.Literal) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 2 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class SentimentSummarizationEngine method computeEnhancements.

/**
 * Extracts the sentiment phrases from the {@link AnalysedText} of the
 * parsed ContentItem and writes the resulting enhancements while holding
 * the write lock of the ContentItem.
 *
 * @param ci the ContentItem to enhance
 * @throws EngineException declared by the engine contract
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final String nlpLanguage = NlpEngineHelper.getLanguage(this, ci, true);
    final AnalysedText analysedText = NlpEngineHelper.getAnalysedText(this, ci, true);
    // NOTE: a configuration of the processed span types (Chunk, Sentence,
    // TextSection, Text) based on the write* settings of this engine is
    // currently not active.
    final List<SentimentPhrase> phrases = extractSentiments(analysedText, nlpLanguage);
    final String detectedLang = EnhancementEngineHelper.getLanguage(ci);
    ci.getLock().writeLock().lock();
    try {
        // literals are written without a language if none was detected
        Language literalLanguage = null;
        if (detectedLang != null) {
            literalLanguage = new Language(detectedLang);
        }
        writeSentimentEnhancements(ci, phrases, analysedText, literalLanguage);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Language(org.apache.clerezza.commons.rdf.Language)

Example 3 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class RdfResourceUtils method getLiteralValues.

/**
 * Extracts the literal values for the given list of languages (<code>null</code>
 * is supported).
 * <p>
 * Multiple languages are supported by this method to allow parsing
 * <code>null</code> in addition to a language. This is often used by applications
 * to search for literals in a given language in addition to literals with no
 * defined language.
 * <p>
 * As a convenience this method adds literals with a language tag to the
 * front of the list and literals with no language tag to the end. Because
 * every literal with a language is inserted at index 0, those literals
 * appear in the reverse of their iteration order.
 * <p>
 * Calling this method with a bare <code>null</code> (which Java binds to a
 * <code>null</code> varargs array) is treated the same as parsing a single
 * <code>null</code> language.
 *
 * @param literals the iterator over the literals
 * @param languages the array of languages (<code>null</code> is supported,
 * both as element and as the array itself).
 * @return The collection with all the literal values.
 */
public static List<String> getLiteralValues(Iterator<Literal> literals, String... languages) {
    // getLiteralValues(it, null) results in a null array: handle this like
    // a single null language instead of failing with a NullPointerException
    String[] requested = languages == null ? new String[] { null } : languages;
    // permits null element!
    Set<Language> languageSet = new HashSet<Language>();
    for (String lang : requested) {
        // NOTE(review): relies on getLanguage(null) returning null so that
        // containsNull below becomes true - confirm against the helper
        languageSet.add(getLanguage(lang));
    }
    boolean containsNull = languageSet.contains(null);
    List<String> results = new ArrayList<String>();
    while (literals.hasNext()) {
        Literal act = literals.next();
        if (act.getLanguage() != null) {
            if (languageSet.contains(act.getLanguage())) {
                // add to front
                results.add(0, act.getLexicalForm());
            }
        } else if (containsNull) {
            // also add all typed literals, because they do not define a language!
            // append to the end
            results.add(act.getLexicalForm());
        }
    }
    return results;
}
Also used : Language(org.apache.clerezza.commons.rdf.Language) Literal(org.apache.clerezza.commons.rdf.Literal) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)

Example 4 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class DBPSpotlightDisambiguateEnhancementEngine method computeEnhancements.

/**
 * Calculate the enhancements by doing a POST request to the DBpedia
 * Spotlight endpoint and processing the results.
 * <p>
 * The request is built from the plain text content and the XML created
 * from the metadata of the ContentItem. If the request returns a result
 * the enhancements are created while holding the write lock of the
 * ContentItem. With debug logging enabled the metadata is additionally
 * serialized as RDF/XML and logged.
 *
 * @param ci
 *            the {@link ContentItem}
 * @throws EngineException declared by the engine contract
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Language language = SpotlightEngineUtils.getContentLanguage(ci);
    String text = SpotlightEngineUtils.getPlainContent(ci);
    // Retrieve the existing text annotations
    // NOTE(review): no read lock is acquired in this method; verify that
    // getSpottedXml(..) takes care of locking when reading the metadata
    Graph graph = ci.getMetadata();
    String xmlTextAnnotations = this.getSpottedXml(text, graph);
    Collection<Annotation> dbpslGraph = doPostRequest(text, xmlTextAnnotations, ci.getUri());
    if (dbpslGraph != null) {
        // Acquire a write lock on the ContentItem when adding the
        // enhancements
        ci.getLock().writeLock().lock();
        try {
            createEnhancements(dbpslGraph, ci, language);
            if (log.isDebugEnabled()) {
                Serializer serializer = Serializer.getInstance();
                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
                try {
                    log.debug("DBpedia Enhancements:\n{}", debugStream.toString("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    // report via the logger instead of printing to stderr;
                    // this only affects the debug output, not the enhancements
                    log.warn("Unable to decode the serialized enhancements as UTF-8 for debug logging", e);
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) UnsupportedEncodingException(java.io.UnsupportedEncodingException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Annotation(org.apache.stanbol.enhancer.engines.dbpspotlight.model.Annotation) Serializer(org.apache.clerezza.rdf.core.serializedform.Serializer)

Example 5 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class CeliSentimentAnalysisEngine method computeEnhancements.

/**
 * Sends the plain text of the parsed ContentItem to the CELI sentiment
 * analysis client and writes one TextAnnotation for every returned
 * {@link SentimentExpression}.
 * <p>
 * Start, end, selection context and polarity are only written for
 * expressions that define both a start and an end snippet position.
 *
 * @param ci the ContentItem to enhance
 * @throws EngineException if calling the service fails with an
 * {@link IOException} or a {@link SOAPException}
 * @throws IllegalStateException if no supported content part or no language
 * is available for the ContentItem
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final Entry<IRI, Blob> plainTextPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (plainTextPart == null) {
        String message = "No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!";
        throw new IllegalStateException(message);
    }
    final String text;
    try {
        text = ContentItemHelper.getText(plainTextPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().isEmpty()) {
        // nothing to analyse
        log.info("No text contained in ContentPart {" + plainTextPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }
    final String language = EnhancementEngineHelper.getLanguage(ci);
    if (language == null) {
        String message = "Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!";
        throw new IllegalStateException(message);
    }
    // language of the plain literals written to the TextAnnotations
    final Language literalLanguage = new Language(language);
    try {
        List<SentimentExpression> expressions = this.client.extractSentimentExpressions(text, language);
        LiteralFactory literalFactory = LiteralFactory.getInstance();
        Graph metadata = ci.getMetadata();
        for (SentimentExpression expression : expressions) {
            try {
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // the selected text uses the language extracted for the content
                metadata.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(expression.getSnippetStr(), literalLanguage)));
                metadata.add(new TripleImpl(textAnnotation, DC_TYPE, CeliConstants.SENTIMENT_EXPRESSION));
                if (expression.getStartSnippet() == null || expression.getEndSnippet() == null) {
                    // no position information: skip the position dependent triples
                    continue;
                }
                metadata.add(new TripleImpl(textAnnotation, ENHANCER_START,
                        literalFactory.createTypedLiteral(expression.getStartSnippet().intValue())));
                metadata.add(new TripleImpl(textAnnotation, ENHANCER_END,
                        literalFactory.createTypedLiteral(expression.getEndSnippet().intValue())));
                metadata.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(getSelectionContext(text, expression.getSnippetStr(), expression.getStartSnippet()), literalLanguage)));
                metadata.add(new TripleImpl(textAnnotation, CeliConstants.HAS_SENTIMENT_EXPRESSION_POLARITY,
                        literalFactory.createTypedLiteral(expression.getSentimentPolarityAsDoubleValue())));
            } catch (NoConvertorException e) {
                // a failed literal conversion only affects the current expression
                log.error(e.getMessage(), e);
            }
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Sentiment Analysis service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/response to the CELI Sentiment Analysis service!", e);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) NoConvertorException(org.apache.clerezza.rdf.core.NoConvertorException) SOAPException(javax.xml.soap.SOAPException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Aggregations

Language (org.apache.clerezza.commons.rdf.Language)32 IRI (org.apache.clerezza.commons.rdf.IRI)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)20 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)19 Graph (org.apache.clerezza.commons.rdf.Graph)17 Literal (org.apache.clerezza.commons.rdf.Literal)12 ArrayList (java.util.ArrayList)8 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)8 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)8 IOException (java.io.IOException)7 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)7 HashSet (java.util.HashSet)5 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)5 NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage)5 ByteArrayOutputStream (java.io.ByteArrayOutputStream)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 HashMap (java.util.HashMap)4 SOAPException (javax.xml.soap.SOAPException)4 Triple (org.apache.clerezza.commons.rdf.Triple)4