Search in sources :

Example 21 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class TextAnnotationsNewModelEngine method computeEnhancements.

/**
 * Computes the enhancements on the provided ContentItem.
 * <p>
 * Iterates over all resources typed as {@code ENHANCER_TEXTANNOTATION} in the
 * metadata of the ContentItem and, for those that do not yet define all of
 * {@code ENHANCER_SELECTION_PREFIX}, {@code ENHANCER_SELECTION_SUFFIX} and
 * {@code ENHANCER_SELECTED_TEXT}, derives the missing values from the plain
 * text content by using the {@code ENHANCER_START} / {@code ENHANCER_END}
 * offsets of the annotation.
 * <p>
 * Offsets that are missing or that do not fit the text (negative, beyond the
 * end of the text, or end before start) are logged and ignored, so a single
 * invalid TextAnnotation can not abort the processing of the ContentItem.
 * <p>
 * The metadata are only read while holding the read lock. All new triples
 * are collected first and then added in one step while holding the write lock.
 *
 * @param contentItem the ContentItem to process
 * @throws EngineException if the plain text Blob can not be read
 */
@Override
public void computeEnhancements(ContentItem contentItem) throws EngineException {
    Entry<IRI, Blob> textBlob = getBlob(contentItem, supportedMimeTypes);
    if (textBlob == null) {
        // no Blob with a supported mime type -> nothing to do
        return;
    }
    String language = EnhancementEngineHelper.getLanguage(contentItem);
    Language lang = language == null ? null : new Language(language);
    String text;
    try {
        text = ContentItemHelper.getText(textBlob.getValue());
    } catch (IOException e) {
        throw new EngineException(this, contentItem, "Unable to read Plain Text Blob", e);
    }
    Set<Triple> addedTriples = new HashSet<Triple>();
    Graph metadata = contentItem.getMetadata();
    // extract all the necessary information within a read lock
    contentItem.getLock().readLock().lock();
    try {
        Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
        while (it.hasNext()) {
            BlankNodeOrIRI ta = it.next().getSubject();
            boolean hasPrefix = metadata.filter(ta, ENHANCER_SELECTION_PREFIX, null).hasNext();
            boolean hasSuffix = metadata.filter(ta, ENHANCER_SELECTION_SUFFIX, null).hasNext();
            boolean hasSelected = metadata.filter(ta, ENHANCER_SELECTED_TEXT, null).hasNext();
            if (hasPrefix && hasSuffix && hasSelected) {
                // this TextAnnotation already uses the new model
                continue;
            }
            // start is only needed (and therefore only read) if the prefix is missing.
            // A null value marks the offset as "not usable".
            Integer start;
            if (!hasPrefix) {
                start = EnhancementEngineHelper.get(metadata, ta, ENHANCER_START, Integer.class, lf);
                if (start == null) {
                    log.debug("unable to add fise:selection-prefix to TextAnnotation {} " + "because fise:start is not present", ta);
                } else if (start < 0) {
                    log.warn("fise:start {} of TextAnnotation {} < 0! " + "Will not transform this TextAnnotation", start, ta);
                    // do not clamp to 0: values derived from a corrected offset
                    // would not match the fise:start stored for the annotation
                    start = null;
                } else if (start > text.length()) {
                    // would cause a StringIndexOutOfBoundsException in substring(..)
                    log.warn("fise:start {} of TextAnnotation {} > as the content length {}! " + "Will not transform this TextAnnotation", start, ta, text.length());
                    start = null;
                }
            } else {
                start = null;
            }
            // end is only needed (and therefore only read) if the suffix is missing
            Integer end;
            if (!hasSuffix) {
                end = EnhancementEngineHelper.get(metadata, ta, ENHANCER_END, Integer.class, lf);
                if (end == null) {
                    log.debug("unable to add fise:selection-suffix to TextAnnotation {} " + "because fise:end is not present", ta);
                } else if (end < 0) {
                    // would cause a StringIndexOutOfBoundsException in substring(..)
                    log.warn("fise:end {} of TextAnnotation {} < 0! " + "Will not transform this TextAnnotation", end, ta);
                    end = null;
                } else if (end > text.length()) {
                    log.warn("fise:end {} of TextAnnotation {} > as the content length {}! " + "Will not transform this TextAnnotation", end, ta, text.length());
                    end = null;
                } else if (start != null && end < start) {
                    log.warn("fise:end {} < fise:start {} of TextAnnotation {}! " + "Will not transform this TextAnnotation", end, start, ta);
                    end = null;
                    start = null;
                }
            } else {
                end = null;
            }
            if (!hasPrefix && start != null) {
                // up to prefixSuffixSize chars directly in front of the selection
                addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_PREFIX, new PlainLiteralImpl(text.substring(Math.max(0, start - prefixSuffixSize), start), lang)));
            }
            if (!hasSuffix && end != null) {
                // up to prefixSuffixSize chars directly after the selection
                addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_SUFFIX, new PlainLiteralImpl(text.substring(end, Math.min(text.length(), end + prefixSuffixSize)), lang)));
            }
            if (!hasSelected && start != null && end != null) {
                // This adds missing fise:selected or fise:head/fise:tail if the selected text is too long
                int length = end - start;
                if (length > 3 * prefixSuffixSize) {
                    // add head/tail (the first and last prefixSuffixSize chars of the selection)
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_HEAD, new PlainLiteralImpl(text.substring(start, start + prefixSuffixSize), lang)));
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_TAIL, new PlainLiteralImpl(text.substring(end - prefixSuffixSize, end), lang)));
                } else {
                    // add missing fise:selected
                    String selection = text.substring(start, end);
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(selection, lang)));
                    // check if we should also add a selection context
                    if (!metadata.filter(ta, ENHANCER_SELECTION_CONTEXT, null).hasNext()) {
                        addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(EnhancementEngineHelper.getSelectionContext(text, selection, start), lang)));
                    }
                }
            }
        }
    } finally {
        contentItem.getLock().readLock().unlock();
    }
    // finally write the prefix/suffix triples within a write lock
    if (!addedTriples.isEmpty()) {
        contentItem.getLock().writeLock().lock();
        try {
            metadata.addAll(addedTriples);
        } finally {
            contentItem.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) ContentItemHelper.getBlob(org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper.getBlob) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) IOException(java.io.IOException) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) HashSet(java.util.HashSet)

Example 22 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class Nif20MetadataEngine method computeEnhancements.

/**
 * Writes an RDF representation of the AnalysedText of the parsed ContentItem
 * to the metadata of the ContentItem.
 * <p>
 * One resource is written for the whole text (the context) and one for every
 * enclosed Sentence, Chunk and Token span. Depending on
 * {@code writePrevNext} and {@code writeHierary} the spans are in addition
 * linked with each other (previous/next sentence and word, first/last word
 * of a sentence, sentence/phrase a word is part of). All triples are added
 * while holding the write lock of the ContentItem.
 *
 * @param ci the ContentItem to process
 * @throws EngineException declared by the implemented interface
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    // now iterate over the AnalysedText data and create the RDF representation
    // TODO: make configurable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        // write the context
        IRI text = writeSpan(metadata, base, at, language, at);
        metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), ci.getUri()));
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        // the most recently written sentence, phrase and word
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        // true until the first Token of the current sentence was processed
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            // TODO: filter Spans based on additional requirements
            // (1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            // write the context
            metadata.add(new TripleImpl(current, Nif20.referenceContext.getUri(), text));
            // (2) add the relations between the different spans
            switch(span.getType()) {
                case Sentence:
                    if (sentence != null && writePrevNext) {
                        metadata.add(new TripleImpl(sentence, Nif20.nextSentence.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousSentence.getUri(), sentence));
                    }
                    // close the previous sentence. Only if there is one (Tokens
                    // in front of the first sentence would otherwise result in a
                    // triple with a null subject) and only if it contained a
                    // Token (otherwise 'word' belongs to an earlier sentence)
                    if (sentence != null && !firstWordInSentence) {
                        metadata.add(new TripleImpl(sentence, Nif20.lastWord.getUri(), word));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null && writeHierary) {
                        metadata.add(new TripleImpl(current, Nif20.superString.getUri(), sentence));
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        if (writeHierary) {
                            metadata.add(new TripleImpl(current, Nif20.sentence.getUri(), sentence));
                        }
                        // metadata.add(new TripleImpl(sentence, Nif20.word.getUri(), current));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(sentence, Nif20.firstWord.getUri(), current));
                            firstWordInSentence = false;
                        }
                    }
                    if (writeHierary && phrase != null && !phrase.equals(current)) {
                        metadata.add(new TripleImpl(current, Nif20.subString.getUri(), phrase));
                    }
                    if (word != null && writePrevNext) {
                        metadata.add(new TripleImpl(word, Nif20.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            // (3) add specific information such as POS, chunk type ...
            Nif20Helper.writePhrase(metadata, span, current);
            Nif20Helper.writePos(metadata, span, current);
            // TODO: sentiment support
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
        // within the loop a sentence is only closed when the next one starts,
        // so the last word of the final sentence needs to be written here
        if (sentence != null && !firstWordInSentence) {
            metadata.add(new TripleImpl(sentence, Nif20.lastWord.getUri(), word));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SpanTypeEnum(org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum) Span(org.apache.stanbol.enhancer.nlp.model.Span) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 23 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class KeywordLinkingEngine method writeEnhancements.

/**
 * Adds the enhancements for the parsed {@link LinkedEntity LinkedEntities}
 * to the metadata of the ContentItem: one TextAnnotation for every
 * occurrence of a linked entity and one EntityAnnotation for every
 * suggestion, with each EntityAnnotation related to all TextAnnotations of
 * the same linked entity.
 * @param ci the ContentItem the enhancements are written to
 * @param linkedEntities the linked entities to write
 * @param language the language of the content. If <code>null</code> or empty
 * the text literals are written without a language
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
    final Language contentLanguage = (language == null || language.isEmpty()) ? null : new Language(language);
    final Graph graph = ci.getMetadata();
    for (LinkedEntity entity : linkedEntities) {
        Collection<IRI> occurrenceAnnotations = new ArrayList<IRI>(entity.getOccurrences().size());
        // step 1: a TextAnnotation for each occurrence of the entity in the text
        for (Occurrence occ : entity.getOccurrences()) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            occurrenceAnnotations.add(ta);
            graph.add(new TripleImpl(ta, Properties.ENHANCER_START,
                    literalFactory.createTypedLiteral(occ.getStart())));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_END,
                    literalFactory.createTypedLiteral(occ.getEnd())));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_SELECTION_CONTEXT,
                    new PlainLiteralImpl(occ.getContext(), contentLanguage)));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_SELECTED_TEXT,
                    new PlainLiteralImpl(occ.getSelectedText(), contentLanguage)));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_CONFIDENCE,
                    literalFactory.createTypedLiteral(entity.getScore())));
            for (IRI type : entity.getTypes()) {
                graph.add(new TripleImpl(ta, Properties.DC_TYPE, type));
            }
        }
        // step 2: an EntityAnnotation for each suggestion
        for (Suggestion suggestion : entity.getSuggestions()) {
            IRI ea = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // NOTE: the label is the one selected by getBestLabel(..) for the
            // configured name field and the language of the content
            Text label = suggestion.getBestLabel(linkerConfig.getNameField(), language);
            String labelLanguage = label.getLanguage();
            PlainLiteralImpl labelLiteral;
            if (labelLanguage != null) {
                labelLiteral = new PlainLiteralImpl(label.getText(), new Language(labelLanguage));
            } else {
                labelLiteral = new PlainLiteralImpl(label.getText());
            }
            graph.add(new TripleImpl(ea, Properties.ENHANCER_ENTITY_LABEL, labelLiteral));
            IRI entityReference = new IRI(suggestion.getRepresentation().getId());
            graph.add(new TripleImpl(ea, Properties.ENHANCER_ENTITY_REFERENCE, entityReference));
            for (Iterator<Reference> types = suggestion.getRepresentation().getReferences(linkerConfig.getTypeField()); types.hasNext();) {
                IRI entityType = new IRI(types.next().getReference());
                graph.add(new TripleImpl(ea, Properties.ENHANCER_ENTITY_TYPE, entityType));
            }
            graph.add(new TripleImpl(ea, Properties.ENHANCER_CONFIDENCE,
                    literalFactory.createTypedLiteral(suggestion.getScore())));
            // relate the suggestion with all occurrences of the linked entity
            for (IRI ta : occurrenceAnnotations) {
                graph.add(new TripleImpl(ea, Properties.DC_RELATION, ta));
            }
            // the name of the ReferencedSite providing this suggestion
            IRI siteProperty = new IRI(RdfResourceEnum.site.getUri());
            graph.add(new TripleImpl(ea, siteProperty, new PlainLiteralImpl(referencedSiteName)));
            // optionally include the RDF data of the suggested entity
            if (dereferenceEntitiesState) {
                graph.addAll(RdfValueFactory.getInstance().toRdfRepresentation(suggestion.getRepresentation()).getRdfGraph());
            }
        }
    }
}
Also used : LinkedEntity(org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity) IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Reference(org.apache.stanbol.entityhub.servicesapi.model.Reference) ArrayList(java.util.ArrayList) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) Suggestion(org.apache.stanbol.enhancer.engines.keywordextraction.impl.Suggestion) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Occurrence(org.apache.stanbol.enhancer.engines.keywordextraction.impl.LinkedEntity.Occurrence)

Example 24 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class Suggestion method getBestLabel.

/**
 * Getter for the best label in the given language.
 * <p>
 * The matched label is used as default. If there is no matched label the
 * first label of the Entity is used instead. A label in the requested
 * language replaces this default; the search stops as soon as such a label
 * is (ignoring case) equal to the matched label.
 * @param nameField the field used to search for labels
 * @param language the language. If <code>null</code> no language preference
 * is applied and the default label is returned
 * @return the best match or {@link Suggestion#getMatchedLabel()} if none is
 * found. <code>null</code> if there is neither a matched label nor any label
 * for the parsed field
 */
public Literal getBestLabel(IRI nameField, String language) {
    Entity rep = getEntity();
    // start with the matched label -> so if we do not find a better one
    // we will use the matched!
    Literal matchedLabel = getMatchedLabel();
    Literal label = matchedLabel;
    // iterate over the labels of the Entity and prefer those in the
    // requested language
    Iterator<Literal> labels = rep.getText(nameField);
    boolean matchFound = false;
    while (labels.hasNext() && !matchFound) {
        Literal actLabel = labels.next();
        if (label == null) {
            // no matched label -> fall back to the first label of the Entity
            label = actLabel;
        }
        // now we have already a label check the language
        Language actLang = actLabel.getLanguage();
        // use startsWith to match also en-GB and en-US ...
        // (language == null needs to be checked as startsWith(null) would
        // throw a NullPointerException)
        if (language != null && actLang != null && actLang.toString().startsWith(language)) {
            // prefer labels with the correct language
            label = actLabel;
            if (matchedLabel != null && matchedLabel.getLexicalForm().equalsIgnoreCase(label.getLexicalForm())) {
                // found label in that language that exactly matches the
                // label used to match the text
                matchFound = true;
            }
        }
    }
    return label;
}
Also used : Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) Language(org.apache.clerezza.commons.rdf.Language) Literal(org.apache.clerezza.commons.rdf.Literal)

Example 25 with Language

use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.

the class DBPSpotlightDisambiguateEnhancementTest method initTest.

/**
 * Creates the ContentItem used by the tests: the test text with its
 * language set to "en" and a single fise:TextAnnotation selecting
 * "Angela Merkel", typed as a DBpedia person.
 */
@Before
public void initTest() throws IOException {
    // the ContentItem with the plain text content
    ci = ciFactory.createContentItem(new StringSource(TEST_TEXT));
    assertNotNull(ci);
    textContentPart = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    assertNotNull(textContentPart);

    // the language of the text
    ci.getMetadata().add(new TripleImpl(ci.getUri(), Properties.DC_LANGUAGE, new PlainLiteralImpl("en")));
    assertEquals("en", EnhancementEngineHelper.getLanguage(ci));

    // the fise:TextAnnotation needed to test disambiguation
    final LiteralFactory literals = LiteralFactory.getInstance();
    final String selected = "Angela Merkel";
    final Language english = new Language("en");
    final IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, new DBPSpotlightSpotEnhancementEngine());
    final Graph graph = ci.getMetadata();
    final int begin = TEST_TEXT.indexOf(selected);
    final int end = begin + selected.length();
    graph.add(new TripleImpl(annotation, Properties.ENHANCER_SELECTED_TEXT,
            new PlainLiteralImpl(selected, english)));
    graph.add(new TripleImpl(annotation, Properties.ENHANCER_SELECTION_CONTEXT,
            new PlainLiteralImpl(TEST_TEXT, english)));
    graph.add(new TripleImpl(annotation, Properties.ENHANCER_START,
            literals.createTypedLiteral(begin)));
    graph.add(new TripleImpl(annotation, Properties.ENHANCER_END,
            literals.createTypedLiteral(end)));
    graph.add(new TripleImpl(annotation, Properties.DC_TYPE, OntologicalClasses.DBPEDIA_PERSON));

    // test the test: the TextAnnotation created above needs to be valid
    EnhancementStructureHelper.validateAllTextAnnotations(graph, TEST_TEXT, null);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) DBPSpotlightSpotEnhancementEngine(org.apache.stanbol.enhancer.engines.dbpspotlight.spot.DBPSpotlightSpotEnhancementEngine) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) Before(org.junit.Before)

Aggregations

Language (org.apache.clerezza.commons.rdf.Language)32 IRI (org.apache.clerezza.commons.rdf.IRI)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)20 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)19 Graph (org.apache.clerezza.commons.rdf.Graph)17 Literal (org.apache.clerezza.commons.rdf.Literal)12 ArrayList (java.util.ArrayList)8 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)8 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)8 IOException (java.io.IOException)7 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)7 HashSet (java.util.HashSet)5 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)5 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)5 NlpEngineHelper.getLanguage (org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage)5 ByteArrayOutputStream (java.io.ByteArrayOutputStream)4 UnsupportedEncodingException (java.io.UnsupportedEncodingException)4 HashMap (java.util.HashMap)4 SOAPException (javax.xml.soap.SOAPException)4 Triple (org.apache.clerezza.commons.rdf.Triple)4