
Example 16 with TripleImpl

Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

The class NEREngineCore, method findNamedEntities.

protected void findNamedEntities(final ContentItem ci, final AnalysedText at, final String text, final String lang, final TokenNameFinderModel nameFinderModel) {
    if (ci == null) {
        throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
    }
    if (at == null && text == null) {
        log.warn("NULL was parsed as AnalysedText AND Text for content item " + ci.getUri() + ". One of the two MUST BE present! -> call ignored");
        return;
    }
    final Language language;
    if (lang != null && !lang.isEmpty()) {
        language = new Language(lang);
    } else {
        language = null;
    }
    if (log.isDebugEnabled()) {
        log.debug("findNamedEntities model={},  language={}, text=", new Object[] { nameFinderModel, language, StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
    }
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Graph g = ci.getMetadata();
    Map<String, List<NameOccurrence>> entityNames;
    if (at != null) {
        entityNames = extractNameOccurrences(nameFinderModel, at, lang);
    } else {
        entityNames = extractNameOccurrences(nameFinderModel, text, lang);
    }
    // lock the ContentItem while writing the RDF data for found Named Entities
    ci.getLock().writeLock().lock();
    try {
        Map<String, IRI> previousAnnotations = new LinkedHashMap<String, IRI>();
        for (Map.Entry<String, List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
            String name = nameInContext.getKey();
            List<NameOccurrence> occurrences = nameInContext.getValue();
            IRI firstOccurrenceAnnotation = null;
            for (NameOccurrence occurrence : occurrences) {
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(name, language)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.context, language)));
                if (occurrence.type != null) {
                    g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
                }
                if (occurrence.confidence != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(occurrence.confidence)));
                }
                if (occurrence.start != null && occurrence.end != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(occurrence.start)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(occurrence.end)));
                }
                // add the subsumption relationship among occurrences of the same name
                if (firstOccurrenceAnnotation == null) {
                    // check already extracted annotations to find a first, most specific occurrence
                    for (Map.Entry<String, IRI> entry : previousAnnotations.entrySet()) {
                        if (entry.getKey().contains(name)) {
                            // we have found a most specific previous
                            // occurrence, use it as subsumption target
                            firstOccurrenceAnnotation = entry.getValue();
                            g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                            break;
                        }
                    }
                    if (firstOccurrenceAnnotation == null) {
                        // no most specific previous occurrence, I am the first,
                        // most specific occurrence to be later used as a target
                        firstOccurrenceAnnotation = textAnnotation;
                        previousAnnotations.put(name, textAnnotation);
                    }
                } else {
                    // I am referring to a most specific first occurrence of the
                    // same name
                    g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                }
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI (org.apache.clerezza.commons.rdf.IRI), PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl), LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory), LinkedHashMap (java.util.LinkedHashMap), Graph (org.apache.clerezza.commons.rdf.Graph), Language (org.apache.clerezza.commons.rdf.Language), List (java.util.List), ArrayList (java.util.ArrayList), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl), Map (java.util.Map), HashMap (java.util.HashMap)
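
A note on the subsumption logic above: a shorter, later mention of a name (e.g. "Obama") is linked via dc:relation to the first, most specific occurrence (e.g. "Barack Obama") already registered in the LinkedHashMap. A minimal standalone sketch of just that lookup, using plain Strings in place of IRIs; the names and the URN are made up for illustration:

import java.util.LinkedHashMap;
import java.util.Map;

public class SubsumptionSketch {

    public static void main(String[] args) {
        // names are registered in extraction order; the most specific
        // (longest) form is typically seen first
        Map<String, String> previousAnnotations = new LinkedHashMap<String, String>();
        previousAnnotations.put("Barack Obama", "urn:annotation:1");
        // a later, shorter mention is subsumed by the first full mention
        // because "Barack Obama".contains("Obama") is true
        String name = "Obama";
        String subsumptionTarget = null;
        for (Map.Entry<String, String> entry : previousAnnotations.entrySet()) {
            if (entry.getKey().contains(name)) {
                subsumptionTarget = entry.getValue();
                break;
            }
        }
        // prints "urn:annotation:1"; the engine would add a dc:relation
        // triple from the new TextAnnotation to this target
        System.out.println(subsumptionTarget);
    }
}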

Example 17 with TripleImpl

Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

The class TestMetaxaCore, method printTriples.

/**
 * This prints out the Stanbol Enhancer triples that would be created for the metadata
 * contained in the given model.
 *
 * @param m a {@link Model}
 *
 * @return an {@code int} with the number of added triples
 */
private int printTriples(Model m) {
    int tripleCounter = 0;
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    ClosableIterator<Statement> it = m.iterator();
    while (it.hasNext()) {
        Statement oneStmt = it.next();
        BlankNodeOrIRI subject = (BlankNodeOrIRI) MetaxaEngine.asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
        IRI predicate = (IRI) MetaxaEngine.asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
        RDFTerm object = MetaxaEngine.asClerezzaResource(oneStmt.getObject(), blankNodeMap);
        if (null != subject && null != predicate && null != object) {
            Triple t = new TripleImpl(subject, predicate, object);
            LOG.debug("adding " + t);
            tripleCounter++;
        } else {
            LOG.debug("skipped " + oneStmt.toString());
        }
    }
    it.close();
    return tripleCounter;
}
Also used : Triple (org.apache.clerezza.commons.rdf.Triple), IRI (org.apache.clerezza.commons.rdf.IRI), BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI), HashMap (java.util.HashMap), Statement (org.ontoware.rdf2go.model.Statement), BlankNode (org.apache.clerezza.commons.rdf.BlankNode), BlankNode (org.ontoware.rdf2go.model.node.BlankNode), RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)
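
The conversion above comes down to the three-argument TripleImpl constructor. A minimal standalone sketch of that constructor, with made-up example IRIs: the subject must be a BlankNodeOrIRI, the predicate an IRI, and the object may be any RDFTerm, here a plain literal:

import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;

public class TripleImplSketch {

    public static void main(String[] args) {
        IRI subject = new IRI("http://example.org/doc1");
        IRI predicate = new IRI("http://purl.org/dc/terms/title");
        // the object is a plain (untyped) literal without a language tag
        Triple t = new TripleImpl(subject, predicate, new PlainLiteralImpl("Example title"));
        System.out.println(t.getSubject() + " " + t.getPredicate() + " " + t.getObject());
    }
}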

Example 18 with TripleImpl

Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

The class RestfulLangidentEngine, method computeEnhancements.

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method sends the text/plain content to the configured RESTful language
 * identification service and, for each detected language, writes a TextAnnotation
 * with dc:language, dc:type and (when available) a confidence value to the
 * metadata of the content item.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    // get the plain text Blob
    Map.Entry<IRI, Blob> textBlob = getPlainText(this, ci, false);
    Blob blob = textBlob.getValue();
    // send the text to the server
    final HttpPost request = new HttpPost(serviceUrl);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    // execute the request
    List<LangSuggestion> detected;
    try {
        detected = AccessController.doPrivileged(new PrivilegedExceptionAction<List<LangSuggestion>>() {

            public List<LangSuggestion> run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new LangIdentResponseHandler(ci, objectMapper));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof ClientProtocolException || e instanceof IOException) {
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
        } else {
            throw RuntimeException.class.cast(e);
        }
    }
    Graph metadata = ci.getMetadata();
    log.debug("Detected Languages for ContentItem {} and Blob {}");
    ci.getLock().writeLock().lock();
    try {
        // write TextAnnotations for the detected languages
        for (LangSuggestion suggestion : detected) {
            // add a hypothesis
            log.debug(" > {}@{}", suggestion.getLanguage(), suggestion.hasProbability() ? suggestion.getProbability() : "-,--");
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(suggestion.getLanguage())));
            metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            if (suggestion.hasProbability()) {
                metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getProbability())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI (org.apache.clerezza.commons.rdf.IRI), HttpPost (org.apache.http.client.methods.HttpPost), Blob (org.apache.stanbol.enhancer.servicesapi.Blob), PrivilegedActionException (java.security.PrivilegedActionException), PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl), EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException), PrivilegedExceptionAction (java.security.PrivilegedExceptionAction), IOException (java.io.IOException), URISyntaxException (java.net.URISyntaxException), ConfigurationException (org.osgi.service.cm.ConfigurationException), HttpException (org.apache.http.HttpException), ClientProtocolException (org.apache.http.client.ClientProtocolException), HttpResponseException (org.apache.http.client.HttpResponseException), InputStreamEntity (org.apache.http.entity.InputStreamEntity), Graph (org.apache.clerezza.commons.rdf.Graph), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl), Map (java.util.Map), HashMap (java.util.HashMap)
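
Once the engine has written its TextAnnotations, a client can read the detected languages back from the metadata graph. A minimal sketch, assuming the usual Stanbol Properties constants and that the caller holds the read lock of the ContentItem; null arguments to Graph.filter act as wildcards:

import java.util.Iterator;

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.Triple;

import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;

public class DetectedLanguageReader {

    /**
     * Prints the lexical form of every dc:language value in the graph.
     */
    public static void printDetectedLanguages(Graph metadata) {
        Iterator<Triple> it = metadata.filter(null, DC_LANGUAGE, null);
        while (it.hasNext()) {
            Triple t = it.next();
            if (t.getObject() instanceof Literal) {
                System.out.println(((Literal) t.getObject()).getLexicalForm());
            }
        }
    }
}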

Example 19 with TripleImpl

Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

The class Nif20MetadataEngine, method writeSpan.

/**
 * Writes basic information of the parsed span by using NIF 2.0, including the
 * {@link SsoOntology} Sentence/Phrase/Word type based on
 * the {@link Span#getType()}<p>
 * As {@link AnalysedText} is based on the plain text version of the ContentItem
 * this uses the {@link StringOntology#OffsetBasedString} notation.<p>
 * <i>NOTE:</i> This DOES NOT write string relations, lemma, pos ... information
 * that might be stored as {@link Annotation} with the parsed {@link Span}.
 * @param graph the graph to add the triples
 * @param base the base URI
 * @param text the {@link AnalysedText}
 * @param language the {@link Language} or <code>null</code> if not known
 * @param span the {@link Span} to write.
 * @return the {@link IRI} representing the parsed {@link Span} in the
 * graph
 */
public IRI writeSpan(Graph graph, IRI base, AnalysedText text, Language language, Span span) {
    IRI segment = Nif20Helper.getNifRFC5147URI(base, span.getStart(), span.getType() == SpanTypeEnum.Text ? -1 : span.getEnd());
    if (!contextOnlyUriScheme || span.getType() == SpanTypeEnum.Text) {
        graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.RFC5147String.getUri()));
    }
    if (writeSelectors) {
        if (span.getEnd() - span.getStart() < 100) {
            graph.add(new TripleImpl(segment, Nif20.anchorOf.getUri(), new PlainLiteralImpl(span.getSpan(), language)));
        } else {
            graph.add(new TripleImpl(segment, Nif20.head.getUri(), new PlainLiteralImpl(span.getSpan().substring(0, 10), language)));
        }
        graph.add(new TripleImpl(segment, Nif20.beginIndex.getUri(), lf.createTypedLiteral(span.getStart())));
        graph.add(new TripleImpl(segment, Nif20.endIndex.getUri(), lf.createTypedLiteral(span.getEnd())));
        String content = text.getSpan();
        if (span.getType() != SpanTypeEnum.Text) {
            // prefix and suffix
            int prefixStart = Math.max(0, span.getStart() - DEFAULT_PREFIX_SUFFIX_LENGTH);
            graph.add(new TripleImpl(segment, Nif20.before.getUri(), new PlainLiteralImpl(content.substring(prefixStart, span.getStart()), language)));
            int suffixEnd = Math.min(span.getEnd() + DEFAULT_PREFIX_SUFFIX_LENGTH, text.getEnd());
            graph.add(new TripleImpl(segment, Nif20.after.getUri(), new PlainLiteralImpl(content.substring(span.getEnd(), suffixEnd), language)));
        }
    }
    if (writeStringType) {
        graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.String.getUri()));
    }
    switch(span.getType()) {
        case Token:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Word.getUri()));
            break;
        case Chunk:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Phrase.getUri()));
            break;
        case Sentence:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Sentence.getUri()));
            break;
        case Text:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Context.getUri()));
            break;
        default:
            if (!writeStringType) {
                graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.String.getUri()));
            }
    }
    return segment;
}
Also used : IRI (org.apache.clerezza.commons.rdf.IRI), PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)
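
The segment IRIs produced by Nif20Helper.getNifRFC5147URI are character-offset fragment URIs in the RFC 5147 style, e.g. http://example.org/doc#char=5,12. A sketch of how such an IRI might be assembled; this illustrates the fragment scheme rather than the actual helper implementation, and the handling of the open-ended whole-text case (the -1 passed for SpanTypeEnum.Text above) is an assumption:

import org.apache.clerezza.commons.rdf.IRI;

public class Rfc5147UriSketch {

    public static IRI offsetUri(IRI base, int start, int end) {
        StringBuilder sb = new StringBuilder(base.getUnicodeString());
        sb.append("#char=").append(start).append(',');
        if (end >= 0) {
            // a negative end leaves the fragment open-ended (whole text)
            sb.append(end);
        }
        return new IRI(sb.toString());
    }

    public static void main(String[] args) {
        // prints http://example.org/doc#char=5,12
        System.out.println(offsetUri(new IRI("http://example.org/doc"), 5, 12));
    }
}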

Example 20 with TripleImpl

Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

The class Nif20Helper, method writePos.

/**
 * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 2.0 to the parsed
 * RDF graph by using the parsed segmentUri as subject
 * @param graph the graph
 * @param annotated the annotated element (e.g. a {@link Token})
 * @param segmentUri the URI of the resource representing the parsed
 * annotated element in the graph
 */
public static void writePos(Graph graph, Annotated annotated, IRI segmentUri) {
    Value<PosTag> posTag = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
    if (posTag != null) {
        if (posTag.value().isMapped()) {
            for (Pos pos : posTag.value().getPos()) {
                graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), pos.getUri()));
            }
            for (LexicalCategory cat : posTag.value().getCategories()) {
                graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), cat.getUri()));
            }
        }
        graph.add(new TripleImpl(segmentUri, Nif20.posTag.getUri(), lf.createTypedLiteral(posTag.value().getTag())));
        // set the oliaConf
        // remove existing conf values (e.g. for a single word phrase)
        setOliaConf(graph, segmentUri, posTag);
    }
}
Also used : PosTag (org.apache.stanbol.enhancer.nlp.pos.PosTag), Pos (org.apache.stanbol.enhancer.nlp.pos.Pos), TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl), LexicalCategory (org.apache.stanbol.enhancer.nlp.pos.LexicalCategory)
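
For a token tagged as a common noun, writePos ends up adding triples of roughly the following shape. A sketch that writes them directly with TripleImpl into a SimpleGraph; the segment IRI and the OLiA class are made-up examples, and the nif-core property IRIs are spelled out literally here instead of going through the Nif20 enum:

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
import org.apache.clerezza.rdf.core.LiteralFactory;

public class WritePosOutputSketch {

    private static final String NIF = "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#";

    public static void main(String[] args) {
        LiteralFactory lf = LiteralFactory.getInstance();
        Graph graph = new SimpleGraph();
        IRI segmentUri = new IRI("http://example.org/doc#char=0,4");
        // one nif:oliaCategory triple per mapped OLiA class ...
        graph.add(new TripleImpl(segmentUri, new IRI(NIF + "oliaCategory"),
                new IRI("http://purl.org/olia/olia.owl#CommonNoun")));
        // ... plus the raw tag string as a typed literal via nif:posTag
        graph.add(new TripleImpl(segmentUri, new IRI(NIF + "posTag"),
                lf.createTypedLiteral("NN")));
        System.out.println(graph.size() + " triples written");
    }
}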

Aggregations

TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 143
IRI (org.apache.clerezza.commons.rdf.IRI): 104
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 69
Graph (org.apache.clerezza.commons.rdf.Graph): 66
BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI): 49
Triple (org.apache.clerezza.commons.rdf.Triple): 41
RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm): 26
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 23
HashMap (java.util.HashMap): 20
Language (org.apache.clerezza.commons.rdf.Language): 20
Literal (org.apache.clerezza.commons.rdf.Literal): 20
LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory): 20
IOException (java.io.IOException): 18
SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph): 17
Test (org.junit.Test): 16
ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem): 15
IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph): 14
HashSet (java.util.HashSet): 13
StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource): 13
BlankNode (org.apache.clerezza.commons.rdf.BlankNode): 11