Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.
In class NEREngineCore, the method findNamedEntities:
protected void findNamedEntities(final ContentItem ci, final AnalysedText at, final String text, final String lang, final TokenNameFinderModel nameFinderModel) {
    if (ci == null) {
        throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
    }
    if (at == null && text == null) {
        log.warn("NULL was parsed as AnalysedText AND Text for content item " + ci.getUri() + ". One of the two MUST BE present! -> call ignored");
        return;
    }
    final Language language;
    if (lang != null && !lang.isEmpty()) {
        language = new Language(lang);
    } else {
        language = null;
    }
    if (log.isDebugEnabled()) {
        log.debug("findNamedEntities model={}, language={}, text={}", new Object[] { nameFinderModel, language, StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
    }
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Graph g = ci.getMetadata();
    Map<String, List<NameOccurrence>> entityNames;
    if (at != null) {
        entityNames = extractNameOccurrences(nameFinderModel, at, lang);
    } else {
        entityNames = extractNameOccurrences(nameFinderModel, text, lang);
    }
    // lock the ContentItem while writing the RDF data for found Named Entities
    ci.getLock().writeLock().lock();
    try {
        Map<String, IRI> previousAnnotations = new LinkedHashMap<String, IRI>();
        for (Map.Entry<String, List<NameOccurrence>> nameInContext : entityNames.entrySet()) {
            String name = nameInContext.getKey();
            List<NameOccurrence> occurrences = nameInContext.getValue();
            IRI firstOccurrenceAnnotation = null;
            for (NameOccurrence occurrence : occurrences) {
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(name, language)));
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.context, language)));
                if (occurrence.type != null) {
                    g.add(new TripleImpl(textAnnotation, DC_TYPE, occurrence.type));
                }
                if (occurrence.confidence != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(occurrence.confidence)));
                }
                if (occurrence.start != null && occurrence.end != null) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(occurrence.start)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(occurrence.end)));
                }
                // add the subsumption relationship among occurrences of the same
                // name
                if (firstOccurrenceAnnotation == null) {
                    // check already extracted annotations to find a first, most
                    // specific occurrence
                    for (Map.Entry<String, IRI> entry : previousAnnotations.entrySet()) {
                        if (entry.getKey().contains(name)) {
                            // we have found a most specific previous
                            // occurrence, use it as subsumption target
                            firstOccurrenceAnnotation = entry.getValue();
                            g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                            break;
                        }
                    }
                    if (firstOccurrenceAnnotation == null) {
                        // no most specific previous occurrence, I am the first,
                        // most specific occurrence to be later used as a target
                        firstOccurrenceAnnotation = textAnnotation;
                        previousAnnotations.put(name, textAnnotation);
                    }
                } else {
                    // I am referring to a most specific first occurrence of the
                    // same name
                    g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation));
                }
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
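
The TripleImpl usage above reduces to a small, repeatable pattern: build a subject/predicate/object triple and add it to the metadata graph while holding the ContentItem write lock. The following is a minimal, self-contained sketch of that pattern; the IRIs and the literal value are hypothetical stand-ins rather than Stanbol constants.

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;

public class TripleImplSketch {

    public static void main(String[] args) {
        Graph g = new SimpleGraph();
        // hypothetical subject and predicate IRIs
        IRI annotation = new IRI("urn:example:text-annotation-1");
        IRI selectedText = new IRI("urn:example:selected-text");
        // a language tagged plain literal, as written for ENHANCER_SELECTED_TEXT above
        g.add(new TripleImpl(annotation, selectedText, new PlainLiteralImpl("Paris", new Language("en"))));
        System.out.println(g.size() + " triple(s) in the graph");
    }
}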
Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.
In class TestMetaxaCore, the method printTriples:
/**
* This prints out the Stanbol Enhancer triples that would be created for the metadata
* contained in the given model.
*
* @param m a {@link Model}
*
* @return an {@code int} with the number of added triples
*/
private int printTriples(Model m) {
    int tripleCounter = 0;
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    ClosableIterator<Statement> it = m.iterator();
    while (it.hasNext()) {
        Statement oneStmt = it.next();
        BlankNodeOrIRI subject = (BlankNodeOrIRI) MetaxaEngine.asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
        IRI predicate = (IRI) MetaxaEngine.asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
        RDFTerm object = MetaxaEngine.asClerezzaResource(oneStmt.getObject(), blankNodeMap);
        if (null != subject && null != predicate && null != object) {
            Triple t = new TripleImpl(subject, predicate, object);
            LOG.debug("adding " + t);
            tripleCounter++;
        } else {
            LOG.debug("skipped " + oneStmt.toString());
        }
    }
    it.close();
    return tripleCounter;
}
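
The blankNodeMap handed to asClerezzaResource is what keeps blank nodes consistent across statements: the same source blank node must always convert to the same Clerezza BlankNode, otherwise the converted triples would no longer share subjects and objects. A minimal sketch of that mapping idea follows; the class and method names are illustrative and say nothing about the real MetaxaEngine implementation.

import java.util.HashMap;
import java.util.Map;
import org.apache.clerezza.commons.rdf.BlankNode;

public class BlankNodeMapSketch {

    private final Map<Object, BlankNode> blankNodeMap = new HashMap<Object, BlankNode>();

    // returns the same target BlankNode every time the same source node is passed
    public BlankNode map(Object sourceBlankNode) {
        BlankNode target = blankNodeMap.get(sourceBlankNode);
        if (target == null) {
            target = new BlankNode();
            blankNodeMap.put(sourceBlankNode, target);
        }
        return target;
    }
}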
Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.
In class RestfulLangidentEngine, the method computeEnhancements:
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method sends the text/plain content to the configured RESTful language
 * identification service and writes a TextAnnotation (dc:language, dc:type and,
 * if available, the confidence) for every detected language to the metadata of
 * the content item.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    // get the plain text Blob
    Map.Entry<IRI, Blob> textBlob = getPlainText(this, ci, false);
    Blob blob = textBlob.getValue();
    // send the text to the server
    final HttpPost request = new HttpPost(serviceUrl);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    // execute the request
    List<LangSuggestion> detected;
    try {
        detected = AccessController.doPrivileged(new PrivilegedExceptionAction<List<LangSuggestion>>() {

            public List<LangSuggestion> run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new LangIdentResponseHandler(ci, objectMapper));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof ClientProtocolException || e instanceof IOException) {
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
        } else {
            throw RuntimeException.class.cast(e);
        }
    }
    Graph metadata = ci.getMetadata();
    log.debug("Detected Languages for ContentItem {} and Blob {}", ci.getUri(), textBlob.getKey());
    ci.getLock().writeLock().lock();
    try {
        // write TextAnnotations for the detected languages
        for (LangSuggestion suggestion : detected) {
            // add a hypothesis
            log.debug(" > {}@{}", suggestion.getLanguage(), suggestion.hasProbability() ? suggestion.getProbability() : "-,--");
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(suggestion.getLanguage())));
            metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            if (suggestion.hasProbability()) {
                metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getProbability())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
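
Once these triples are written, a consumer can read the detected language back with a plain filter over the metadata graph (Stanbol also provides EnhancementEngineHelper.getLanguage for this purpose). A minimal sketch, assuming the caller passes in the same DC_LANGUAGE property constant used above:

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.Triple;

public class ReadLanguageSketch {

    // returns the lexical form of the first dc:language value, or null if none
    public static String firstLanguage(Graph metadata, IRI dcLanguage) {
        Iterator<Triple> it = metadata.filter(null, dcLanguage, null);
        if (it.hasNext()) {
            return ((Literal) it.next().getObject()).getLexicalForm();
        }
        return null;
    }
}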
Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.
In class Nif20MetadataEngine, the method writeSpan:
/**
 * Writes basic information of the parsed span by using NIF 2.0, including the
 * {@link SsoOntology} Sentence/Phrase/Word type based on
 * the {@link Span#getType()}.<p>
 * As {@link AnalysedText} is based on the plain text version of the ContentItem
 * this uses the RFC 5147 character offset based URI notation.<p>
 * <i>NOTE:</i> This DOES NOT write string relations, lemma, pos ... information
 * that might be stored as {@link Annotation} with the parsed {@link Span}.
 * @param graph the graph to add the triples to
 * @param base the base URI
 * @param text the {@link AnalysedText}
 * @param language the {@link Language} or <code>null</code> if not known
 * @param span the {@link Span} to write
 * @return the {@link IRI} representing the parsed {@link Span} in the
 * graph
 */
public IRI writeSpan(Graph graph, IRI base, AnalysedText text, Language language, Span span) {
    IRI segment = Nif20Helper.getNifRFC5147URI(base, span.getStart(), span.getType() == SpanTypeEnum.Text ? -1 : span.getEnd());
    if (!contextOnlyUriScheme || span.getType() == SpanTypeEnum.Text) {
        graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.RFC5147String.getUri()));
    }
    if (writeSelectors) {
        if (span.getEnd() - span.getStart() < 100) {
            graph.add(new TripleImpl(segment, Nif20.anchorOf.getUri(), new PlainLiteralImpl(span.getSpan(), language)));
        } else {
            graph.add(new TripleImpl(segment, Nif20.head.getUri(), new PlainLiteralImpl(span.getSpan().substring(0, 10), language)));
        }
        graph.add(new TripleImpl(segment, Nif20.beginIndex.getUri(), lf.createTypedLiteral(span.getStart())));
        graph.add(new TripleImpl(segment, Nif20.endIndex.getUri(), lf.createTypedLiteral(span.getEnd())));
        String content = text.getSpan();
        if (span.getType() != SpanTypeEnum.Text) {
            // prefix and suffix
            int prefixStart = Math.max(0, span.getStart() - DEFAULT_PREFIX_SUFFIX_LENGTH);
            graph.add(new TripleImpl(segment, Nif20.before.getUri(), new PlainLiteralImpl(content.substring(prefixStart, span.getStart()), language)));
            int suffixEnd = Math.min(span.getEnd() + DEFAULT_PREFIX_SUFFIX_LENGTH, text.getEnd());
            graph.add(new TripleImpl(segment, Nif20.after.getUri(), new PlainLiteralImpl(content.substring(span.getEnd(), suffixEnd), language)));
        }
    }
    if (writeStringType) {
        graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.String.getUri()));
    }
    switch (span.getType()) {
        case Token:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Word.getUri()));
            break;
        case Chunk:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Phrase.getUri()));
            break;
        case Sentence:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Sentence.getUri()));
            break;
        case Text:
            graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Context.getUri()));
            break;
        default:
            if (!writeStringType) {
                graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.String.getUri()));
            }
    }
    return segment;
}
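
The segment IRIs used above follow the RFC 5147 character offset fragment scheme, so a Word span covering characters 5 to 10 of a context urn:content-item-1 would be addressed along the lines of urn:content-item-1#char=5,10 (the -1 end offset passed for Text spans marks the whole context). The sketch below shows this URI construction under that assumed fragment layout; it is not a quote of Nif20Helper's actual code.

import org.apache.clerezza.commons.rdf.IRI;

public class Rfc5147UriSketch {

    // builds e.g. <urn:content-item-1#char=5,10> for the span [5,10)
    public static IRI charOffsetUri(IRI base, int start, int end) {
        return new IRI(base.getUnicodeString() + "#char=" + start + "," + end);
    }

    public static void main(String[] args) {
        System.out.println(charOffsetUri(new IRI("urn:content-item-1"), 5, 10));
    }
}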
Use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.
In class Nif20Helper, the method writePos:
/**
 * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 2.0 to the parsed
 * RDF graph by using the parsed segmentUri as subject
 * @param graph the graph
 * @param annotated the annotated element (e.g. a {@link Token})
 * @param segmentUri the URI of the resource representing the parsed
 * annotated element in the graph
 */
public static void writePos(Graph graph, Annotated annotated, IRI segmentUri) {
    Value<PosTag> posTag = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
    if (posTag != null) {
        if (posTag.value().isMapped()) {
            for (Pos pos : posTag.value().getPos()) {
                graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), pos.getUri()));
            }
            for (LexicalCategory cat : posTag.value().getCategories()) {
                graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), cat.getUri()));
            }
        }
        graph.add(new TripleImpl(segmentUri, Nif20.posTag.getUri(), lf.createTypedLiteral(posTag.value().getTag())));
        // set the oliaConf
        // remove existing conf values (e.g. for a single word phrase)
        setOliaConf(graph, segmentUri, posTag);
    }
}
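
For a concrete picture of what writePos emits, consider a token tagged NNS whose tag is mapped to the OLiA common noun class: the method writes one nif:posTag literal plus one nif:oliaCategory triple per mapped Pos and LexicalCategory. The sketch below writes equivalent triples with explicit IRIs; the NIF and OLiA namespace strings are recalled from the vocabularies rather than taken from this snippet, so treat them as assumptions.

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
import org.apache.clerezza.rdf.core.LiteralFactory;

public class WritePosSketch {

    public static void main(String[] args) {
        Graph graph = new SimpleGraph();
        LiteralFactory lf = LiteralFactory.getInstance();
        // hypothetical segment URI for a token, e.g. as produced by getNifRFC5147URI
        IRI segment = new IRI("urn:content-item-1#char=0,4");
        String nif = "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#";
        // the string tag is always written
        graph.add(new TripleImpl(segment, new IRI(nif + "posTag"), lf.createTypedLiteral("NNS")));
        // the OLiA class is written only for mapped tags
        graph.add(new TripleImpl(segment, new IRI(nif + "oliaCategory"), new IRI("http://purl.org/olia/olia.owl#CommonNoun")));
        System.out.println(graph.size() + " triple(s) written");
    }
}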