Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.
Example: the computeEnhancements method of the Nlp2RdfMetadataEngine class.
// Writes the results of the NLP processing (available via the AnalysedText
// content part) as RDF to the ContentItem metadata. Sentences, chunks (phrases)
// and tokens (words) are written as spans and linked with each other using the
// Structured Sentence (SSO) and String ontologies.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
// get the AnalysedText content part (third parameter true -> fail if missing)
AnalysedText at = getAnalysedText(this, ci, true);
String lang = EnhancementEngineHelper.getLanguage(ci);
Language language = lang == null ? null : new Language(lang);
// now iterate over the AnalysedText data and create the RDF representation
// TODO: make configurable
boolean sentences = true;
boolean phrases = true;
boolean words = true;
// collect the span types that should be written to the metadata
EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
if (sentences) {
activeTypes.add(SpanTypeEnum.Sentence);
}
if (phrases) {
activeTypes.add(SpanTypeEnum.Chunk);
}
if (words) {
activeTypes.add(SpanTypeEnum.Token);
}
Graph metadata = ci.getMetadata();
IRI base = ci.getUri();
// acquire the write lock as we are going to modify the ContentItem metadata
ci.getLock().writeLock().lock();
try {
Iterator<Span> spans = at.getEnclosed(activeTypes);
// state holding the previously written sentence/phrase/word, used to
// create the relations between consecutive and enclosing spans
IRI sentence = null;
IRI phrase = null;
IRI word = null;
boolean firstWordInSentence = true;
while (spans.hasNext()) {
Span span = spans.next();
// TODO: filter Spans based on additional requirements
// (1) write generic information about the span
IRI current = writeSpan(metadata, base, at, language, span);
// (2) add the relations between the different spans
switch(span.getType()) {
case Sentence:
// link the previous sentence to this one
if (sentence != null) {
metadata.add(new TripleImpl(sentence, SsoOntology.nextSentence.getUri(), current));
}
sentence = current;
firstWordInSentence = true;
break;
case Chunk:
if (sentence != null) {
// the chunk is a sub-string of the enclosing sentence
metadata.add(new TripleImpl(current, StringOntology.superString.getUri(), sentence));
if (word != null) {
// NOTE(review): this marks the most recently seen word as the
// sentence's sso:lastWord whenever a new chunk starts - presumably
// this relies on spans being iterated in document order; confirm
// this matches the intended "last word of the sentence" semantics.
metadata.add(new TripleImpl(word, SsoOntology.lastWord.getUri(), sentence));
}
}
phrase = current;
break;
case Token:
if (sentence != null) {
// link the word to its enclosing sentence
metadata.add(new TripleImpl(current, SsoOntology.sentence.getUri(), sentence));
if (firstWordInSentence) {
// only the first token after a sentence start gets sso:firstWord
metadata.add(new TripleImpl(current, SsoOntology.firstWord.getUri(), sentence));
firstWordInSentence = false;
}
}
if (phrase != null) {
// link the word to its enclosing phrase (chunk)
metadata.add(new TripleImpl(current, SsoOntology.parent.getUri(), phrase));
}
if (word != null) {
// double-link consecutive words (next/previous)
metadata.add(new TripleImpl(word, SsoOntology.nextWord.getUri(), current));
metadata.add(new TripleImpl(current, SsoOntology.previousWord.getUri(), word));
}
word = current;
break;
default:
break;
}
// (3) add specific information such as POS, chunk type ...
writePos(metadata, span, current);
writePhrase(metadata, span, current);
// OlIA does not include Sentiments
Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
if (sentiment != null && sentiment.value() != null) {
metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
}
}
} finally {
ci.getLock().writeLock().unlock();
}
}
Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.
Example: the findNamedEntities method of the NEREngineCore class.
/**
 * Runs the parsed OpenNLP name finder model over the content and writes a
 * fise:TextAnnotation for every detected name occurrence to the ContentItem
 * metadata. Repeated occurrences of a name (or of a name contained in a more
 * specific, earlier one) are linked via dc:relation to the first, most
 * specific occurrence. Either {@code at} or {@code text} MUST be present.
 */
protected void findNamedEntities(final ContentItem ci, final AnalysedText at, final String text, final String lang, final TokenNameFinderModel nameFinderModel) {
    if (ci == null) {
        throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
    }
    if (at == null && text == null) {
        log.warn("NULL was parsed as AnalysedText AND Text for content item " + ci.getUri() + ". One of the two MUST BE present! -> call ignored");
        return;
    }
    // language object used for plain literals (null if the language is unknown)
    final Language contentLang = lang == null || lang.isEmpty() ? null : new Language(lang);
    if (log.isDebugEnabled()) {
        log.debug("findNamedEntities model={}, language={}, text=", new Object[] { nameFinderModel, contentLang, StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
    }
    LiteralFactory lf = LiteralFactory.getInstance();
    Graph graph = ci.getMetadata();
    // prefer the AnalysedText (token information) over the raw text
    Map<String, List<NameOccurrence>> occurrencesByName = at != null
            ? extractNameOccurrences(nameFinderModel, at, lang)
            : extractNameOccurrences(nameFinderModel, text, lang);
    // lock the ContentItem while writing the RDF data for found Named Entities
    ci.getLock().writeLock().lock();
    try {
        // first (most specific) occurrence annotation per name, in insertion order
        Map<String, IRI> firstMentions = new LinkedHashMap<String, IRI>();
        for (Map.Entry<String, List<NameOccurrence>> namedOccurrences : occurrencesByName.entrySet()) {
            String name = namedOccurrences.getKey();
            IRI subsumptionTarget = null;
            for (NameOccurrence mention : namedOccurrences.getValue()) {
                IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                graph.add(new TripleImpl(annotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(name, contentLang)));
                graph.add(new TripleImpl(annotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(mention.context, contentLang)));
                if (mention.type != null) {
                    graph.add(new TripleImpl(annotation, DC_TYPE, mention.type));
                }
                if (mention.confidence != null) {
                    graph.add(new TripleImpl(annotation, ENHANCER_CONFIDENCE, lf.createTypedLiteral(mention.confidence)));
                }
                if (mention.start != null && mention.end != null) {
                    graph.add(new TripleImpl(annotation, ENHANCER_START, lf.createTypedLiteral(mention.start)));
                    graph.add(new TripleImpl(annotation, ENHANCER_END, lf.createTypedLiteral(mention.end)));
                }
                if (subsumptionTarget != null) {
                    // refer to the most specific first occurrence of the same name
                    graph.add(new TripleImpl(annotation, DC_RELATION, subsumptionTarget));
                } else {
                    // search earlier annotations for a more specific name that
                    // contains this one; if found, use it as subsumption target
                    for (Map.Entry<String, IRI> earlier : firstMentions.entrySet()) {
                        if (earlier.getKey().contains(name)) {
                            subsumptionTarget = earlier.getValue();
                            graph.add(new TripleImpl(annotation, DC_RELATION, subsumptionTarget));
                            break;
                        }
                    }
                    if (subsumptionTarget == null) {
                        // no more specific previous occurrence: this annotation is
                        // the first, most specific occurrence and becomes the
                        // target for later mentions
                        subsumptionTarget = annotation;
                        firstMentions.put(name, annotation);
                    }
                }
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.
Example: the createGraph method of the IndexedGraphTest class.
/**
 * Fills the parsed collection with randomly generated triples until it holds
 * at least {@code triples} elements.
 * <p>
 * A single random value in [0..3) drives all decisions: whether to select a
 * new subject (~1/6 of iterations), whether to rotate the predicate (~1/3)
 * and the type of the object — typed literal (int/double), plain literal
 * (no language / English / German), blank node or IRI.
 *
 * @param tc the collection the generated triples are added to
 * @param triples the minimum number of triples to generate
 * @param seed optional seed for reproducible graphs (may be <code>null</code>)
 */
private static void createGraph(Collection<Triple> tc, int triples, Long seed) {
    Random rnd = new Random();
    if (seed != null) {
        rnd.setSeed(seed);
    }
    LiteralFactory lf = LiteralFactory.getInstance();
    // thresholds within the [0..3) random range:
    // literal objects
    double l = 1.0;
    // ... typed int literal
    double i = l / 3;
    // ... typed double literal
    double d = l * 2 / 3;
    // bNode objects
    double b = 2.0;
    // ... threshold below which a NEW bNode is created
    double nb = b - (l * 2 / 3);
    double random;
    BlankNodeOrIRI subject = null;
    IRI predicate = null;
    List<IRI> predicateList = new ArrayList<IRI>();
    predicateList.add(RDF.first);
    predicateList.add(RDF.rest);
    predicateList.add(RDF.type);
    predicateList.add(RDFS.label);
    predicateList.add(RDFS.comment);
    predicateList.add(RDFS.range);
    predicateList.add(RDFS.domain);
    predicateList.add(FOAF.name);
    predicateList.add(FOAF.nick);
    predicateList.add(FOAF.homepage);
    predicateList.add(FOAF.age);
    predicateList.add(FOAF.depiction);
    String URI_PREFIX = "http://www.test.org/bigGraph/ref";
    Language DE = new Language("de");
    Language EN = new Language("en");
    Iterator<IRI> predicates = predicateList.iterator();
    List<BlankNode> bNodes = new ArrayList<BlankNode>();
    bNodes.add(new BlankNode());
    for (int count = 0; tc.size() < triples; count++) {
        random = rnd.nextDouble() * 3;
        // select a new subject (~1/6 of the cases; always for the first triple)
        if (random >= 2.5 || count == 0) {
            if (random <= 2.75) {
                subject = new IRI(URI_PREFIX + count);
            } else {
                // reuse a random existing bNode as subject
                int rndIndex = (int) ((random - 2.75) * bNodes.size() / (3.0 - 2.75));
                subject = bNodes.get(rndIndex);
            }
        }
        // rotate the predicate (~1/3 of the cases; always for the first triple)
        if (random > 2.0 || count == 0) {
            if (!predicates.hasNext()) {
                Collections.shuffle(predicateList, rnd);
                predicates = predicateList.iterator();
            }
            predicate = predicates.next();
        }
        if (random <= l) {
            // literal object
            if (random <= i) {
                tc.add(new TripleImpl(subject, predicate, lf.createTypedLiteral(count)));
            } else if (random <= d) {
                tc.add(new TripleImpl(subject, predicate, lf.createTypedLiteral(random)));
            } else {
                // plain literal: split the remaining (d..l] range into three equal
                // sub-ranges (no language / English / German).
                // FIX: the original re-tested 'random <= i' and 'random <= d' here,
                // which are both always false in this branch (random > d >= i), so
                // the no-language and English branches were dead code and every
                // plain literal was German.
                double third = (l - d) / 3;
                Literal text;
                if (random <= d + third) {
                    text = new PlainLiteralImpl("Literal for " + count);
                } else if (random <= d + 2 * third) {
                    text = new PlainLiteralImpl("An English literal for " + count, EN);
                } else {
                    text = new PlainLiteralImpl("Ein Deutsches Literal für " + count, DE);
                }
                tc.add(new TripleImpl(subject, predicate, text));
            }
        } else if (random <= b) {
            // bNode object
            BlankNode bnode;
            if (random <= nb) {
                bnode = new BlankNode();
                bNodes.add(bnode);
            } else {
                // >nb <b : reuse a random existing bNode
                int rndIndex = (int) ((random - nb) * bNodes.size() / (b - nb));
                bnode = bNodes.get(rndIndex);
            }
            tc.add(new TripleImpl(subject, predicate, bnode));
        } else {
            // IRI object
            tc.add(new TripleImpl(subject, predicate, new IRI(URI_PREFIX + count * random)));
        }
    }
}
Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.
Example: the writeComentions method of the EntityCoMentionEngine class.
/**
 * Writes fise:TextAnnotations for the parsed co-mentions and links them (via
 * dc:relation) to the TextAnnotations of their initial mentions. Suggestions
 * of the initial mention are copied over to the co-mention; confidence values
 * of pre-existing suggestions are adjusted (at most once per suggestion) by
 * the configured {@code confidenceAdjustmentFactor}, and the dc:type values of
 * the co-mention annotation are adapted based on which set of suggestions has
 * the higher confidence.
 *
 * @param ci the ContentItem whose metadata is written to
 * @param comentions the detected co-mentions
 * @param language the language of the analysed text (or <code>null</code>)
 * @param textAnnotations URIs of all fise:TextAnnotations already present in
 *            the metadata; newly created co-mention annotations are added
 */
private void writeComentions(ContentItem ci, Collection<LinkedEntity> comentions, String language, Set<IRI> textAnnotations) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    // we MUST adjust the confidence level of existing annotations only once,
    // so we need to keep track of those already adjusted
    Set<BlankNodeOrIRI> adjustedSuggestions = new HashSet<BlankNodeOrIRI>();
    log.debug("Write Co-Mentions:");
    for (LinkedEntity comention : comentions) {
        log.debug(" > {}", comention);
        // URIs of TextAnnotations for the initial mention of this co-mention
        Collection<IRI> initialMentions = new ArrayList<IRI>(comention.getSuggestions().size());
        for (Suggestion suggestion : comention.getSuggestions()) {
            Entity entity = suggestion.getEntity();
            if (textAnnotations.contains(entity.getUri())) {
                // this is a textAnnotation
                initialMentions.add(entity.getUri());
            }
            // else TODO support also Entities!!
        }
        // create the TextAnnotations for the co-mention
        for (Occurrence occurrence : comention.getOccurrences()) {
            Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
            Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
            // search for an existing TextAnnotation with the same start or end;
            // ignore this occurrence if an existing annotation selects a bigger span
            boolean ignore = false;
            IRI textAnnotation = null;
            Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer end = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
                if (end != null && textAnnotations.contains(t.getSubject())) {
                    textAnnotation = (IRI) t.getSubject();
                    if (end > occurrence.getEnd()) {
                        // there is an other TextAnnotation selecting a bigger Span
                        // so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            it = metadata.filter(null, ENHANCER_END, endLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer start = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
                if (start != null && textAnnotations.contains(t.getSubject())) {
                    textAnnotation = (IRI) t.getSubject();
                    if (start < occurrence.getStart()) {
                        // there is an other TextAnnotation selecting a bigger Span
                        // so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            if (!ignore) {
                // maximum confidence over the initial mention(s) and their suggestions
                Double maxConfidence = null;
                // maximum confidence of existing (kept) suggestions
                Double maxExistingConfidence = null;
                if (textAnnotation == null) {
                    // not found ... create a new TextAnnotation for the co-mention
                    textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                    // add it to the set of TextAnnotations
                    textAnnotations.add(textAnnotation);
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.getContext(), languageObject)));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occurrence.getSelectedText(), languageObject)));
                } else {
                    // if existing add this engine as contributor
                    metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
                }
                // now process the initial mention(s) of this co-mention
                Set<IRI> dcTypes = new HashSet<IRI>();
                for (IRI initialMention : initialMentions) {
                    // collect the dc:type(s) of the initial mention
                    Iterator<IRI> dcTypesIt = getReferences(metadata, initialMention, DC_TYPE);
                    while (dcTypesIt.hasNext()) {
                        dcTypes.add(dcTypesIt.next());
                    }
                    // track the maximum confidence of the initial mention itself
                    Double mentionConfidence = EnhancementEngineHelper.get(metadata, initialMention, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                    if (mentionConfidence != null && (maxConfidence == null || maxConfidence.compareTo(mentionConfidence) <= 0)) {
                        maxConfidence = mentionConfidence;
                    }
                    // now we need to compare the suggestions of the initial
                    // mention(s) with the existing ones:
                    // collect suggestion -> confidence and entity -> suggestion maps
                    Map<RDFTerm, Double> initialSuggestions = new HashMap<RDFTerm, Double>();
                    Map<RDFTerm, RDFTerm> initialSuggestedEntities = new HashMap<RDFTerm, RDFTerm>();
                    for (Iterator<Triple> suggestions = metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext(); ) {
                        BlankNodeOrIRI suggestion = suggestions.next().getSubject();
                        // FIX: the original tested 'textAnnotations.contains(suggestions)'
                        // (the Iterator object!), so the intended filtering of
                        // fise:TextAnnotations never took effect; test the subject instead
                        if (!textAnnotations.contains(suggestion)) {
                            RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, suggestion, ENHANCER_ENTITY_REFERENCE);
                            if (suggestedEntity != null) {
                                // it has a suggestion
                                // FIX: the original compared/assigned 'confidnece' (the
                                // initial mention's confidence) instead of this
                                // suggestion's own confidence
                                Double confidence = EnhancementEngineHelper.get(metadata, suggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                if (confidence != null && (maxConfidence == null || maxConfidence.compareTo(confidence) <= 0)) {
                                    maxConfidence = confidence;
                                }
                                initialSuggestions.put(suggestion, confidence);
                                initialSuggestedEntities.put(suggestedEntity, suggestion);
                            }
                            // else no suggestion (dc:relation to some other resource)
                        }
                        // else ignore dc:relation to other fise:TextAnnotations
                    }
                    // now we collect existing Suggestions for this TextAnnotation where
                    // we need to adjust the confidence (quite some things to check ...)
                    Map<BlankNodeOrIRI, Double> existingSuggestions = new HashMap<BlankNodeOrIRI, Double>();
                    if (maxConfidence != null && confidenceAdjustmentFactor < 1) {
                        // suggestions are defined by incoming dc:relation triples
                        for (Iterator<Triple> esIt = metadata.filter(null, DC_RELATION, textAnnotation); esIt.hasNext(); ) {
                            BlankNodeOrIRI existingSuggestion = esIt.next().getSubject();
                            // but not all of them are suggestions
                            if (!textAnnotations.contains(existingSuggestion)) {
                                // ignore fise:TextAnnotations
                                Double existingConfidence = EnhancementEngineHelper.get(metadata, existingSuggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                // ignore suggestions also suggested for the initial mention
                                if (!initialSuggestions.containsKey(existingSuggestion)) {
                                    RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, existingSuggestion, ENHANCER_ENTITY_REFERENCE);
                                    // suggestions for the initial mention
                                    if (!initialSuggestedEntities.containsKey(suggestedEntity)) {
                                        // finally make sure that we adjust confidences only once
                                        if (!adjustedSuggestions.contains(existingSuggestion)) {
                                            existingSuggestions.put(existingSuggestion, existingConfidence);
                                        }
                                        // else confidence already adjusted
                                    } else {
                                        // different fise:EntityAnnotation, but same referenced
                                        // Entity - we need to check confidences to decide what to do
                                        RDFTerm initialSuggestion = initialSuggestedEntities.get(suggestedEntity);
                                        Double initialConfidence = initialSuggestions.get(initialSuggestion);
                                        if (initialConfidence == null || (existingConfidence != null && existingConfidence.compareTo(initialConfidence) >= 0)) {
                                            // existing confidence >= initial .. keep existing
                                            initialSuggestions.remove(initialSuggestion);
                                            if (maxExistingConfidence == null || maxExistingConfidence.compareTo(existingConfidence) <= 0) {
                                                maxExistingConfidence = existingConfidence;
                                            }
                                        } else {
                                            // adjust this one (if not yet adjusted)
                                            if (!adjustedSuggestions.contains(existingSuggestion)) {
                                                existingSuggestions.put(existingSuggestion, existingConfidence);
                                            }
                                        }
                                    }
                                } else {
                                    // an initial mention suggestion is already present:
                                    // no need to process it again
                                    initialSuggestions.remove(existingSuggestion);
                                    if (existingConfidence != null && (maxExistingConfidence == null || maxExistingConfidence.compareTo(existingConfidence) <= 0)) {
                                        maxExistingConfidence = existingConfidence;
                                    }
                                    // else maxExistingConfidence stays null (undefined)
                                }
                            }
                            // else ignore dc:relations from other fise:TextAnnotations
                        }
                        // adjust the confidence of the collected existing suggestions
                        for (Entry<BlankNodeOrIRI, Double> entry : existingSuggestions.entrySet()) {
                            if (entry.getValue() != null) {
                                double adjustedConfidence = entry.getValue() * confidenceAdjustmentFactor;
                                if (maxExistingConfidence == null || adjustedConfidence > maxExistingConfidence) {
                                    maxExistingConfidence = adjustedConfidence;
                                }
                                EnhancementEngineHelper.set(metadata, entry.getKey(), ENHANCER_CONFIDENCE, adjustedConfidence, literalFactory);
                                // mark as adjusted
                                adjustedSuggestions.add(entry.getKey());
                            }
                        }
                    }
                    // add the (remaining) suggestions of the initial mention to this one
                    for (RDFTerm suggestion : initialSuggestions.keySet()) {
                        metadata.add(new TripleImpl((BlankNodeOrIRI) suggestion, DC_RELATION, textAnnotation));
                    }
                    // finally link the co-mention with the initial one
                    metadata.add(new TripleImpl(textAnnotation, DC_RELATION, initialMention));
                }
                // Adapt the dc:type values of the fise:TextAnnotation:
                // - if suggestions added by this engine have the max confidence,
                //   use the dc:type values of the initial mention
                // - if the original suggestions have a higher confidence, keep the existing
                // - in case both have the same confidence, add all dc:types
                boolean removeExistingDcTypes = maxConfidence != null && (maxExistingConfidence == null || maxConfidence.compareTo(maxExistingConfidence) >= 0);
                boolean addCoMentionDcTypes = maxExistingConfidence == null || (maxConfidence != null && maxConfidence.compareTo(maxExistingConfidence) > 0);
                Iterator<IRI> existingDcTypesIt = getReferences(metadata, textAnnotation, DC_TYPE);
                while (existingDcTypesIt.hasNext()) {
                    // remove existing dc:types not (re-)added as co-mention types
                    if ((!dcTypes.remove(existingDcTypesIt.next()) || !addCoMentionDcTypes) && removeExistingDcTypes) {
                        // remove the dcType
                        existingDcTypesIt.remove();
                    }
                }
                if (addCoMentionDcTypes) {
                    for (IRI dcType : dcTypes) {
                        // add missing
                        metadata.add(new TripleImpl(textAnnotation, DC_TYPE, dcType));
                    }
                }
                // TODO: support also Entities
                if (maxConfidence != null) {
                    // set the confidence value (if known)
                    EnhancementEngineHelper.set(metadata, textAnnotation, ENHANCER_CONFIDENCE, maxConfidence, literalFactory);
                }
            }
            // else ignore this occurrence
        }
    }
}
Use of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project.
Example: the writeEntityAnnotation method of the EnhancementRDFUtils class.
/**
 * Writes a fise:EntityAnnotation for the parsed {@code suggestion} to the
 * graph and relates it to the given enhancements.
 *
 * @param engine
 *            the enhancement engine creating the annotation
 * @param literalFactory
 *            the LiteralFactory to use
 * @param graph
 *            the Graph to use
 * @param contentItemId
 *            the contentItemId the enhancement is extracted from
 * @param relatedEnhancements
 *            enhancements this textAnnotation is related to
 * @param suggestion
 *            the entity suggestion
 * @param nameField the field used to extract the name
 * @param lang the preferred language to include or <code>null</code> if none
 * @return the URI of the created fise:EntityAnnotation
 */
public static IRI writeEntityAnnotation(EnhancementEngine engine, LiteralFactory literalFactory, Graph graph, IRI contentItemId, Collection<BlankNodeOrIRI> relatedEnhancements, Suggestion suggestion, String nameField, String lang) {
    Representation rep = suggestion.getEntity().getRepresentation();
    // 1. extract the "best label"
    // Start with the matched one
    Text label = suggestion.getMatchedLabel();
    // check if the matched label is in the requested language
    boolean langMatch = isLangMatch(label, lang);
    // search if a better label is available for this Entity
    if (!langMatch) {
        Iterator<Text> labels = rep.getText(nameField);
        while (labels.hasNext() && !langMatch) {
            Text actLabel = labels.next();
            langMatch = isLangMatch(actLabel, lang);
            if (langMatch) {
                // the language matches -> override the matched label
                label = actLabel;
            }
        }
    }
    // else the matched label will be the best to use
    Literal literal;
    if (label.getLanguage() == null) {
        literal = new PlainLiteralImpl(label.getText());
    } else {
        literal = new PlainLiteralImpl(label.getText(), new Language(label.getLanguage()));
    }
    // 2. create the entityAnnotation
    IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(graph, engine, contentItemId);
    // first relate this entity annotation to the text annotation(s)
    for (BlankNodeOrIRI enhancement : relatedEnhancements) {
        graph.add(new TripleImpl(entityAnnotation, DC_RELATION, enhancement));
    }
    IRI entityUri = new IRI(rep.getId());
    // add the link to the referred entity
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entityUri));
    // add the label selected above
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, literal));
    if (suggestion.getScore() != null) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
    }
    Iterator<Reference> types = rep.getReferences(RDF_TYPE.getUnicodeString());
    while (types.hasNext()) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, new IRI(types.next().getReference())));
    }
    // add the name of the ReferencedSite that manages the Entity
    if (suggestion.getEntity().getSite() != null) {
        graph.add(new TripleImpl(entityAnnotation, new IRI(RdfResourceEnum.site.getUri()), new PlainLiteralImpl(suggestion.getEntity().getSite())));
    }
    return entityAnnotation;
}

/**
 * Null-safe language match: <code>true</code> if both the label language and
 * the preferred language are <code>null</code>, or if the label language
 * starts with the (non-null) preferred language.
 * <p>
 * FIX: the original inline check evaluated
 * <code>label.getLanguage().startsWith(lang)</code> even when
 * <code>lang</code> was <code>null</code> (whenever the label carried a
 * language), causing a {@link NullPointerException}.
 *
 * @param label the label to check (expected non-null)
 * @param lang the preferred language or <code>null</code> for "no language"
 * @return whether the label matches the preferred language
 */
private static boolean isLangMatch(Text label, String lang) {
    String labelLang = label.getLanguage();
    if (lang == null) {
        return labelLang == null;
    }
    return labelLang != null && labelLang.startsWith(lang);
}
Aggregations