Usage of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project:
class TextAnnotationsNewModelEngine, method computeEnhancements.
/**
 * Computes the enhancements on the provided ContentItem.
 * <p>
 * Migrates existing {@code fise:TextAnnotation}s to the new selection model:
 * for annotations that only provide {@code fise:start}/{@code fise:end}
 * offsets this adds the missing {@code fise:selection-prefix},
 * {@code fise:selection-suffix} and {@code fise:selected-text} triples
 * (or {@code fise:selection-head}/{@code fise:selection-tail} when the
 * selection is longer than {@code 3 * prefixSuffixSize}) based on the
 * plain-text content of the ContentItem.
 *
 * @param contentItem the ContentItem to enhance
 * @throws EngineException if the plain text Blob cannot be read
 */
@Override
public void computeEnhancements(ContentItem contentItem) throws EngineException {
    Entry<IRI, Blob> textBlob = getBlob(contentItem, supportedMimeTypes);
    if (textBlob == null) {
        // no plain-text content part present -> nothing to do
        return;
    }
    // language of the content, used as the language tag of created literals
    String language = EnhancementEngineHelper.getLanguage(contentItem);
    Language lang = language == null ? null : new Language(language);
    String text;
    try {
        text = ContentItemHelper.getText(textBlob.getValue());
    } catch (IOException e) {
        throw new EngineException(this, contentItem, "Unable to read Plain Text Blob", e);
    }
    // collect new triples here and add them under a write lock at the end
    Set<Triple> addedTriples = new HashSet<Triple>();
    Graph metadata = contentItem.getMetadata();
    // extract all the necessary information within a read lock
    contentItem.getLock().readLock().lock();
    try {
        Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
        while (it.hasNext()) {
            BlankNodeOrIRI ta = it.next().getSubject();
            boolean hasPrefix = metadata.filter(ta, ENHANCER_SELECTION_PREFIX, null).hasNext();
            boolean hasSuffix = metadata.filter(ta, ENHANCER_SELECTION_SUFFIX, null).hasNext();
            boolean hasSelected = metadata.filter(ta, ENHANCER_SELECTED_TEXT, null).hasNext();
            if (hasPrefix && hasSuffix && hasSelected) {
                // this TextAnnotation already uses the new model
                continue;
            }
            // start offset (null if missing or invalid -> skip transformation)
            Integer start;
            if (!hasPrefix) {
                start = EnhancementEngineHelper.get(metadata, ta, ENHANCER_START, Integer.class, lf);
                if (start == null) {
                    log.debug("unable to add fise:selection-prefix to TextAnnotation {} " + "because fise:start is not present", ta);
                } else if (start < 0) {
                    log.warn("fise:start {} of TextAnnotation {} < 0! " + "Will not transform this TextAnnotation", start, ta);
                    // FIX(review): was 'start = 0', which contradicted the log
                    // message and - unlike the fise:end handling below - still
                    // transformed the invalid annotation. Skip it consistently.
                    start = null;
                } else if (start > text.length()) {
                    // FIX(review): guard against offsets beyond the content,
                    // which would cause StringIndexOutOfBoundsException in the
                    // substring(..) calls below
                    log.warn("fise:start {} of TextAnnotation {} > as the content length {}! " + "Will not transform this TextAnnotation", start, ta, text.length());
                    start = null;
                }
            } else {
                start = null;
            }
            // end offset (null if missing or invalid -> skip transformation)
            Integer end;
            if (!hasSuffix) {
                end = EnhancementEngineHelper.get(metadata, ta, ENHANCER_END, Integer.class, lf);
                if (end == null) {
                    log.debug("unable to add fise:selection-suffix to TextAnnotation {} " + "because fise:end is not present", ta);
                } else if (end > text.length()) {
                    log.warn("fise:end {} of TextAnnotation {} > as the content length {}! " + "Will not transform this TextAnnotation", end, ta, text.length());
                    end = null;
                } else if (start != null && end < start) {
                    log.warn("fise:end {} < fise:start {} of TextAnnotation {}! " + "Will not transform this TextAnnotation", end, start, ta);
                    end = null;
                    start = null;
                }
            } else {
                end = null;
            }
            if (!hasPrefix && start != null) {
                // up to prefixSuffixSize chars of text preceding the selection
                addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_PREFIX, new PlainLiteralImpl(text.substring(Math.max(0, start - prefixSuffixSize), start), lang)));
            }
            if (!hasSuffix && end != null) {
                // up to prefixSuffixSize chars of text following the selection
                addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_SUFFIX, new PlainLiteralImpl(text.substring(end, Math.min(text.length(), end + prefixSuffixSize)), lang)));
            }
            if (!hasSelected && start != null && end != null) {
                // This adds missing fise:selected or fise:head/fise:tail if the selected text is to long
                int length = end - start;
                if (length > 3 * prefixSuffixSize) {
                    // selection too long to repeat -> only store head and tail
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_HEAD, new PlainLiteralImpl(text.substring(start, start + prefixSuffixSize), lang)));
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_TAIL, new PlainLiteralImpl(text.substring(end - prefixSuffixSize, end), lang)));
                } else {
                    // add missing fise:selected
                    String selection = text.substring(start, end);
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(selection, lang)));
                    // check if we should also add an selection context
                    if (!metadata.filter(ta, ENHANCER_SELECTION_CONTEXT, null).hasNext()) {
                        addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(EnhancementEngineHelper.getSelectionContext(text, selection, start), lang)));
                    }
                }
            }
        }
    } finally {
        contentItem.getLock().readLock().unlock();
    }
    // finally write the prefix/suffix triples within a write lock
    if (!addedTriples.isEmpty()) {
        contentItem.getLock().writeLock().lock();
        try {
            metadata.addAll(addedTriples);
        } finally {
            contentItem.getLock().writeLock().unlock();
        }
    }
}
Usage of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project:
class Nif20MetadataEngine, method computeEnhancements.
/**
 * Writes NIF 2.0 metadata for the AnalysedText of the parsed ContentItem:
 * a nif:Context resource for the whole text plus one resource per sentence,
 * chunk (phrase) and token (word), linked together via the NIF structure
 * properties (nextSentence, firstWord, sentence, superString, ...).
 *
 * @param ci the ContentItem to enhance
 * @throws EngineException on processing errors (presumably also when the
 *         required AnalysedText content part is missing, given the
 *         {@code true} flag passed to getAnalysedText - TODO confirm)
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at = getAnalysedText(this, ci, true);
    String lang = EnhancementEngineHelper.getLanguage(ci);
    Language language = lang == null ? null : new Language(lang);
    // now iterate over the AnalysedText data and create the RDF representation
    // TODO: make configureable
    boolean sentences = true;
    boolean phrases = true;
    boolean words = true;
    // span types that will be serialized (all three are currently hard-coded on)
    EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
    if (sentences) {
        activeTypes.add(SpanTypeEnum.Sentence);
    }
    if (phrases) {
        activeTypes.add(SpanTypeEnum.Chunk);
    }
    if (words) {
        activeTypes.add(SpanTypeEnum.Token);
    }
    Graph metadata = ci.getMetadata();
    IRI base = ci.getUri();
    ci.getLock().writeLock().lock();
    try {
        // write the context
        IRI text = writeSpan(metadata, base, at, language, at);
        metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), ci.getUri()));
        Iterator<Span> spans = at.getEnclosed(activeTypes);
        // state carried across iterations: the most recent sentence/phrase/word
        // resources, used to emit the relations between consecutive spans.
        // NOTE(review): 'word' is never reset at sentence boundaries, so
        // nif:previousWord/nif:nextWord links cross sentence borders -
        // presumably intended; confirm.
        IRI sentence = null;
        IRI phrase = null;
        IRI word = null;
        boolean firstWordInSentence = true;
        while (spans.hasNext()) {
            Span span = spans.next();
            // TODO: filter Spans based on additional requirements
            // (1) write generic information about the span
            IRI current = writeSpan(metadata, base, at, language, span);
            // write the context
            metadata.add(new TripleImpl(current, Nif20.referenceContext.getUri(), text));
            // (2) add the relations between the different spans
            switch(span.getType()) {
                case Sentence:
                    if (sentence != null && writePrevNext) {
                        // link consecutive sentences in both directions
                        metadata.add(new TripleImpl(sentence, Nif20.nextSentence.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousSentence.getUri(), sentence));
                    }
                    if (word != null) {
                        // 'sentence' is still the PREVIOUS sentence here, so this
                        // records the last word of the sentence just completed.
                        // NOTE(review): the lastWord of the final sentence is never
                        // written, because it is only emitted when a following
                        // sentence starts - TODO confirm this is intended.
                        metadata.add(new TripleImpl(sentence, Nif20.lastWord.getUri(), word));
                    }
                    sentence = current;
                    firstWordInSentence = true;
                    break;
                case Chunk:
                    if (sentence != null && writeHierary) {
                        metadata.add(new TripleImpl(current, Nif20.superString.getUri(), sentence));
                    }
                    phrase = current;
                    break;
                case Token:
                    if (sentence != null) {
                        if (writeHierary) {
                            metadata.add(new TripleImpl(current, Nif20.sentence.getUri(), sentence));
                        }
                        // metadata.add(new TripleImpl(sentence, Nif20.word.getUri(), current));
                        if (firstWordInSentence) {
                            metadata.add(new TripleImpl(sentence, Nif20.firstWord.getUri(), current));
                            firstWordInSentence = false;
                        }
                    }
                    if (writeHierary && phrase != null && !phrase.equals(current)) {
                        metadata.add(new TripleImpl(current, Nif20.subString.getUri(), phrase));
                    }
                    if (word != null && writePrevNext) {
                        // link consecutive words in both directions
                        metadata.add(new TripleImpl(word, Nif20.nextWord.getUri(), current));
                        metadata.add(new TripleImpl(current, Nif20.previousWord.getUri(), word));
                    }
                    word = current;
                    break;
                default:
                    break;
            }
            // (3) add specific information such as POS, chunk type ...
            Nif20Helper.writePhrase(metadata, span, current);
            Nif20Helper.writePos(metadata, span, current);
            // TODO: sentiment support
            Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
            if (sentiment != null && sentiment.value() != null) {
                metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment.value())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Usage of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project:
class KeywordLinkingEngine, method writeEnhancements.
/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem: one fise:TextAnnotation per
 * occurrence of an entity in the text, plus one fise:EntityAnnotation per
 * suggested Entity, linked to all TextAnnotations of that LinkedEntity.
 *
 * @param ci the ContentItem the enhancements are written to
 * @param linkedEntities the entities linked in the text
 * @param language the language of the content (may be null or empty)
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
    // language tag for created plain literals (null if no language detected)
    Language contentLang = (language == null || language.isEmpty()) ? null : new Language(language);
    Graph graph = ci.getMetadata();
    for (LinkedEntity entity : linkedEntities) {
        Collection<IRI> occurrenceAnnotations = new ArrayList<IRI>(entity.getOccurrences().size());
        // (1) create the TextAnnotations for the Occurrences
        for (Occurrence occ : entity.getOccurrences()) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            occurrenceAnnotations.add(ta);
            graph.add(new TripleImpl(ta, Properties.ENHANCER_START, literalFactory.createTypedLiteral(occ.getStart())));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_END, literalFactory.createTypedLiteral(occ.getEnd())));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occ.getContext(), contentLang)));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occ.getSelectedText(), contentLang)));
            graph.add(new TripleImpl(ta, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(entity.getScore())));
            for (IRI dcType : entity.getTypes()) {
                graph.add(new TripleImpl(ta, Properties.DC_TYPE, dcType));
            }
        }
        // (2) create the EntityAnnotations for the Suggestions
        for (Suggestion suggestion : entity.getSuggestions()) {
            IRI ea = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // should we use the label used for the match, or search the
            // representation for the best label ... currently its the matched one
            Text label = suggestion.getBestLabel(linkerConfig.getNameField(), language);
            PlainLiteralImpl labelLiteral = label.getLanguage() == null
                    ? new PlainLiteralImpl(label.getText())
                    : new PlainLiteralImpl(label.getText(), new Language(label.getLanguage()));
            graph.add(new TripleImpl(ea, Properties.ENHANCER_ENTITY_LABEL, labelLiteral));
            graph.add(new TripleImpl(ea, Properties.ENHANCER_ENTITY_REFERENCE, new IRI(suggestion.getRepresentation().getId())));
            Iterator<Reference> typeRefs = suggestion.getRepresentation().getReferences(linkerConfig.getTypeField());
            while (typeRefs.hasNext()) {
                graph.add(new TripleImpl(ea, Properties.ENHANCER_ENTITY_TYPE, new IRI(typeRefs.next().getReference())));
            }
            graph.add(new TripleImpl(ea, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
            // relate the EntityAnnotation to all TextAnnotations of the entity
            for (IRI ta : occurrenceAnnotations) {
                graph.add(new TripleImpl(ea, Properties.DC_RELATION, ta));
            }
            // add the name of the ReferencedSite providing this suggestion
            graph.add(new TripleImpl(ea, new IRI(RdfResourceEnum.site.getUri()), new PlainLiteralImpl(referencedSiteName)));
            // add the RDF data for entities
            if (dereferenceEntitiesState) {
                graph.addAll(RdfValueFactory.getInstance().toRdfRepresentation(suggestion.getRepresentation()).getRdfGraph());
            }
        }
    }
}
Usage of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project:
class Suggestion, method getBestLabel.
/**
 * Getter for the best label of this Suggestion's Entity in the given language.
 *
 * @param nameField the field used to search for labels
 * @param language the preferred language, or {@code null} if there is no
 *        language preference
 * @return the best match, or {@link Suggestion#getMatchedLabel()} if none is
 *         found
 */
public Literal getBestLabel(IRI nameField, String language) {
    Entity rep = getEntity();
    // start with the matched label -> so if we do not find a better one
    // we will use the matched!
    Literal matchedLabel = getMatchedLabel();
    Literal label = matchedLabel;
    // 1. check if the returned Entity does has a label -> if not return null
    // add labels (set only a single label. Use "en" if available!
    Iterator<Literal> labels = rep.getText(nameField);
    boolean matchFound = false;
    while (labels.hasNext() && !matchFound) {
        Literal actLabel = labels.next();
        if (label == null) {
            // no matched label and no label found so far -> use as fallback
            label = actLabel;
        }
        // now we have already a label check the language
        Language actLang = actLabel.getLanguage();
        // use startWith to match also en-GB and en-US ...
        // FIX(review): guard against a null 'language' parameter - the original
        // threw a NullPointerException (startsWith(null)) as soon as a label
        // with a language tag was encountered while no language was requested.
        if (language != null && actLang != null && actLang.toString().startsWith(language)) {
            // prefer labels with the correct language
            label = actLabel;
            if (matchedLabel != null && matchedLabel.getLexicalForm().equalsIgnoreCase(label.getLexicalForm())) {
                // found label in that language that exactly matches the
                // label used to match the text -> stop searching
                matchFound = true;
            }
        }
    }
    return label;
}
Usage of org.apache.clerezza.commons.rdf.Language in the Apache Stanbol project:
class DBPSpotlightDisambiguateEnhancementTest, method initTest.
/**
 * Sets up the test fixture: a ContentItem holding English plain text plus a
 * single fise:TextAnnotation for "Angela Merkel" that the disambiguation
 * engine under test can work on.
 *
 * @throws IOException if creating the ContentItem fails
 */
@Before
public void initTest() throws IOException {
    // the content item used by all tests
    ci = ciFactory.createContentItem(new StringSource(TEST_TEXT));
    assertNotNull(ci);
    textContentPart = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    assertNotNull(textContentPart);
    // declare the language of the text and check it is readable again
    ci.getMetadata().add(new TripleImpl(ci.getUri(), Properties.DC_LANGUAGE, new PlainLiteralImpl("en")));
    assertEquals("en", EnhancementEngineHelper.getLanguage(ci));
    // a fise:TextAnnotation is needed to test disambiguation
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    Language english = new Language("en");
    String selectedText = "Angela Merkel";
    int startOffset = TEST_TEXT.indexOf(selectedText);
    int endOffset = startOffset + selectedText.length();
    IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, new DBPSpotlightSpotEnhancementEngine());
    Graph metadata = ci.getMetadata();
    metadata.add(new TripleImpl(annotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(selectedText, english)));
    metadata.add(new TripleImpl(annotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(TEST_TEXT, english)));
    metadata.add(new TripleImpl(annotation, Properties.ENHANCER_START, literalFactory.createTypedLiteral(startOffset)));
    metadata.add(new TripleImpl(annotation, Properties.ENHANCER_END, literalFactory.createTypedLiteral(endOffset)));
    metadata.add(new TripleImpl(annotation, Properties.DC_TYPE, OntologicalClasses.DBPEDIA_PERSON));
    // validate that the created TextAnnotation is valid (test the test ...)
    EnhancementStructureHelper.validateAllTextAnnotations(metadata, TEST_TEXT, null);
}
Aggregations