Use of org.apache.clerezza.commons.rdf.BlankNodeOrIRI in the Apache Stanbol project.
Snippet from the class TextAnnotationsNewModelEngine, method computeEnhancements.
/**
 * Computes the enhancements on the provided ContentItem.
 * <p>
 * Back-ports existing {@code fise:TextAnnotation} instances to the "new"
 * selection model: for annotations that are missing them it adds
 * {@code fise:selection-prefix} / {@code fise:selection-suffix} and - based on
 * the selection length - either {@code fise:selected-text} (plus
 * {@code fise:selection-context}) or {@code fise:selection-head} /
 * {@code fise:selection-tail} triples.
 * <p>
 * The metadata graph is only inspected under the ContentItem's read lock;
 * all collected triples are added afterwards in one batch under the write lock.
 *
 * @param contentItem the content item to process; silently ignored when it
 *        has no Blob matching {@code supportedMimeTypes}
 * @throws EngineException if the plain text Blob can not be read
 */
@Override
public void computeEnhancements(ContentItem contentItem) throws EngineException {
Entry<IRI, Blob> textBlob = getBlob(contentItem, supportedMimeTypes);
if (textBlob == null) {
//no plain text content present - nothing to do
return;
}
//content language (may be null) - used for all created PlainLiterals
String language = EnhancementEngineHelper.getLanguage(contentItem);
Language lang = language == null ? null : new Language(language);
String text;
try {
text = ContentItemHelper.getText(textBlob.getValue());
} catch (IOException e) {
throw new EngineException(this, contentItem, "Unable to read Plain Text Blob", e);
}
//triples to add are collected here and written in a single batch below
Set<Triple> addedTriples = new HashSet<Triple>();
Graph metadata = contentItem.getMetadata();
//extract all the necessary information within a read lock
contentItem.getLock().readLock().lock();
try {
//iterate over every fise:TextAnnotation in the metadata graph
Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
while (it.hasNext()) {
BlankNodeOrIRI ta = it.next().getSubject();
//check which parts of the new selection model are already present
boolean hasPrefix = metadata.filter(ta, ENHANCER_SELECTION_PREFIX, null).hasNext();
boolean hasSuffix = metadata.filter(ta, ENHANCER_SELECTION_SUFFIX, null).hasNext();
boolean hasSelected = metadata.filter(ta, ENHANCER_SELECTED_TEXT, null).hasNext();
if (hasPrefix && hasSuffix && hasSelected) {
//this TextAnnotation already uses the new model
continue;
}
//fise:start is only read when the prefix is missing; null means "do not add"
Integer start;
if (!hasPrefix) {
start = EnhancementEngineHelper.get(metadata, ta, ENHANCER_START, Integer.class, lf);
if (start == null) {
log.debug("unable to add fise:selection-prefix to TextAnnotation {} " + "because fise:start is not present", ta);
} else if (start < 0) {
//NOTE(review): the message says "Will not transform", but start is
//clamped to 0 and processing continues - confirm this is intended
log.warn("fise:start {} of TextAnnotation {} < 0! " + "Will not transform this TextAnnotation", start, ta);
start = 0;
}
} else {
start = null;
}
//fise:end is only read when the suffix is missing; null means "do not add"
Integer end;
if (!hasSuffix) {
end = EnhancementEngineHelper.get(metadata, ta, ENHANCER_END, Integer.class, lf);
if (end == null) {
log.debug("unable to add fise:selection-suffix to TextAnnotation {} " + "because fise:end is not present", ta);
} else if (end > text.length()) {
//end lies outside the content - skip suffix/selection for this annotation
log.warn("fise:end {} of TextAnnotation {} > as the content length {}! " + "Will not transform this TextAnnotation", end, ta, text.length());
end = null;
} else if (start != null && end < start) {
//inconsistent offsets - skip both prefix and suffix
log.warn("fise:end {} < fise:start {} of TextAnnotation {}! " + "Will not transform this TextAnnotation", end, start, ta);
end = null;
start = null;
}
} else {
end = null;
}
//up to prefixSuffixSize chars before the selection become the prefix
if (!hasPrefix && start != null) {
addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_PREFIX, new PlainLiteralImpl(text.substring(Math.max(0, start - prefixSuffixSize), start), lang)));
}
//up to prefixSuffixSize chars after the selection become the suffix
if (!hasSuffix && end != null) {
addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_SUFFIX, new PlainLiteralImpl(text.substring(end, Math.min(text.length(), end + prefixSuffixSize)), lang)));
}
if (!hasSelected && start != null && end != null) {
//This adds missing fise:selected or fise:head/fise:tail if the selected text is to long
int length = end - start;
if (length > 3 * prefixSuffixSize) {
//selection too long for fise:selected-text - store head and tail instead
addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_HEAD, new PlainLiteralImpl(text.substring(start, start + prefixSuffixSize), lang)));
addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_TAIL, new PlainLiteralImpl(text.substring(end - prefixSuffixSize, end), lang)));
} else {
//add missing fise:selected
String selection = text.substring(start, end);
addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(selection, lang)));
//check if we should also add an selection context
if (!metadata.filter(ta, ENHANCER_SELECTION_CONTEXT, null).hasNext()) {
addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(EnhancementEngineHelper.getSelectionContext(text, selection, start), lang)));
}
}
}
}
} finally {
contentItem.getLock().readLock().unlock();
}
//finally write the prefix/suffix triples within a write lock
if (!addedTriples.isEmpty()) {
contentItem.getLock().writeLock().lock();
try {
metadata.addAll(addedTriples);
} finally {
contentItem.getLock().writeLock().unlock();
}
}
}
Use of org.apache.clerezza.commons.rdf.BlankNodeOrIRI in the Apache Stanbol project.
Snippet from the class TextAnnotationNewModelEngineTest, method testTextAnnotationNewModel.
/**
 * Runs the engine on the test ContentItem and verifies that every
 * fise:TextAnnotation in the resulting metadata is an IRI that validates
 * against the new TextAnnotation model.
 */
@Test
public void testTextAnnotationNewModel() throws EngineException {
    Assert.assertEquals(EnhancementEngine.ENHANCE_ASYNC, engine.canEnhance(contentItem));
    engine.computeEnhancements(contentItem);
    //validate the produced enhancement structure
    Graph graph = contentItem.getMetadata();
    Iterator<Triple> textAnnotations = graph.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
    //at least one TextAnnotation must be present
    Assert.assertTrue(textAnnotations.hasNext());
    do {
        BlankNodeOrIRI annotation = textAnnotations.next().getSubject();
        Assert.assertTrue(annotation instanceof IRI);
        Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
        expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, contentItem.getUri());
        EnhancementStructureHelper.validateTextAnnotation(graph, (IRI) annotation, SINGLE_SENTENCE, expectedValues, true);
    } while (textAnnotations.hasNext());
}
Use of org.apache.clerezza.commons.rdf.BlankNodeOrIRI in the Apache Stanbol project.
Snippet from the class TikaEngineTest, method verifyValues.
/**
 * Asserts that the given subject/property has at least one value and that
 * every value is a BlankNodeOrIRI contained in {@code references}, each
 * reference matching at most once.
 * NOTE(review): references that never occur in the metadata are NOT flagged
 * here - callers must inspect the returned set if full coverage is required.
 *
 * @return the set of references actually found in the metadata
 */
private static Set<BlankNodeOrIRI> verifyValues(ContentItem ci, BlankNodeOrIRI subject, IRI property, BlankNodeOrIRI... references) {
    Iterator<Triple> values = ci.getMetadata().filter(subject, property, null);
    assertTrue(values.hasNext());
    Set<BlankNodeOrIRI> remaining = new HashSet<BlankNodeOrIRI>(Arrays.asList(references));
    Set<BlankNodeOrIRI> matched = new HashSet<BlankNodeOrIRI>(remaining.size());
    while (values.hasNext()) {
        RDFTerm value = values.next().getObject();
        assertTrue(value instanceof BlankNodeOrIRI);
        //every value must be one of the (not yet consumed) expected references
        assertTrue(remaining.remove(value));
        matched.add((BlankNodeOrIRI) value);
    }
    return matched;
}
Use of org.apache.clerezza.commons.rdf.BlankNodeOrIRI in the Apache Stanbol project.
Snippet from the class TikaEngineTest, method testMp3.
@Test
public void testMp3() throws EngineException, IOException, ParseException {
log.info(">>> testMp3 <<<");
ContentItem ci = createContentItem("testMP3id3v24.mp3", "audio/mpeg");
assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
engine.computeEnhancements(ci);
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
assertNotNull(contentPart);
Blob plainTextBlob = contentPart.getValue();
assertNotNull(plainTextBlob);
assertContentRegexp(plainTextBlob, "Test Title", "Test Artist", "Test Album");
//validate XHTML results
contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
assertNotNull(contentPart);
Blob xhtmlBlob = contentPart.getValue();
assertNotNull(xhtmlBlob);
//Test AudioTrack metadata
BlankNodeOrIRI audioTrack = verifyBlankNodeOrIRI(ci, new IRI(NamespaceEnum.media + "hasTrack"));
//types
verifyValues(ci, audioTrack, RDF.type, new IRI(NamespaceEnum.media + "MediaFragment"), new IRI(NamespaceEnum.media + "Track"), new IRI(NamespaceEnum.media + "AudioTrack"));
//properties
verifyValue(ci, audioTrack, new IRI(NamespaceEnum.media + "hasFormat"), XSD.string, "Mono");
verifyValue(ci, audioTrack, new IRI(NamespaceEnum.media + "samplingRate"), XSD.int_, "44100");
verifyValue(ci, audioTrack, new IRI(NamespaceEnum.media + "hasCompression"), XSD.string, "MP3");
}
Use of org.apache.clerezza.commons.rdf.BlankNodeOrIRI in the Apache Stanbol project.
Snippet from the class TikaEngineTest, method verifyValue.
/**
 * Asserts that exactly one value exists for the given subject/property, that
 * it is an IRI and that it equals {@code value}.
 *
 * @return the verified value, cast to IRI
 */
private static IRI verifyValue(ContentItem ci, BlankNodeOrIRI subject, IRI property, IRI value) {
    Iterator<Triple> matches = ci.getMetadata().filter(subject, property, null);
    assertTrue(matches.hasNext());
    RDFTerm object = matches.next().getObject();
    //exactly one value is allowed
    assertFalse(matches.hasNext());
    assertTrue(object instanceof IRI);
    assertEquals(value, object);
    return (IRI) object;
}
Aggregations