use of org.apache.clerezza.commons.rdf.IRI in project stanbol by apache.
the class CeliLemmatizerEnhancementEngineTest method testEngine.
@Test
public void testEngine() throws Exception {
ContentItem ci = wrapAsContentItem(TEXT);
// add a simple triple to statically define the language of the test
// content
ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("it")));
// unit test should not depend on each other (if possible)
// CeliLanguageIdentifierEnhancementEngineTest.addEnanchements(ci);
CeliLemmatizerEnhancementEngine morphoAnalysisEngine = initEngine(false);
try {
morphoAnalysisEngine.computeEnhancements(ci);
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e);
return;
}
TestUtils.logEnhancements(ci);
// validate enhancement
HashMap<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(morphoAnalysisEngine.getClass().getName()));
Iterator<Triple> lemmaTextAnnotationIterator = ci.getMetadata().filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
assertTrue("A TextAnnotation is expected by this Test", lemmaTextAnnotationIterator.hasNext());
BlankNodeOrIRI lemmaTextAnnotation = lemmaTextAnnotationIterator.next().getSubject();
assertTrue("TextAnnoations MUST BE IRIs!", lemmaTextAnnotation instanceof IRI);
assertFalse("Only a single TextAnnotation is expected by this Test", lemmaTextAnnotationIterator.hasNext());
// validate the enhancement metadata
validateEnhancement(ci.getMetadata(), (IRI) lemmaTextAnnotation, expectedValues);
// validate the lemma form TextAnnotation
int lemmaForms = validateLemmaFormProperty(ci.getMetadata(), lemmaTextAnnotation, "it");
assertTrue("Only a single LemmaForm property is expected if '" + MORPHOLOGICAL_ANALYSIS + "=false'", lemmaForms == 1);
shutdownEngine(morphoAnalysisEngine);
}
use of org.apache.clerezza.commons.rdf.IRI in project stanbol by apache.
the class CeliClassificationEnhancementEngineTest method tesetEngine.
@Test
public void tesetEngine() throws Exception {
ContentItem ci = wrapAsContentItem(TEXT);
try {
// add a simple triple to statically define the language of the test
// content
ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("fr")));
// unit test should not depend on each other (if possible)
// CeliLanguageIdentifierEnhancementEngineTest.addEnanchements(ci);
classificationEngine.computeEnhancements(ci);
TestUtils.logEnhancements(ci);
HashMap<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(classificationEngine.getClass().getName()));
int textAnnoNum = EnhancementStructureHelper.validateAllTextAnnotations(ci.getMetadata(), TEXT, expectedValues);
assertEquals("Only a single fise:TextAnnotation is expeted", 1, textAnnoNum);
int numTopicAnnotations = validateAllTopicAnnotations(ci.getMetadata(), expectedValues);
assertTrue("No TpocisAnnotations found", numTopicAnnotations > 0);
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e);
return;
}
}
use of org.apache.clerezza.commons.rdf.IRI in project stanbol by apache.
the class CeliNamedEntityExtractionEnhancementEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
if (contentPart == null) {
throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
}
String text = "";
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(this, ci, e);
}
if (text.trim().length() == 0) {
log.info("No text contained in ContentPart {" + contentPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
return;
}
String language = EnhancementEngineHelper.getLanguage(ci);
if (language == null) {
throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
}
// used for the palin literals in TextAnnotations
Language lang = new Language(language);
try {
List<NamedEntity> lista = this.client.extractEntities(text, language);
LiteralFactory literalFactory = LiteralFactory.getInstance();
Graph g = ci.getMetadata();
for (NamedEntity ne : lista) {
try {
IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
// add selected text as PlainLiteral in the language extracted from the text
g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(ne.getFormKind(), lang)));
g.add(new TripleImpl(textAnnotation, DC_TYPE, getEntityRefForType(ne.type)));
if (ne.getFrom() != null && ne.getTo() != null) {
g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(ne.getFrom().intValue())));
g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(ne.getTo().intValue())));
g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, ne.getFormKind(), ne.getFrom().intValue()), lang)));
}
} catch (NoConvertorException e) {
log.error(e.getMessage(), e);
}
}
} catch (IOException e) {
throw new EngineException("Error while calling the CELI NER (Named Entity Recognition)" + " service (configured URL: " + serviceURL + ")!", e);
} catch (SOAPException e) {
throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI NER (Named Entity Recognition) service!", e);
}
}
use of org.apache.clerezza.commons.rdf.IRI in project stanbol by apache.
the class DBPSpotlightSpotEnhancementEngine method createEnhancements.
/**
* The method adds the returned DBpedia Spotlight surface forms to the
* content item's metadata. For each one an TextAnnotation is created.
*
* @param occs
* a Collection of entity information
* @param ci
* the content item
*/
protected void createEnhancements(Collection<SurfaceForm> occs, ContentItem ci, String content, Language lang) {
HashMap<String, IRI> entityAnnotationMap = new HashMap<String, IRI>();
Graph model = ci.getMetadata();
for (SurfaceForm occ : occs) {
IRI textAnnotation = SpotlightEngineUtils.createTextEnhancement(occ, this, ci, content, lang);
if (entityAnnotationMap.containsKey(occ.name)) {
model.add(new TripleImpl(entityAnnotationMap.get(occ.name), DC_RELATION, textAnnotation));
} else {
entityAnnotationMap.put(occ.name, textAnnotation);
}
}
}
use of org.apache.clerezza.commons.rdf.IRI in project stanbol by apache.
the class DereferenceContext method parseEntityReferences.
private void parseEntityReferences(Object value) throws DereferenceConfigurationException {
Collection<String> entityRefProps;
try {
entityRefProps = EnhancementEngineHelper.parseConfigValues(value, String.class);
} catch (IllegalStateException e) {
throw new DereferenceConfigurationException(e, engine.getDereferencer().getClass(), DereferenceConstants.ENTITY_REFERENCES);
}
// start with the references present in the config
this.entityReferences = new HashSet<IRI>(getConfig().getEntityReferences());
if (entityRefProps != null && !entityRefProps.isEmpty()) {
NamespacePrefixService nps = engine.getConfig().getNsPrefixService();
for (String prop : entityRefProps) {
if (!StringUtils.isBlank(prop)) {
try {
entityReferences.add(new IRI(NamespaceMappingUtils.getConfiguredUri(nps, prop)));
} catch (IllegalArgumentException e) {
throw new DereferenceConfigurationException(e, engine.getDereferencer().getClass(), DereferenceConstants.ENTITY_REFERENCES);
}
}
}
}
}
Aggregations