use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class CeliNamedEntityExtractionEnhancementEngine method computeEnhancements.
/**
 * Extracts named entities from the text of the parsed content item by calling
 * the CELI client and adds one text annotation per returned entity to the
 * metadata of the content item.
 * <p>
 * Content items whose text is empty (after trimming) are skipped with an
 * info-level log message.
 *
 * @param ci the content item to process
 * @throws EngineException if the call to the CELI service fails with an
 *         {@link IOException} or a {@link SOAPException}; an
 *         {@code InvalidContentException} is thrown if the text of the
 *         content part can not be read
 * @throws IllegalStateException if no supported content part or no language
 *         is available for the content item
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (textPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
                + "' found for ContentItem " + ci.getUri()
                + ": This is also checked in the canEnhance method! -> This "
                + "indicated an Bug in the implementation of the "
                + "EnhancementJobManager!");
    }

    final String text;
    try {
        text = ContentItemHelper.getText(textPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().isEmpty()) {
        log.info("No text contained in ContentPart {" + textPart.getKey()
                + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }

    String language = EnhancementEngineHelper.getLanguage(ci);
    if (language == null) {
        throw new IllegalStateException("Unable to extract Language for "
                + "ContentItem " + ci.getUri()
                + ": This is also checked in the canEnhance "
                + "method! -> This indicated an Bug in the implementation of the "
                + "EnhancementJobManager!");
    }
    // language tag attached to the plain literals of the text annotations
    Language lang = new Language(language);

    try {
        List<NamedEntity> entities = this.client.extractEntities(text, language);
        LiteralFactory literalFactory = LiteralFactory.getInstance();
        Graph metadata = ci.getMetadata();
        for (NamedEntity entity : entities) {
            try {
                IRI annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // the selected text is written as a plain literal in the detected language
                metadata.add(new TripleImpl(annotation, ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(entity.getFormKind(), lang)));
                metadata.add(new TripleImpl(annotation, DC_TYPE, getEntityRefForType(entity.type)));

                // position and context are only written when both offsets are known
                if (entity.getFrom() == null || entity.getTo() == null) {
                    continue;
                }
                int start = entity.getFrom().intValue();
                int end = entity.getTo().intValue();
                metadata.add(new TripleImpl(annotation, ENHANCER_START,
                        literalFactory.createTypedLiteral(start)));
                metadata.add(new TripleImpl(annotation, ENHANCER_END,
                        literalFactory.createTypedLiteral(end)));
                metadata.add(new TripleImpl(annotation, ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(getSelectionContext(text, entity.getFormKind(), start), lang)));
            } catch (NoConvertorException e) {
                // a failure for one entity is logged; the remaining entities are still processed
                log.error(e.getMessage(), e);
            }
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI NER (Named Entity Recognition)"
                + " service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error wile encoding/decoding the request/"
                + "response to the CELI NER (Named Entity Recognition) service!", e);
    }
}
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class SpotlightEngineUtils method createEntityAnnotation.
/**
 * Creates a fise:EntityAnnotation for the parsed candidate resource and
 * adds it to the {@link ContentItem#getMetadata()}.
 * <p>
 * This method assumes a write lock on the parsed content item.
 *
 * @param resource the candidate resource providing the label, URI and scores
 * @param engine the engine
 * @param ci the content item
 * @param textAnnotation the fise:TextAnnotation the created
 * fise:EntityAnnotation is dc:relation-linked to
 * @return the URI of the created fise:EntityAnnotation
 */
public static IRI createEntityAnnotation(CandidateResource resource, EnhancementEngine engine, ContentItem ci, IRI textAnnotation) {
    IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, engine);
    Graph metadata = ci.getMetadata();
    // the label is always written with the language tag "en"
    Literal entityLabel = new PlainLiteralImpl(resource.label, new Language("en"));

    // link to the text annotation and describe the referenced entity
    metadata.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, entityLabel));
    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, resource.getUri()));

    // score values of the candidate, each written as a typed literal
    metadata.add(new TripleImpl(entityAnnotation, PROPERTY_CONTEXTUAL_SCORE,
            literalFactory.createTypedLiteral(resource.contextualScore)));
    metadata.add(new TripleImpl(entityAnnotation, PROPERTY_PERCENTAGE_OF_SECOND_RANK,
            literalFactory.createTypedLiteral(resource.percentageOfSecondRank)));
    metadata.add(new TripleImpl(entityAnnotation, PROPERTY_SUPPORT,
            literalFactory.createTypedLiteral(resource.support)));
    metadata.add(new TripleImpl(entityAnnotation, PROPERTY_PRIOR_SCORE,
            literalFactory.createTypedLiteral(resource.priorScore)));
    metadata.add(new TripleImpl(entityAnnotation, PROPERTY_FINAL_SCORE,
            literalFactory.createTypedLiteral(resource.finalScore)));
    return entityAnnotation;
}
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class DBPSpotlightSpotEnhancementEngine method computeEnhancements.
/**
 * Calculate the enhancements by doing a POST request to the DBpedia
 * Spotlight endpoint and processing the results.
 * <p>
 * When the request returns a result, the enhancements are created while
 * holding the write lock of the content item. If debug logging is enabled
 * the metadata of the content item is additionally serialized as
 * "application/rdf+xml" and written to the debug log.
 *
 * @param ci
 *            the {@link ContentItem}
 * @throws EngineException
 *             declared by the signature; presumably raised by the request
 *             or by the creation of the enhancements (not visible here)
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    Language language = SpotlightEngineUtils.getContentLanguage(ci);
    String text = SpotlightEngineUtils.getPlainContent(ci);
    Collection<SurfaceForm> dbpslGraph = doPostRequest(text, ci.getUri());
    if (dbpslGraph != null) {
        // Acquire a write lock on the ContentItem when adding the
        // enhancements
        ci.getLock().writeLock().lock();
        try {
            createEnhancements(dbpslGraph, ci, text, language);
            if (log.isDebugEnabled()) {
                Serializer serializer = Serializer.getInstance();
                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
                try {
                    log.debug("DBpedia Spotlight Spot Enhancements:\n{}", debugStream.toString("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    // Report through the logger instead of printing the stack
                    // trace to stderr, so the failure ends up in the same log
                    // as the debug output it belongs to.
                    log.warn("Unable to decode the serialized enhancements as UTF-8 for debug logging", e);
                }
            }
        } finally {
            // always release the lock, even if creating the enhancements fails
            ci.getLock().writeLock().unlock();
        }
    }
}
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class ResultSetToXml method createValueElement.
/**
 * Creates the XML element representing a single RDF term.
 * <ul>
 * <li>an {@code IRI} becomes a {@code uri} element containing its unicode string</li>
 * <li>a {@code Literal} becomes a {@code literal} element containing the lexical
 * form, with a {@code datatype} attribute and, if the literal has a language,
 * an {@code xml:lang} attribute</li>
 * <li>any other term becomes a {@code bnode} element containing its
 * {@code toString()} value</li>
 * </ul>
 *
 * @param resource the RDF term to convert
 * @param doc the document used to create the element and its text node
 * @return the newly created element (not attached to the document)
 */
private Element createValueElement(RDFTerm resource, Document doc) {
    if (resource instanceof IRI) {
        Element uriElement = doc.createElement("uri");
        uriElement.appendChild(doc.createTextNode(((IRI) resource).getUnicodeString()));
        return uriElement;
    }
    if (resource instanceof Literal) {
        Literal literal = (Literal) resource;
        Element literalElement = doc.createElement("literal");
        literalElement.appendChild(doc.createTextNode(literal.getLexicalForm()));
        literalElement.setAttribute("datatype", literal.getDataType().getUnicodeString());
        Language language = literal.getLanguage();
        if (language != null) {
            literalElement.setAttribute("xml:lang", language.toString());
        }
        return literalElement;
    }
    // everything that is neither an IRI nor a Literal is written as a bnode
    Element bnodeElement = doc.createElement("bnode");
    bnodeElement.appendChild(doc.createTextNode(resource.toString()));
    return bnodeElement;
}
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class EnhancementEngineHelperTest method testTextAnnotationNewModel.
/**
 * Checks the properties written by
 * {@code EnhancementEngineHelper.setOccurrence(..)} for three cases: a short
 * selection, a long selection with the last argument set to {@code true}
 * (head/tail expected instead of the selected text) and the same long
 * selection with the last argument set to {@code false} (full selected text
 * expected, no head/tail).
 */
@Test
public void testTextAnnotationNewModel() {
    final Language language = new Language("en");
    final IRI contentItemUri = new IRI("http://www.example.org/contentItem#1");
    final Graph graph = new IndexedGraph();

    // --- case 1: short selection ---
    final String englishText = "The Stanbol Enhancer can extract Entities form parsed Text.";
    final int shortStart = englishText.indexOf("Stanbol");
    final int shortEnd = shortStart + "Stanbol Enhancer".length();
    IRI shortAnnotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, shortAnnotation, englishText, shortStart, shortEnd, language, -1, true);
    Assert.assertEquals("The ",
            EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals("Stanbol Enhancer",
            EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertEquals(" can extra",
            EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(shortStart),
            EnhancementEngineHelper.get(graph, shortAnnotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(shortEnd),
            EnhancementEngineHelper.get(graph, shortAnnotation, Properties.ENHANCER_END, Integer.class, lf));
    // neither head nor tail is expected for the short selection
    Assert.assertNull(EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(EnhancementEngineHelper.getString(graph, shortAnnotation, Properties.ENHANCER_SELECTION_TAIL));

    // --- case 2: long selection, last argument true ---
    final String germanText = "Ich habe den Schlüssel fürs Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss verlohren.";
    final int longStart = germanText.indexOf("Donaudampfschi");
    final int longEnd = germanText.indexOf(" verlohren");
    IRI headTailAnnotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, headTailAnnotation, germanText, longStart, longEnd, language, -1, true);
    Assert.assertEquals("ssel fürs ",
            EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals(" verlohren",
            EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(longStart),
            EnhancementEngineHelper.get(graph, headTailAnnotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(longEnd),
            EnhancementEngineHelper.get(graph, headTailAnnotation, Properties.ENHANCER_END, Integer.class, lf));
    // no selected text is expected here ...
    Assert.assertNull(EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    // ... but head and tail of the selection are
    Assert.assertEquals("Donaudampf",
            EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertEquals("tenschloss",
            EnhancementEngineHelper.getString(graph, headTailAnnotation, Properties.ENHANCER_SELECTION_TAIL));

    // --- case 3: same long selection, last argument false (head/tail deactivated) ---
    IRI fullTextAnnotation = EnhancementEngineHelper.createTextEnhancement(graph, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(graph, fullTextAnnotation, germanText, longStart, longEnd, language, -1, false);
    Assert.assertEquals("Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss",
            EnhancementEngineHelper.getString(graph, fullTextAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertNull(EnhancementEngineHelper.getString(graph, fullTextAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(EnhancementEngineHelper.getString(graph, fullTextAnnotation, Properties.ENHANCER_SELECTION_TAIL));
}
Aggregations