use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class EntityLinkingEngine method writeEnhancements.
/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem.
 * @param ci the ContentItem to write the enhancements to
 * @param linkedEntities the linked entities extracted from the content
 * @param language the language of the analysed text
 * @param writeRankings if <code>true</code>, entity rankings of suggestions are written as ENHANCER_ENTITY_RANKING values
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language, boolean writeRankings) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Set<IRI> dereferencedEntities = new HashSet<IRI>();
    Graph metadata = ci.getMetadata();
    for (LinkedEntity linkedEntity : linkedEntities) {
        Collection<IRI> textAnnotations = new ArrayList<IRI>(linkedEntity.getOccurrences().size());
        // first create the TextAnnotations for the Occurrences
        for (Occurrence occurrence : linkedEntity.getOccurrences()) {
            Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
            Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
            // search for an existing text annotation
            Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
            IRI textAnnotation = null;
            while (it.hasNext()) {
                Triple t = it.next();
                if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext()
                        && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                    textAnnotation = (IRI) t.getSubject();
                    break;
                }
            }
            if (textAnnotation == null) {
                // not found ... create a new one
                textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT,
                        new PlainLiteralImpl(occurrence.getContext(), languageObject)));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(occurrence.getSelectedText(), languageObject)));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE,
                        literalFactory.createTypedLiteral(linkedEntity.getScore())));
            } else {
                // if it already exists, add this engine as contributor
                metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR,
                        new PlainLiteralImpl(this.getClass().getName())));
            }
            // add dc:type values (also to existing annotations)
            for (IRI dcType : linkedEntity.getTypes()) {
                metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
            }
            textAnnotations.add(textAnnotation);
        }
        // now the EntityAnnotations for the Suggestions
        for (Suggestion suggestion : linkedEntity.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // should we use the label used for the match, or search the
            // representation for the best label ... currently it is the matched one
            Literal label = suggestion.getBestLabel(linkerConfig.getNameField(), language);
            Entity entity = suggestion.getEntity();
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, label));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entity.getUri()));
            Iterator<IRI> suggestionTypes = entity.getReferences(linkerConfig.getTypeField());
            while (suggestionTypes.hasNext()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, suggestionTypes.next()));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE,
                    literalFactory.createTypedLiteral(suggestion.getScore())));
            for (IRI textAnnotation : textAnnotations) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            }
            // add origin information from the EntitySearcher
            for (Entry<IRI, Collection<RDFTerm>> originInfo : entitySearcher.getOriginInformation().entrySet()) {
                for (RDFTerm value : originInfo.getValue()) {
                    metadata.add(new TripleImpl(entityAnnotation, originInfo.getKey(), value));
                }
            }
            if (writeRankings) {
                Float ranking = suggestion.getEntity().getEntityRanking();
                if (ranking != null) {
                    // write the float value as xsd:double
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING,
                            new TypedLiteralImpl(ranking.toString(), XSD_DOUBLE)));
                }
            }
            // add the RDF data for dereferenced entities
            if (linkerConfig.isDereferenceEntitiesEnabled() && dereferencedEntities.add(entity.getUri())) {
                // NOTE: do not add all triples, as there might be other data in the graph
                Iterator<Triple> triples = entity.getData().filter(entity.getUri(), null, null);
                while (triples.hasNext()) {
                    metadata.add(triples.next());
                }
            }
        }
    }
}
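The pattern all of these snippets share is small: a Language instance turns a PlainLiteralImpl into a language-tagged literal, while typed literals come from the LiteralFactory. A minimal standalone sketch of that distinction, assuming an available Graph named graph (e.g. the ContentItem metadata) and the Clerezza classes already used above; the subject and property IRIs are made up for illustration:

    // hedged sketch: language-tagged vs. typed literals in Clerezza
    Language de = new Language("de");
    IRI subject = new IRI("urn:example:annotation"); // hypothetical subject
    // "Berlin"@de - a plain literal carrying a language tag
    graph.add(new TripleImpl(subject, new IRI("urn:example:label"),
            new PlainLiteralImpl("Berlin", de)));
    // 42^^xsd:int - a typed literal created via the LiteralFactory
    graph.add(new TripleImpl(subject, new IRI("urn:example:start"),
            LiteralFactory.getInstance().createTypedLiteral(42)));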
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class DBPSpotlightCandidatesEnhancementEngine method computeEnhancements.
/**
* Calculate the enhancements by doing a POST request to the DBpedia
* Spotlight endpoint and processing the results
*
* @param ci
* the {@link ContentItem}
*/
public void computeEnhancements(ContentItem ci) throws EngineException {
    Language language = SpotlightEngineUtils.getContentLanguage(ci);
    String text = SpotlightEngineUtils.getPlainContent(ci);
    Collection<SurfaceForm> dbpslGraph = doPostRequest(text, ci.getUri());
    if (dbpslGraph != null) {
        // acquire a write lock on the ContentItem when adding the enhancements
        ci.getLock().writeLock().lock();
        try {
            createEnhancements(dbpslGraph, ci, text, language);
            if (log.isDebugEnabled()) {
                Serializer serializer = Serializer.getInstance();
                ByteArrayOutputStream debugStream = new ByteArrayOutputStream();
                serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml");
                try {
                    log.debug("DBpedia Spotlight Spot Enhancements:\n{}", debugStream.toString("UTF-8"));
                } catch (UnsupportedEncodingException e) {
                    e.printStackTrace();
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
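The write-lock idiom above is the general contract for engines that modify ContentItem metadata: other components may read the metadata concurrently, so every write has to be guarded. Reduced to its core, the pattern looks like this (a sketch assuming an available ContentItem ci and an already prepared Triple):

    ci.getLock().writeLock().lock();
    try {
        ci.getMetadata().add(triple); // all metadata writes happen inside the guarded section
    } finally {
        ci.getLock().writeLock().unlock(); // released even if the write fails
    }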
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class CeliLemmatizerEnhancementEngine method addLemmatizationEnhancement.
private void addLemmatizationEnhancement(ContentItem ci, String text, String language, Graph g) throws EngineException {
    // clerezza language for PlainLiterals
    Language lang = new Language(language);
    String lemmatizedContents;
    try {
        lemmatizedContents = this.client.lemmatizeContents(text, language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/response to the CELI lemmatizer service!", e);
    }
    // get a write lock before writing the enhancements
    ci.getLock().writeLock().lock();
    try {
        IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
        g.add(new TripleImpl(textEnhancement, CeliLemmatizerEnhancementEngine.hasLemmaForm,
                new PlainLiteralImpl(lemmatizedContents, lang)));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
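Note that, unlike writeEnhancements above, the two CELI methods construct new Language(language) without first checking the string. The EntityLinkingEngine snippet shows the defensive variant; reduced to one expression it would look like this (a sketch mirroring that guard, not code from the CELI engine itself):

    // only create a Language when a usable tag is present;
    // PlainLiteralImpl accepts a null Language and then creates an untagged literal
    Language lang = (language != null && !language.isEmpty()) ? new Language(language) : null;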
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class CeliLemmatizerEnhancementEngine method addMorphoAnalysisEnhancement.
private void addMorphoAnalysisEnhancement(ContentItem ci, String text, String language, Graph g) throws EngineException {
    // clerezza language for PlainLiterals
    Language lang = new Language(language);
    List<LexicalEntry> terms;
    try {
        terms = this.client.performMorfologicalAnalysis(text, language);
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI Lemmatizer service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/response to the CELI lemmatizer service!", e);
    }
    // get a write lock before writing the enhancements
    ci.getLock().writeLock().lock();
    try {
        LiteralFactory literalFactory = LiteralFactory.getInstance();
        for (LexicalEntry le : terms) {
            List<CeliMorphoFeatures> mFeatures = this.convertLexicalEntryToMorphFeatures(le, language);
            for (CeliMorphoFeatures feat : mFeatures) {
                // create a text annotation for each interpretation produced by the morphological analyzer
                IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(le.getWordForm(), lang)));
                if (le.from >= 0 && le.to > 0) {
                    g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(le.from)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(le.to)));
                    g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                            new PlainLiteralImpl(getSelectionContext(text, le.getWordForm(), le.from), lang)));
                }
                g.addAll(feat.featuresAsTriples(textAnnotation, lang));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class RestfulNlpAnalysisEngine method computeEnhancements.
/**
 * Compute enhancements for the supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method obtains the AnalysedText content part of the item, sends its text to the
 * configured RESTful NLP Analysis service and stores the analysis results with the content
 * item. fise:TextAnnotations for Named Entities and Sentiments are only written to the
 * metadata if writeTextAnnotations is enabled.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *             if the underlying process failed to work as expected
 */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    // validate that the service is active
    checkRESTfulNlpAnalysisService();
    // get/create the AnalysedText
    final AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
    final Blob blob = at.getBlob();
    // send the text to the server
    final String language = getLanguage(this, ci, true);
    final HttpPost request = new HttpPost(analysisServiceUrl);
    request.addHeader(HttpHeaders.CONTENT_LANGUAGE, language);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(),
            ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    // execute the request
    try {
        AccessController.doPrivileged(new PrivilegedExceptionAction<AnalysedText>() {
            public AnalysedText run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new AnalysisResponseHandler(at));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof ClientProtocolException || e instanceof IOException) {
            // force re-initialisation upon error
            setRESTfulNlpAnalysisServiceUnavailable();
            throw new EngineException(this, ci, "Exception while executing Request "
                    + "on RESTful NLP Analysis Service at " + analysisServiceUrl, e);
        } else {
            throw RuntimeException.class.cast(e);
        }
    }
    if (writeTextAnnotations) {
        // if enabled, fise:TextAnnotations are created for Named Entities and Sentiments
        double positiveSent = 0.0;
        int positiveCount = 0;
        double negativeSent = 0.0;
        int negativeCount = 0;
        int sentimentCount = 0;
        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
        Sentence context = null;
        Graph metadata = ci.getMetadata();
        Language lang = new Language(language);
        LiteralFactory lf = LiteralFactory.getInstance();
        ci.getLock().writeLock().lock();
        try {
            // write TextAnnotations for Named Entities
            while (spans.hasNext()) {
                Span span = spans.next();
                switch (span.getType()) {
                    case Sentence:
                        context = (Sentence) span;
                        // fall-through intended!
                    default:
                        Value<NerTag> nerAnno = span.getAnnotation(NER_ANNOTATION);
                        if (nerAnno != null) {
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            // add span related data
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT,
                                    new PlainLiteralImpl(span.getSpan(), lang)));
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                                    new PlainLiteralImpl(context == null
                                            ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart())
                                            : context.getSpan(), lang)));
                            // add the NER type
                            if (nerAnno.value().getType() != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, nerAnno.value().getType()));
                            }
                            if (nerAnno.probability() >= 0) {
                                metadata.add(new TripleImpl(ta, ENHANCER_CONFIDENCE, lf.createTypedLiteral(nerAnno.probability())));
                            }
                        }
                        Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                        if (sentimentAnnotation != null) {
                            // this span has a sentiment assigned
                            Double sentiment = sentimentAnnotation.value();
                            // create a fise:TextAnnotation for the sentiment
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment)));
                            // add the generic dc:type used for all sentiment annotations
                            metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                            // determine the specific dc:type for the sentiment annotation
                            IRI ssoType = NIFHelper.SPAN_TYPE_TO_SSO_TYPE.get(span.getType());
                            if (ssoType != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, ssoType));
                            }
                            // keep statistics for the overall sentiment of the document
                            sentimentCount++;
                            if (sentiment > 0) {
                                positiveSent += sentiment;
                                positiveCount++;
                            } else if (sentiment < 0) {
                                negativeSent += sentiment;
                                negativeCount++;
                            }
                        }
                        break;
                }
            }
            // add the annotation for the overall sentiment of the document
            if (sentimentCount > 0) {
                IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                // calculate the average sentiment for the document
                // TODO: think of a better way to calculate an overall sentiment value for a document
                metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY,
                        lf.createTypedLiteral((positiveSent + negativeSent) / sentimentCount)));
                if (positiveCount > 0) {
                    // average positive sentiment of the document
                    metadata.add(new TripleImpl(ta, POSITIVE_SENTIMENT_PROPERTY,
                            lf.createTypedLiteral(positiveSent / positiveCount)));
                }
                if (negativeCount > 0) {
                    // average negative sentiment of the document
                    metadata.add(new TripleImpl(ta, NEGATIVE_SENTIMENT_PROPERTY,
                            lf.createTypedLiteral(negativeSent / negativeCount)));
                }
                metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                metadata.add(new TripleImpl(ta, DC_TYPE, DOCUMENT_SENTIMENT_TYPE));
            }
            // no sentiment annotations present ... nothing else to do
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
    // else do not write fise:TextAnnotations
}
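To make the averaging concrete: if three spans carry the sentiments +0.8, +0.4 and -0.6, the document-level annotation gets (0.8 + 0.4 - 0.6) / 3 = 0.2 as its overall sentiment, 1.2 / 2 = 0.6 as the positive average, and -0.6 / 1 = -0.6 as the negative average.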