Use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
The class RestfulNlpAnalysisEngine, method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    //validate that the service is active
    checkRESTfulNlpAnalysisService();
    //get/create the AnalysedText
    final AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
    final Blob blob = at.getBlob();
    //send the text to the server
    final String language = getLanguage(this, ci, true);
    final HttpPost request = new HttpPost(analysisServiceUrl);
    request.addHeader(HttpHeaders.CONTENT_LANGUAGE, language);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(),
        ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    //execute the request
    try {
        AccessController.doPrivileged(new PrivilegedExceptionAction<AnalysedText>() {
            public AnalysedText run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new AnalysisResponseHandler(at));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof ClientProtocolException || e instanceof IOException) {
            //force re-initialisation upon error
            setRESTfulNlpAnalysisServiceUnavailable();
            throw new EngineException(this, ci, "Exception while executing Request "
                + "on RESTful NLP Analysis Service at " + analysisServiceUrl, e);
        } else {
            throw RuntimeException.class.cast(e);
        }
    }
    if (writeTextAnnotations) {
        //if enabled, fise:TextAnnotations are created for Named Entities and Sentiments
        double positiveSent = 0.0;
        int positiveCount = 0;
        double negativeSent = 0.0;
        int negativeCount = 0;
        int sentimentCount = 0;
        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
        Sentence context = null;
        Graph metadata = ci.getMetadata();
        Language lang = new Language(language);
        LiteralFactory lf = LiteralFactory.getInstance();
        ci.getLock().writeLock().lock();
        try {
            //write TextAnnotations for Named Entities
            while (spans.hasNext()) {
                Span span = spans.next();
                switch (span.getType()) {
                    case Sentence:
                        context = (Sentence) span;
                        //fall through intended!
                    default:
                        Value<NerTag> nerAnno = span.getAnnotation(NER_ANNOTATION);
                        if (nerAnno != null) {
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            //add span related data
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT,
                                new PlainLiteralImpl(span.getSpan(), lang)));
                            metadata.add(new TripleImpl(ta, ENHANCER_START,
                                lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END,
                                lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT,
                                new PlainLiteralImpl(context == null
                                    ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart())
                                    : context.getSpan(), lang)));
                            //add the NER type
                            if (nerAnno.value().getType() != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, nerAnno.value().getType()));
                            }
                            if (nerAnno.probability() >= 0) {
                                metadata.add(new TripleImpl(ta, ENHANCER_CONFIDENCE,
                                    lf.createTypedLiteral(nerAnno.probability())));
                            }
                        }
                        Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                        if (sentimentAnnotation != null) {
                            //this span has a sentiment assigned
                            Double sentiment = sentimentAnnotation.value();
                            //create a fise:TextAnnotation for the sentiment
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            metadata.add(new TripleImpl(ta, ENHANCER_START,
                                lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END,
                                lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY,
                                lf.createTypedLiteral(sentiment)));
                            //add the generic dc:type used for all sentiment annotations
                            metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                            //determine the specific dc:type for the sentiment annotation
                            IRI ssoType = NIFHelper.SPAN_TYPE_TO_SSO_TYPE.get(span.getType());
                            if (ssoType != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, ssoType));
                            }
                            //keep statistics for the overall sentiment of the document
                            sentimentCount++;
                            if (sentiment > 0) {
                                positiveSent += sentiment;
                                positiveCount++;
                            } else if (sentiment < 0) {
                                negativeSent += sentiment;
                                negativeCount++;
                            }
                        }
                        break;
                }
            }
            //add the annotation for the overall sentiment of the document
            if (sentimentCount > 0) {
                IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                //calculate the average sentiment for the document
                //TODO: think about a better way to calculate a general sentiment value for a document
                metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY,
                    lf.createTypedLiteral((positiveSent + negativeSent) / sentimentCount)));
                if (positiveCount > 0) {
                    //average positive sentiment of the document
                    metadata.add(new TripleImpl(ta, POSITIVE_SENTIMENT_PROPERTY,
                        lf.createTypedLiteral(positiveSent / positiveCount)));
                }
                if (negativeCount > 0) {
                    //average negative sentiment of the document
                    metadata.add(new TripleImpl(ta, NEGATIVE_SENTIMENT_PROPERTY,
                        lf.createTypedLiteral(negativeSent / negativeCount)));
                }
                metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                metadata.add(new TripleImpl(ta, DC_TYPE, DOCUMENT_SENTIMENT_TYPE));
            } //no sentiment annotation present ... nothing to do
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } //else do not write fise:TextAnnotations
}
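The pattern to note here is the pairing of PlainLiteralImpl with a Language, so that fise:selected-text and fise:selection-context are written as language-tagged literals. A minimal standalone sketch of that pattern (the graph, subject IRI and literal value below are illustrative, not taken from the engine):

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Language;
import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.commons.rdf.impl.utils.TripleImpl;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;

public class SelectedTextExample {
    public static void main(String[] args) {
        Graph metadata = new SimpleGraph();
        //hypothetical annotation subject; the engine uses EnhancementEngineHelper.createTextEnhancement
        IRI ta = new IRI("urn:example:text-annotation");
        //assumed fise namespace; the engine references the ENHANCER_SELECTED_TEXT constant instead
        IRI selectedText = new IRI("http://fise.iks-project.eu/ontology/selected-text");
        //language-tagged literal, as written for fise:selected-text above
        metadata.add(new TripleImpl(ta, selectedText,
                new PlainLiteralImpl("Paris", new Language("en"))));
        System.out.println(metadata.size()); //prints 1
    }
}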
Use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
The class TikaEngine, method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        //unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        //no need to process plain text!
        return;
    }
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        final InputStream in;
        if (mtas.in == null) {
            in = ci.getStream();
        } else {
            in = mtas.in;
        }
        final Metadata metadata = new Metadata();
        //set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        //also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            //close the input stream
            IOUtils.closeQuietly(in);
            throw new EngineException("Error while initialising Blob for "
                + "writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        final ContentHandler textHandler = new BodyContentHandler( //only the body
            new PlainTextHandler(plainTextWriter, false, skipLinebreaks)); //skip ignorable whitespace
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                //do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    throw new EngineException("Error while initialising Blob for "
                        + "writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                xhtmlSink = null;
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                    public Object run() throws IOException, SAXException, TikaException {
                        /*
                         * We need to replace the context ClassLoader with the Bundle
                         * ClassLoader to ensure that singleton instances of XML
                         * frameworks (such as node4j) do not leak into the OSGi
                         * environment.
                         *
                         * Most Java XML libs prefer to load implementations via the
                         * {@link Thread#getContextClassLoader()}. However, OSGi has no
                         * control over this {@link ClassLoader}, so interfaces may be
                         * loaded via the Bundle ClassLoader while implementations are
                         * taken from the context ClassLoader. This can cause
                         * {@link ClassCastException}s, {@link ExceptionInInitializerError}s, ...
                         *
                         * Setting the context ClassLoader to the Bundle ClassLoader
                         * avoids those situations.
                         */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            //reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri()
                        + " with mimeType '" + ci.getMimeType() + "' to plain text!", e);
                } else {
                    //runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            //ensure that the writers are closed correctly
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        //add the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    //only include properties with a namespace prefix (':') unless all are included
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            //      information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } //else not supported format
}
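The helper updateContextClassLoader() called inside the privileged block is not shown in this snippet. A plausible sketch, assuming it swaps in the ClassLoader of the engine's bundle and returns the previous one so the finally block can restore it (the actual Stanbol implementation may differ):

//sketch only, not the actual Stanbol helper
private ClassLoader updateContextClassLoader() {
    Thread current = Thread.currentThread();
    ClassLoader previous = current.getContextClassLoader();
    //use the ClassLoader that loaded this (bundle) class
    current.setContextClassLoader(TikaEngine.class.getClassLoader());
    return previous;
}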
Use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
The class RdfRepresentationTest, method testPlainLiteralToTextConversion.
/*--------------------------------------------------------------------------
 * Additional tests for special features of the Clerezza based implementation
 *
 * This mainly covers support for additional types like PlainLiteral,
 * TypedLiteral and IRIs: the conversion to such types as well as getters
 * for such types.
 *--------------------------------------------------------------------------
 */
/**
 * {@link PlainLiteral} is used for natural language text in the Clerezza
 * RDF API. This tests whether {@link PlainLiteral}s added via the
 * {@link Representation#add(String, Object)} method become available as
 * {@link Text} instances through the {@link Representation} API (e.g.
 * {@link Representation#get(String, String...)}).
 */
@Test
public void testPlainLiteralToTextConversion() {
    String field = "urn:test.RdfRepresentation:test.field";
    Literal noLangLiteral = new PlainLiteralImpl("A plain literal without Language");
    Literal enLiteral = new PlainLiteralImpl("An english literal", new Language("en"));
    Literal deLiteral = new PlainLiteralImpl("Ein Deutsches Literal", new Language("de"));
    Literal deATLiteral = new PlainLiteralImpl("Ein Topfen Verband hilft bei Zerrungen", new Language("de-AT"));
    Collection<Literal> plainLiterals = Arrays.asList(noLangLiteral, enLiteral, deLiteral, deATLiteral);
    Representation rep = createRepresentation(null);
    rep.add(field, plainLiterals);
    //now test that the plain literals are available as natural language
    //texts via the Representation interface!
    //1) one without a language
    Iterator<Text> noLanguageTexts = rep.get(field, (String) null);
    assertTrue(noLanguageTexts.hasNext());
    Text noLanguageText = noLanguageTexts.next();
    assertEquals(noLangLiteral.getLexicalForm(), noLanguageText.getText());
    assertNull(noLanguageText.getLanguage());
    //only a single result
    assertFalse(noLanguageTexts.hasNext());
    //2) one with a language
    Iterator<Text> enLanguageTexts = rep.get(field, "en");
    assertTrue(enLanguageTexts.hasNext());
    Text enLanguageText = enLanguageTexts.next();
    assertEquals(enLiteral.getLexicalForm(), enLanguageText.getText());
    assertEquals(enLiteral.getLanguage().toString(), enLanguageText.getLanguage());
    //only a single result
    assertFalse(enLanguageTexts.hasNext());
    //3) test to get all natural language values
    Set<String> stringValues = new HashSet<String>();
    for (Literal plainLiteral : plainLiterals) {
        stringValues.add(plainLiteral.getLexicalForm());
    }
    Iterator<Text> texts = rep.getText(field);
    while (texts.hasNext()) {
        assertTrue(stringValues.remove(texts.next().getText()));
    }
    assertTrue(stringValues.isEmpty());
}
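The get(String, String...) variant used above filters by language: passing (String) null selects only values without a language tag, while one or more language codes restrict the result to those languages. A short recap using the field and Representation from the test:

Iterator<Text> germanTexts = rep.get(field, "de", "de-AT"); //both German literals added above
Iterator<Text> allTexts = rep.getText(field); //all natural language values, regardless of language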
Use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
The class StringAtom, method adapt.
@SuppressWarnings("unchecked")
@Override
public <T> T adapt(RuleAtom ruleAtom) throws RuleAtomCallExeption {
    org.apache.stanbol.rules.manager.atoms.StringAtom tmp =
        (org.apache.stanbol.rules.manager.atoms.StringAtom) ruleAtom;
    String string = tmp.getString();
    Expression exp = null;
    if (string.startsWith("\"") && string.endsWith("\"")) {
        string = string.substring(1, string.length() - 1);
    }
    if (string.startsWith(Symbols.variablesPrefix)) {
        exp = new Variable(string.replace(Symbols.variablesPrefix, ""));
    } else {
        exp = new LiteralExpression(new PlainLiteralImpl(string));
    }
    return (T) new ClerezzaSparqlObject(exp);
}
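The dispatch above is simple: quoted strings become SPARQL literal expressions backed by PlainLiteralImpl, while names carrying Symbols.variablesPrefix become SPARQL variables. A minimal standalone sketch of the same logic (the method name and prefix parameter are illustrative; Expression, Variable and LiteralExpression are assumed to come from the Clerezza SPARQL query API used by the adapter):

import org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.sparql.query.Expression;
import org.apache.clerezza.rdf.core.sparql.query.LiteralExpression;
import org.apache.clerezza.rdf.core.sparql.query.Variable;

static Expression toExpression(String s, String variablesPrefix) {
    if (s.startsWith("\"") && s.endsWith("\"")) {
        //strip the surrounding quotes
        s = s.substring(1, s.length() - 1);
    }
    if (s.startsWith(variablesPrefix)) {
        return new Variable(s.replace(variablesPrefix, ""));
    }
    return new LiteralExpression(new PlainLiteralImpl(s));
}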
Use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
The class ClerezzaTripleCallback, method triple.
private void triple(String s, String p, String value, String datatype, String language, String graph) {
    final BlankNodeOrIRI subject = getBlankNodeOrIRI(s);
    final IRI predicate = new IRI(p);
    RDFTerm object;
    if (language != null) {
        object = new PlainLiteralImpl(value, new Language(language));
    } else if (datatype == null || RDF_LANG_STRING.equals(datatype)) {
        object = new PlainLiteralImpl(value);
    } else {
        object = new TypedLiteralImpl(value, new IRI(datatype));
    }
    mGraph.add(new TripleImpl(subject, predicate, object));
}
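Depending on the parsed language and datatype, the callback produces one of three literal shapes (the values below are illustrative):

RDFTerm tagged = new PlainLiteralImpl("chat", new Language("fr")); //language tag present
RDFTerm plain = new PlainLiteralImpl("chat"); //no datatype, or rdf:langString without a language
RDFTerm typed = new TypedLiteralImpl("42",
        new IRI("http://www.w3.org/2001/XMLSchema#integer")); //explicit datatype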