use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class LanguageDetectionEnhancementEngine method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
if (contentPart == null) {
throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
}
String text = "";
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(this, ci, e);
}
//do not call trim() on long texts to check if the text is empty
if (text.length() < 50 && text.trim().length() == 0) {
log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
return;
}
// truncate text to some piece from the middle if probeLength > 0
int checkLength = probeLength;
if (checkLength > 0 && text.length() > checkLength) {
text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
}
List<Language> languages = null;
try {
languages = languageIdentifier.getLanguages(text);
log.debug("language identified: {}", languages);
} catch (LangDetectException e) {
Enum<?> errorCode = e.getCode();
//ignore " 0 - NoTextError" and "5 - CantDetectError"
if (errorCode.ordinal() != 0 && errorCode.ordinal() != 5) {
StringBuilder msg = new StringBuilder("Could not identify language of text: ");
if (text.length() < 200) {
msg.append(text);
} else {
msg.append(text.subSequence(0, 199)).append("...");
}
msg.append(" (Error Code: ").append(errorCode.ordinal()).append(" - ").append(errorCode.name()).append(")");
throw new EngineException(this, ci, msg.toString(), e);
} else {
log.debug("No text to detect the language from present in ContentItem ", ci);
}
}
// add language to metadata
if (languages != null) {
Graph g = ci.getMetadata();
ci.getLock().writeLock().lock();
try {
for (int i = 0; i < maxSuggestedLanguages && i < languages.size(); i++) {
// add a hypothesis
Language hypothesis = languages.get(i);
IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
}
} finally {
ci.getLock().writeLock().unlock();
}
}
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class RdfSerializingWriter method getRecipe.
private GraphNode getRecipe(String templatePath) {
Graph rg = recipesGraphProvider.getRecipesGraph();
GraphNode literalNode = new GraphNode(new PlainLiteralImpl(templatePath), rg);
Iterator<GraphNode> recipes = literalNode.getSubjectNodes(RECIPES.recipeDomain);
if (recipes.hasNext()) {
return recipes.next();
} else {
return null;
}
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class TopicClassificationEngine method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
if (contentPart == null) {
throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
}
String language = EnhancementEngineHelper.getLanguage(ci);
if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
}
String text;
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(String.format("Unable to extract " + " textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
}
if (text.trim().isEmpty()) {
log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
return;
}
Graph metadata = ci.getMetadata();
List<TopicSuggestion> topics;
try {
topics = suggestTopics(text);
if (topics.isEmpty()) {
return;
}
} catch (ClassifierException e) {
throw new EngineException(e);
}
IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
LiteralFactory lf = LiteralFactory.getInstance();
ci.getLock().writeLock().lock();
try {
// Global text annotation to attach all the topic annotation to it.
IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
for (TopicSuggestion topic : topics) {
IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
// add link to entity
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
// add confidence information
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
// add performance estimates of the classifier if available
ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
if (perf.uptodate) {
metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
}
// fetch concept label from the entityhub or a referenced site if available
Entity entity = entityhub.getEntity(topic.conceptUri);
if (entity == null) {
entity = referencedSiteManager.getEntity(topic.conceptUri);
}
if (entity != null) {
Representation representation = entity.getRepresentation();
// TODO: extract all languages based on some configuration instead of hardcoding English
Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
if (label == null) {
label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
}
if (label != null) {
metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
}
}
}
} catch (ClassifierException e) {
throw new EngineException(e);
} catch (IllegalArgumentException e) {
throw new EngineException(e);
} catch (EntityhubException e) {
throw new EngineException(e);
} finally {
ci.getLock().writeLock().unlock();
}
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class Mapping method toResource.
/**
* Converts the parsed value based on the mapping information to an RDF
* {@link RDFTerm}. Optionally supports also validation if the parsed
* value is valid for the {@link Mapping#ontType ontology type} specified by
* the parsed mapping.
* @param value the value
* @param mapping the mapping
* @param validate
* @return the {@link RDFTerm} or <code>null</code> if the parsed value is
* <code>null</code> or {@link String#isEmpty() empty}.
* @throws IllegalArgumentException if the parsed {@link Mapping} is
* <code>null</code>
*/
protected RDFTerm toResource(String value, boolean validate) {
//used for date validation
Metadata dummy = null;
if (value == null || value.isEmpty()) {
//ignore null and empty values
return null;
}
RDFTerm object;
if (ontType == null) {
object = new PlainLiteralImpl(value);
} else if (ontType == RDFS.Resource) {
try {
if (validate) {
new URI(value);
}
object = new IRI(value);
} catch (URISyntaxException e) {
log.warn("Unable to create Reference for value {} (not a valid URI)" + " -> create a literal instead", value);
object = new PlainLiteralImpl(value);
}
} else {
//typed literal
Class<?> clazz = Mapping.ONT_TYPE_MAP.get(ontType);
if (clazz.equals(Date.class)) {
//parseDate(..) method
if (dummy == null) {
dummy = new Metadata();
}
//any Property with the Date type could be used here
dummy.add(DATE.getName(), value);
//access parseDate(..)
Date date = dummy.getDate(DublinCore.DATE);
if (date != null) {
//now use the Clerezza Literal factory
object = lf.createTypedLiteral(date);
} else {
//fall back to xsd:string
object = new TypedLiteralImpl(value, XSD.string);
}
} else {
object = new TypedLiteralImpl(value, ontType);
}
if (validate && clazz != null && !clazz.equals(Date.class)) {
//we need not to validate dates
try {
lf.createObject(clazz, (Literal) object);
} catch (NoConvertorException e) {
log.info("Unable to validate typed literals of type {} because" + "there is no converter for Class {} registered with Clerezza", ontType, clazz);
} catch (InvalidLiteralTypeException e) {
log.info("The value '{}' is not valid for dataType {}!" + "create literal with type 'xsd:string' instead", value, ontType);
object = new TypedLiteralImpl(value, XSD.string);
}
}
//else no validation needed
}
if (converter != null) {
object = converter.convert(object);
}
return object;
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class RestfulLangidentEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
//get the plain text Blob
Map.Entry<IRI, Blob> textBlob = getPlainText(this, ci, false);
Blob blob = textBlob.getValue();
//send the text to the server
final HttpPost request = new HttpPost(serviceUrl);
request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
//execute the request
List<LangSuggestion> detected;
try {
detected = AccessController.doPrivileged(new PrivilegedExceptionAction<List<LangSuggestion>>() {
public List<LangSuggestion> run() throws ClientProtocolException, IOException {
return httpClient.execute(request, new LangIdentResponseHandler(ci, objectMapper));
}
});
} catch (PrivilegedActionException pae) {
Exception e = pae.getException();
if (e instanceof ClientProtocolException) {
throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
} else if (e instanceof IOException) {
throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
} else {
throw RuntimeException.class.cast(e);
}
}
Graph metadata = ci.getMetadata();
log.debug("Detected Languages for ContentItem {} and Blob {}");
ci.getLock().writeLock().lock();
try {
//write TextAnnotations for the detected languages
for (LangSuggestion suggestion : detected) {
// add a hypothesis
log.debug(" > {}@{}", suggestion.getLanguage(), suggestion.hasProbability() ? suggestion.getProbability() : "-,--");
IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(suggestion.getLanguage())));
metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
if (suggestion.hasProbability()) {
metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getProbability())));
}
}
} finally {
ci.getLock().writeLock().unlock();
}
}
Aggregations