Usage of org.apache.clerezza.commons.rdf.Graph in the Apache Stanbol project:
the computeEnhancements method of the CeliLanguageIdentifierEnhancementEngine class.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        // canEnhance() already verified a plain-text blob exists, so reaching this
        // point means the EnhancementJobManager violated its contract.
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
                + "' found for ContentItem " + ci.getUri()
                + ": This is also checked in the canEnhance method! -> This "
                + "indicates a bug in the implementation of the EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {" + contentPart.getKey()
                + "} of ContentItem {" + ci.getUri() + "}");
        return;
    }
    try {
        // Heuristic: short texts (<= 5 space-separated tokens) are treated as
        // queries and sent to the query-specific language guesser.
        String[] tmps = text.split(" ");
        List<GuessedLanguage> lista;
        if (tmps.length > 5) {
            lista = this.client.guessLanguage(text);
        } else {
            lista = this.client.guessQueryLanguage(text);
        }
        // Guard against the service returning no guesses at all; the previous
        // code called lista.get(0) unconditionally and could throw
        // IndexOutOfBoundsException (or NPE) here.
        if (lista == null || lista.isEmpty()) {
            log.info("CELI language identifier returned no guess for ContentItem {}", ci.getUri());
            return;
        }
        Graph g = ci.getMetadata();
        // In ENHANCE_ASYNC mode we need to use read/write locks on the ContentItem.
        ci.getLock().writeLock().lock();
        try {
            GuessedLanguage gl = lista.get(0);
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(gl.getLang())));
            g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE,
                    literalFactory.createTypedLiteral(gl.getConfidence())));
            g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } catch (IOException e) {
        throw new EngineException("Error while calling the CELI language"
                + " identifier service (configured URL: " + serviceURL + ")!", e);
    } catch (SOAPException e) {
        throw new EngineException("Error while encoding/decoding the request/"
                + "response to the CELI language identifier service!", e);
    }
}
Usage of org.apache.clerezza.commons.rdf.Graph in the Apache Stanbol project:
the createEnhancements method of the DBPSpotlightDisambiguateEnhancementEngine class.
/**
 * The method adds the returned DBpedia Spotlight annotations to the content
 * item's metadata. For each DBpedia resource an EntityAnnotation is created
 * and linked to the according TextAnnotation.
 *
 * @param occs
 *            a Collection of entity information
 * @param ci
 *            the content item
 * @param language
 *            the language used for the entity-label literals
 */
public void createEnhancements(Collection<Annotation> occs, ContentItem ci, Language language) {
    HashMap<RDFTerm, IRI> entityAnnotationMap = new HashMap<RDFTerm, IRI>();
    for (Annotation occ : occs) {
        // Single lookup instead of a contains-style get followed by a second get.
        IRI textAnnotation = textAnnotationsMap.get(occ.surfaceForm);
        if (textAnnotation == null) {
            // No TextAnnotation was created for this surface form; skip it.
            continue;
        }
        Graph model = ci.getMetadata();
        IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
        entityAnnotationMap.put(occ.uri, entityAnnotation);
        Literal label = new PlainLiteralImpl(occ.surfaceForm.name, language);
        model.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
        model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, label));
        Collection<String> typeNames = occ.getTypeNames();
        if (typeNames != null) {
            // Enhanced for-loop replaces the manual Iterator while-loop.
            for (String typeName : typeNames) {
                model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, new IRI(typeName)));
            }
        }
        model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, occ.uri));
    }
}
Usage of org.apache.clerezza.commons.rdf.Graph in the Apache Stanbol project:
the checkOntology method of the TestClerezzaInputSources class.
/**
 * Verifies that the loaded ontology source is in the expected state.
 *
 * @param usesTcProvider
 *            whether the source is expected to have an origin (i.e. it was
 *            loaded through a TcProvider)
 * @throws Exception
 *             if any check fails
 */
private void checkOntology(boolean usesTcProvider) throws Exception {
    assertNotNull(src);
    if (usesTcProvider) {
        assertNotNull(src.getOrigin());
    } else {
        assertNull(src.getOrigin());
    }
    Graph o = src.getRootOntology();
    assertNotNull(o);
    log.info("Ontology loaded, is a {}", o.getClass().getCanonicalName());
    // The owl:Ontology declaration and versionInfo also count as triples.
    // assertEquals compares values; the previous assertSame compared boxed
    // Integer identity and only passed thanks to the small-Integer cache.
    assertEquals(5, o.size());
}
Usage of org.apache.clerezza.commons.rdf.Graph in the Apache Stanbol project:
the computeEnhancements method of the HtmlExtractorEngine class.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
    Graph extracted = new SimpleGraph();
    // Hold the read lock only while consuming the ContentItem's stream.
    ci.getLock().readLock().lock();
    try {
        extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(), null, ci.getMimeType(), extracted);
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with HtmlExtractor", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // Replace blank nodes with IRIs so the extracted triples are addressable.
    ClerezzaRDFUtils.urifyBlankNodes(extracted);
    if (singleRootRdf) {
        // Make the model single rooted by attaching orphan subtrees to the
        // ContentItem URI via an nie:contains edge.
        ClerezzaRDFUtils.makeConnected(extracted, ci.getUri(), new IRI(NIE_NS + "contains"));
    }
    // Add the extracted triples to the ContentItem's metadata under the write lock.
    ci.getLock().writeLock().lock();
    try {
        LOG.info("Model: {}", extracted);
        ci.getMetadata().addAll(extracted);
        extracted = null;
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Usage of org.apache.clerezza.commons.rdf.Graph in the Apache Stanbol project:
the main method of the HtmlExtractor class.
/**
 * Command-line entry point: extracts an RDF model from each HTML file given
 * as an argument and prints a marker line per file.
 *
 * @param args
 *            paths of HTML files to process
 * @throws Exception
 *             if reading or extraction fails
 */
public static void main(String[] args) throws Exception {
    int argv = 0;
    HtmlExtractor inst = new HtmlExtractor();
    for (int i = argv; i < args.length; ++i) {
        File file = new File(args[i]);
        Charset charset = Charset.forName("UTF-8");
        String mimeType = "text/html";
        IRI uri = new IRI(file.toURI().toString());
        Graph container = new SimpleGraph();
        // try-with-resources closes the stream on every path; the previous
        // code leaked the FileInputStream.
        try (InputStream input = new FileInputStream(file)) {
            inst.extract(uri.getUnicodeString(), input, charset, mimeType, container);
        }
        System.out.println("Model for " + args[i]);
        //TODO
        // container.writeTo(System.out);
        System.out.println();
    }
}
Aggregations