use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class FstLinkingEngine method writeEnhancements.
/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem.
 * @param ci the ContentItem to write the enhancements to
 * @param text the plain text content the tags refer to
 * @param tags the tags (occurrences with their entity suggestions) to write
 * @param language the language of the text, or <code>null</code> if unknown
 * @param writeRankings if entity rankings should be written for the suggestions
 */
private void writeEnhancements(ContentItem ci, String text, Collection<Tag> tags, String language, boolean writeRankings) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    for (Tag tag : tags) {
        Collection<IRI> textAnnotations = new ArrayList<IRI>(tags.size());
        //first create the TextAnnotations for the Occurrences
        Literal startLiteral = literalFactory.createTypedLiteral(tag.getStart());
        Literal endLiteral = literalFactory.createTypedLiteral(tag.getEnd());
        //search for an existing text annotation with the same offsets
        Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
        IRI textAnnotation = null;
        while (it.hasNext()) {
            Triple t = it.next();
            if (metadata.filter(t.getSubject(), ENHANCER_END, endLiteral).hasNext()
                    && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                textAnnotation = (IRI) t.getSubject();
                break;
            }
        }
        if (textAnnotation == null) {
            //not found ... create a new one
            textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT,
                new PlainLiteralImpl(getSelectionContext(text, tag.getAnchor(), tag.getStart()), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT,
                new PlainLiteralImpl(tag.getAnchor(), languageObject)));
            metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE,
                literalFactory.createTypedLiteral(tag.getScore())));
        } else {
            //if existing, add this engine as contributor
            metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR,
                new PlainLiteralImpl(this.getClass().getName())));
        }
        //add dc:types (even to existing)
        for (IRI dcType : getDcTypes(tag.getSuggestions())) {
            metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
        }
        textAnnotations.add(textAnnotation);
        //now the EntityAnnotations for the Suggestions
        for (Match match : tag.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            //should we use the label used for the match, or search the
            //representation for the best label ... currently it is the matched one
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, match.getMatchLabel()));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, new IRI(match.getUri())));
            for (IRI type : match.getTypes()) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, type));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE,
                literalFactory.createTypedLiteral(match.getScore())));
            //add the relation to the fise:TextAnnotation (the tag)
            metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, textAnnotation));
            //write origin information
            if (indexConfig.getOrigin() != null) {
                metadata.add(new TripleImpl(entityAnnotation, FISE_ORIGIN, indexConfig.getOrigin()));
            }
            if (writeRankings) {
                Double ranking = match.getRanking();
                if (ranking != null) {
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING,
                        literalFactory.createTypedLiteral(ranking)));
                }
            }
            //TODO: dereferencing
            // if(linkerConfig.isDereferenceEntitiesEnabled() &&
            //         dereferencedEntitis.add(entity.getUri())){ //not yet dereferenced
            //     //add all outgoing triples for this entity
            //     //NOTE: do not add all triples as there might be other data in the graph
            //     for(Iterator<Triple> triples = entity.getData().filter(entity.getUri(), null, null);
            //             triples.hasNext(); metadata.add(triples.next()));
            // }
        }
    }
}
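The loop above is a lookup-or-create pattern against the metadata Graph: filter(..) by fise:start, then confirm fise:end and rdf:type on each candidate subject. Factored into a helper, the lookup part reads as follows (a sketch; the name findTextAnnotation is hypothetical, the constants are the Stanbol servicesapi ones used above):

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.Triple;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;

/**
 * Hypothetical helper mirroring the loop above: returns the first
 * fise:TextAnnotation in the graph with the given start/end offsets,
 * or null if none exists yet.
 */
static IRI findTextAnnotation(Graph metadata, Literal start, Literal end) {
    // filter(..) returns all triples with the given predicate/object;
    // each matching subject is a candidate TextAnnotation
    Iterator<Triple> it = metadata.filter(null, ENHANCER_START, start);
    while (it.hasNext()) {
        Triple t = it.next();
        // confirm the candidate also has the matching end offset and rdf:type
        if (metadata.filter(t.getSubject(), ENHANCER_END, end).hasNext()
                && metadata.filter(t.getSubject(), RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
            return (IRI) t.getSubject();
        }
    }
    return null;
}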
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class TestHtmlExtractor method testMicrodataExtraction.
/** This tests the extraction of microdata from an HTML-5 document.
 *
 * @throws Exception if the extraction fails
 */
@Test
public void testMicrodataExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-microdata.html";
    // extract the microdata from the annotated html
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    extractor.extract("file://" + testFile, in, null, "text/html", model);
    // log and show the extracted triples
    int tripleCounter = model.size();
    LOG.debug("Microdata triples: {}", tripleCounter);
    printTriples(model);
    assertEquals(91, tripleCounter);
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
}
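printTriples(..) is defined elsewhere in the test class; assuming it simply dumps the model for debugging, a minimal version could look like this (a sketch, not the actual helper):

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.Triple;

// hypothetical stand-in for the printTriples(..) helper used above;
// Graph extends Collection<Triple>, so it can be iterated directly
// (which is also why model.size() works as a triple count)
private static void printTriples(Graph model) {
    for (Triple t : model) {
        System.out.println(t.getSubject() + " " + t.getPredicate() + " " + t.getObject());
    }
}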
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class TestHtmlExtractor method testMFExtraction.
/** This tests some Microformat extraction.
 *
 * @throws ExtractorException if there is an error during extraction
 * @throws IOException if there is an error when reading the document
 */
@Test
public void testMFExtraction() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String testFile = "test-MF.html";
    // extract the microformats from the annotated html
    InputStream in = getResourceAsStream(testFile);
    assertNotNull("failed to load resource " + testFile, in);
    extractor.extract("file://" + testFile, in, null, "text/html", model);
    // log and show the extracted triples
    int tripleCounter = model.size();
    LOG.debug("Microformat triples: {}", tripleCounter);
    printTriples(model);
    assertEquals(127, tripleCounter);
    ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
}
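Both tests follow the same pattern, so the same extraction can be used outside of JUnit. The sketch below assumes the test fixture's registry and parser fields and its NIE_NS constant; judging from its use in both tests, makeConnected(..) appears to link otherwise disconnected root nodes to the document resource so the model forms one connected graph:

import java.io.InputStream;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;

// minimal usage sketch; registry, parser and NIE_NS are assumed to be
// configured/defined as in the test class above
void extractMicroformats() throws Exception {
    HtmlExtractor extractor = new HtmlExtractor(registry, parser);
    Graph model = new SimpleGraph();
    String source = "file://test-MF.html";
    try (InputStream in = getClass().getClassLoader().getResourceAsStream("test-MF.html")) {
        extractor.extract(source, in, null, "text/html", model);
    }
    // connect the extracted model under the document resource
    ClerezzaRDFUtils.makeConnected(model, new IRI(source), new IRI(NIE_NS + "contains"));
}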
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class LangIdEnhancementEngine method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
            + "' found for ContentItem " + ci.getUri()
            + ": This is also checked in the canEnhance method! -> This "
            + "indicates a Bug in the implementation of the EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate the text to a window from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    LanguageIdentifier languageIdentifier = new LanguageIdentifier(text);
    String language = languageIdentifier.getLanguage();
    log.info("language identified as {}", language);
    // add the language to the metadata
    Graph g = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
        g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(language)));
        g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
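The middle-window truncation appears verbatim in this engine and in the LanguageDetectionEnhancementEngine below. Factored out, the logic reads as follows (a sketch; the name probeWindow is hypothetical):

// sketch: keep a probeLength-sized window centered on the middle of the
// text; mirrors the truncation in computeEnhancements above
static String probeWindow(String text, int probeLength) {
    if (probeLength <= 0 || text.length() <= probeLength) {
        return text; // probing disabled, or the text is already short enough
    }
    int mid = text.length() / 2;
    return text.substring(mid - probeLength / 2, mid + probeLength / 2);
}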
use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.
the class LanguageDetectionEnhancementEngine method computeEnhancements.
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
            + "' found for ContentItem " + ci.getUri()
            + ": This is also checked in the canEnhance method! -> This "
            + "indicates a Bug in the implementation of the EnhancementJobManager!");
    }
    String text = "";
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    //do not call trim() on long texts only to check whether the text is empty
    if (text.length() < 50 && text.trim().length() == 0) {
        log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
        return;
    }
    // truncate the text to a window from the middle if probeLength > 0
    int checkLength = probeLength;
    if (checkLength > 0 && text.length() > checkLength) {
        text = text.substring(text.length() / 2 - checkLength / 2, text.length() / 2 + checkLength / 2);
    }
    List<Language> languages = null;
    try {
        languages = languageIdentifier.getLanguages(text);
        log.debug("language identified: {}", languages);
    } catch (LangDetectException e) {
        Enum<?> errorCode = e.getCode();
        //ignore "0 - NoTextError" and "5 - CantDetectError"
        if (errorCode.ordinal() != 0 && errorCode.ordinal() != 5) {
            StringBuilder msg = new StringBuilder("Could not identify language of text: ");
            if (text.length() < 200) {
                msg.append(text);
            } else {
                msg.append(text.subSequence(0, 199)).append("...");
            }
            msg.append(" (Error Code: ").append(errorCode.ordinal()).append(" - ").append(errorCode.name()).append(")");
            throw new EngineException(this, ci, msg.toString(), e);
        } else {
            log.debug("No text to detect the language from present in ContentItem {}", ci);
        }
    }
    // add the detected languages to the metadata
    if (languages != null) {
        Graph g = ci.getMetadata();
        ci.getLock().writeLock().lock();
        try {
            for (int i = 0; i < maxSuggestedLanguages && i < languages.size(); i++) {
                // add one fise:TextAnnotation per language hypothesis
                Language hypothesis = languages.get(i);
                IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
                g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(hypothesis.lang)));
                g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(hypothesis.prob)));
                g.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
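Both language engines guard Graph writes with the ContentItem's write lock; a consumer can read the results back with the same filter(..) API under the read lock. A minimal sketch (the helper name is hypothetical; DC_LANGUAGE is the Stanbol servicesapi constant used above):

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.Literal;
import org.apache.clerezza.commons.rdf.RDFTerm;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;

// sketch: read the dc:language values back from the metadata Graph,
// taking the read lock as the counterpart to the engines' write lock
static void logDetectedLanguages(ContentItem ci) {
    Graph g = ci.getMetadata();
    ci.getLock().readLock().lock();
    try {
        Iterator<Triple> it = g.filter(null, DC_LANGUAGE, null);
        while (it.hasNext()) {
            RDFTerm value = it.next().getObject();
            if (value instanceof Literal) {
                System.out.println("detected language: " + ((Literal) value).getLexicalForm());
            }
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
}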