use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class OpenCalaisEngine method createEnhancements.
/**
 * Turns the entity occurrences reported by OpenCalais into enhancement
 * structures and writes them to the metadata of the content item.
 * <p>
 * One TextAnnotation is created per occurrence. In NER-only mode the first
 * TextAnnotation seen for an entity id is remembered; every later
 * TextAnnotation with the same id gets a relation triple pointing from the
 * remembered annotation to it. The creation of Calais specific
 * EntityAnnotations is currently disabled (see the commented-out code at the
 * end of the loop body).
 *
 * @param occs a Collection of entity information
 * @param ci the content item
 */
public void createEnhancements(Collection<CalaisEntityOccurrence> occs, ContentItem ci) {
    LiteralFactory typedLiterals = LiteralFactory.getInstance();
    // language used for plain literals representing parts of the content;
    // stays null if the content item reports no (or an empty) language
    String langString = EnhancementEngineHelper.getLanguage(ci);
    final Language language = (langString == null || langString.isEmpty())
            ? null
            : new Language(langString);
    // TODO create TextEnhancement (form, start, end, type?) and EntityAnnotation (id, name, type)
    HashMap<RDFTerm, IRI> firstAnnotationById = new HashMap<RDFTerm, IRI>();
    for (CalaisEntityOccurrence occ : occs) {
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        Graph metadata = ci.getMetadata();
        metadata.add(new TripleImpl(textAnnotation, DC_TYPE, occ.type));

        // for autotagger use the name instead of the matched term (that might be a pronoun!)
        String selectedText = onlyNERMode ? occ.name : occ.exact;
        metadata.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
                new PlainLiteralImpl(selectedText, language)));

        metadata.add(new TripleImpl(textAnnotation, ENHANCER_START,
                typedLiterals.createTypedLiteral(occ.offset)));
        metadata.add(new TripleImpl(textAnnotation, ENHANCER_END,
                typedLiterals.createTypedLiteral(occ.offset + occ.length)));
        metadata.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
                new PlainLiteralImpl(occ.context, language)));

        // The relevance doubles as confidence. It is skipped when it is not
        // available (may be NULL) or when the relevance feature is not
        // activated (may be -1).
        boolean relevanceUsable = occ.relevance != null
                && Double.valueOf(0).compareTo(occ.relevance) <= 0;
        if (relevanceUsable) {
            metadata.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE,
                    typedLiterals.createTypedLiteral(occ.relevance)));
        }

        // only one annotation is registered per entity id; later occurrences
        // are merely linked to the registered one
        if (firstAnnotationById.containsKey(occ.id)) {
            IRI registered = firstAnnotationById.get(occ.id);
            metadata.add(new TripleImpl(registered, DC_RELATION, textAnnotation));
        } else if (onlyNERMode) {
            // don't create Calais specific entity annotations; let the autotagger do its own
            // but add a pointer to the first text annotation with that name
            firstAnnotationById.put(occ.id, textAnnotation);
        } else {
            // Disabled: creation of Calais specific entity annotations.
            // IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            // entityAnnotationMap.put(occ.id, entityAnnotation);
            // model.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
            // model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, occ.name));
            // model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, occ.type));
            // model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, occ.id));
        }
    }
}
use of org.apache.clerezza.commons.rdf.Language in project stanbol by apache.
the class RdfRepresentationTest method testPlainLiteralToTextConversion.
/*--------------------------------------------------------------------------
* Additional Tests for special Features of the Clerezza based implementation
*
* This mainly covers support for additional types such as PlainLiteral,
* TypedLiteral and IRIs: the conversion to such types as well as the
* getters for such types.
*--------------------------------------------------------------------------
*/
/**
 * {@link PlainLiteral} is the type used for natural language text in the
 * Clerezza RDF API. This test adds plain literals (one without a language
 * and three with the languages "en", "de" and "de-AT") via
 * {@link Representation#add(String, Object)} and checks that they can be
 * read back as {@link Text} instances through the {@link Representation}
 * API (e.g. {@link Representation#get(String, String...)}).
 */
@Test
public void testPlainLiteralToTextConversion() {
    final String field = "urn:test.RdfRepresentation:test.field";
    Literal untagged = new PlainLiteralImpl("A plain literal without Language");
    Literal english = new PlainLiteralImpl("An english literal", new Language("en"));
    Literal german = new PlainLiteralImpl("Ein Deutsches Literal", new Language("de"));
    Literal austrian = new PlainLiteralImpl("Ein Topfen Verband hilft bei Zerrungen", new Language("de-AT"));
    Collection<Literal> allLiterals = Arrays.asList(untagged, english, german, austrian);

    Representation rep = createRepresentation(null);
    rep.add(field, allLiterals);

    // The plain literals must now be exposed as natural language texts
    // by the Representation interface.

    // 1) the value without a language: exactly one hit, language is null
    Iterator<Text> untaggedHits = rep.get(field, (String) null);
    assertTrue(untaggedHits.hasNext());
    Text untaggedText = untaggedHits.next();
    assertEquals(untagged.getLexicalForm(), untaggedText.getText());
    assertNull(untaggedText.getLanguage());
    assertFalse(untaggedHits.hasNext());

    // 2) the value for a specific language: exactly one hit for "en"
    Iterator<Text> englishHits = rep.get(field, "en");
    assertTrue(englishHits.hasNext());
    Text englishText = englishHits.next();
    assertEquals(english.getLexicalForm(), englishText.getText());
    assertEquals(english.getLanguage().toString(), englishText.getLanguage());
    assertFalse(englishHits.hasNext());

    // 3) all natural language values: every returned text must match one
    //    of the expected lexical forms, and none may be missing
    Set<String> expectedForms = new HashSet<String>();
    for (Literal literal : allLiterals) {
        expectedForms.add(literal.getLexicalForm());
    }
    for (Iterator<Text> it = rep.getText(field); it.hasNext();) {
        assertTrue(expectedForms.remove(it.next().getText()));
    }
    assertTrue(expectedForms.isEmpty());
}
Aggregations