Example use of org.apache.clerezza.commons.rdf.Literal in the Apache Stanbol project:
class MultiThreadedTestBase, method createRdfDataIterator.
/**
 * Iterator implementation that parses an RDF graph from the parsed
 * {@link InputStream}. The RDF data are loaded in-memory. Because of this
 * only test data that fit in-memory can be used. <p>
 * Literal values (objects) of the {@link #PROPERTY_TEST_DATA_PROPERTY} are
 * used as data. If this property is not present {@link #DEFAULT_TEST_DATA_PROPERTY}
 * is used. If {@link #PROPERTY_TEST_DATA_PROPERTY} is set to '*' than all
 * Triples with Literal values are used.<p>
 * This supports all RDF-formats supported by the {@link JenaParserProvider} and
 * {@link RdfJsonParsingProvider}. The charset is expected to be UTF-8.
 * @param is the input stream providing the RDF test data. It is closed by
 * this method after parsing.
 * @param mediaType the Media-Type of the stream. MUST BE supported by
 * the Apache Clerezza RDF parsers.
 * @param propertyString the property whose literal values are used as test
 * data; '*' selects the literal values of all triples.
 * @return a lazy {@link Iterator} over the lexical forms of the selected
 * literals. {@link Iterator#remove()} is not supported.
 */
private Iterator<String> createRdfDataIterator(InputStream is, String mediaType, final String propertyString) {
    // parse the complete RDF data in-memory; parsing failures fail the test
    final SimpleGraph graph = new SimpleGraph();
    try {
        rdfParser.parse(graph, is, mediaType);
    } catch (UnsupportedFormatException e) {
        Assert.fail("The MimeType '" + mediaType + "' of the parsed testData " + "is not supported. This utility supports plain text files as " + "as well as the RDF formats " + rdfParser.getSupportedFormats() + "If your test data uses one of those formats but it was not " + "correctly detected you can use the System property '" + PROPERTY_TEST_DATA_TYPE + "' to manually parse the Media-Type!");
    }
    // the stream is fully consumed at this point
    IOUtils.closeQuietly(is);
    return new Iterator<String>() {

        // underlying triple iterator; lazily initialized on the first call to
        // getNext() so the property lookup only happens when data is requested
        Iterator<Triple> it = null;

        // look-ahead element: filled by hasNext()/next(), consumed by next()
        String next = null;

        /**
         * Returns the lexical form of the next literal value, or null when
         * no further matching triple exists. Initializes the triple iterator
         * on the first call.
         */
        private String getNext() {
            if (it == null) {
                IRI property;
                if ("*".equals(propertyString)) {
                    //wildcard: iterate over the values of all triples
                    property = null;
                    log.info("Iterate over values of all Triples");
                } else {
                    // resolve a possible "prefix:localName" value to a full URI
                    property = new IRI(NamespaceMappingUtils.getConfiguredUri(nsPrefixService, propertyString));
                    log.info("Iterate over values of property {}", property);
                }
                it = graph.filter(null, property, null);
            }
            // skip triples whose object is not a Literal (e.g. IRIs, bNodes)
            while (it.hasNext()) {
                RDFTerm value = it.next().getObject();
                if (value instanceof Literal) {
                    return ((Literal) value).getLexicalForm();
                }
            }
            //no more data
            return null;
        }

        @Override
        public boolean hasNext() {
            if (next == null) {
                next = getNext();
            }
            return next != null;
        }

        @Override
        public String next() {
            if (next == null) {
                next = getNext();
            }
            if (next == null) {
                throw new NoSuchElementException("No further testData available");
            } else {
                // consume the look-ahead element
                String elem = next;
                next = null;
                return elem;
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    };
}
Example use of org.apache.clerezza.commons.rdf.Literal in the Apache Stanbol project:
class IndexedGraphTest, method createGraph.
/**
 * Fills the parsed triple collection with randomly generated test triples
 * until it contains at least {@code triples} elements.
 * <p>
 * A single random value in [0,3) per iteration decides whether the subject
 * and/or predicate are re-used and which kind of object is created:
 * [0,l] literal (split into int / double / text), (l,b] bNode, (b,3) IRI.
 * @param tc the collection the generated triples are added to
 * @param triples the minimum number of triples the collection should contain
 * @param seed optional seed for the random number generator (allows
 * reproducible graphs); may be {@code null}
 */
private static void createGraph(Collection<Triple> tc, int triples, Long seed) {
    Random rnd = new Random();
    if (seed != null) {
        rnd.setSeed(seed);
    }
    LiteralFactory lf = LiteralFactory.getInstance();
    //randoms are in the range [0..3]
    //literal
    double l = 1.0;
    //int
    double i = l / 3;
    //double
    double d = l * 2 / 3;
    //bNode
    double b = 2.0;
    //create new bNode
    double nb = b - (l * 2 / 3);
    double random;
    BlankNodeOrIRI subject = null;
    IRI predicate = null;
    List<IRI> predicateList = new ArrayList<IRI>();
    predicateList.add(RDF.first);
    predicateList.add(RDF.rest);
    predicateList.add(RDF.type);
    predicateList.add(RDFS.label);
    predicateList.add(RDFS.comment);
    predicateList.add(RDFS.range);
    predicateList.add(RDFS.domain);
    predicateList.add(FOAF.name);
    predicateList.add(FOAF.nick);
    predicateList.add(FOAF.homepage);
    predicateList.add(FOAF.age);
    predicateList.add(FOAF.depiction);
    String URI_PREFIX = "http://www.test.org/bigGraph/ref";
    Language DE = new Language("de");
    Language EN = new Language("en");
    Iterator<IRI> predicates = predicateList.iterator();
    List<BlankNode> bNodes = new ArrayList<BlankNode>();
    bNodes.add(new BlankNode());
    for (int count = 0; tc.size() < triples; count++) {
        random = rnd.nextDouble() * 3;
        //choose a new subject with ~1/6 probability (always on the first run)
        if (random >= 2.5 || count == 0) {
            if (random <= 2.75) {
                subject = new IRI(URI_PREFIX + count);
            } else {
                //re-use a previously created bNode as subject
                int rndIndex = (int) ((random - 2.75) * bNodes.size() / (3.0 - 2.75));
                subject = bNodes.get(rndIndex);
            }
        }
        //choose a new predicate with ~1/3 probability; the predicate list is
        //re-shuffled and restarted whenever it is exhausted
        if (random > 2.0 || count == 0) {
            if (!predicates.hasNext()) {
                Collections.shuffle(predicateList, rnd);
                predicates = predicateList.iterator();
            }
            predicate = predicates.next();
        }
        if (random <= l) {
            //literal object
            if (random <= i) {
                tc.add(new TripleImpl(subject, predicate, lf.createTypedLiteral(count)));
            } else if (random <= d) {
                tc.add(new TripleImpl(subject, predicate, lf.createTypedLiteral(random)));
            } else {
                //text literal: random is in (d, l]; split that remainder into
                //three equal parts for plain / English / German literals.
                //BUGFIX: the original code re-tested 'random <= i' and
                //'random <= d' here, which is always false inside this branch,
                //so only German literals were ever created.
                Literal text;
                double txt = (random - d) / (l - d);
                if (txt <= 1.0 / 3.0) {
                    text = new PlainLiteralImpl("Literal for " + count);
                } else if (txt <= 2.0 / 3.0) {
                    text = new PlainLiteralImpl("An English literal for " + count, EN);
                } else {
                    text = new PlainLiteralImpl("Ein Deutsches Literal für " + count, DE);
                }
                tc.add(new TripleImpl(subject, predicate, text));
            }
        } else if (random <= b) {
            //bNode object
            BlankNode bnode;
            if (random <= nb) {
                //create a new bNode
                bnode = new BlankNode();
                bNodes.add(bnode);
            } else {
                //re-use an existing bNode (random in (nb, b])
                int rndIndex = (int) ((random - nb) * bNodes.size() / (b - nb));
                bnode = bNodes.get(rndIndex);
            }
            tc.add(new TripleImpl(subject, predicate, bnode));
        } else {
            //IRI object
            tc.add(new TripleImpl(subject, predicate, new IRI(URI_PREFIX + count * random)));
        }
    }
}
Example use of org.apache.clerezza.commons.rdf.Literal in the Apache Stanbol project:
class EntityCoMentionEngine, method writeComentions.
/**
 * Writes the fise:TextAnnotations for the parsed co-mentions and links them
 * (via dc:relation) with the fise:TextAnnotation(s) of their initial
 * mention(s). Suggestions of the initial mention are copied to the
 * co-mention. Confidence values of pre-existing suggestions that are not
 * confirmed by the initial mention are reduced by the
 * {@code confidenceAdjustmentFactor} - but only once per suggestion.
 * @param ci the content item the enhancements are written to
 * @param comentions the detected co-mentions
 * @param language the language of the analyzed text (may be null or empty)
 * @param textAnnotations the URIs of all fise:TextAnnotations of the content
 * item. TextAnnotations created by this method are added to this set.
 */
private void writeComentions(ContentItem ci, Collection<LinkedEntity> comentions, String language, Set<IRI> textAnnotations) {
    Language languageObject = null;
    if (language != null && !language.isEmpty()) {
        languageObject = new Language(language);
    }
    Graph metadata = ci.getMetadata();
    //we MUST adjust the confidence level of existing annotations only once
    //so we need to keep track of those
    Set<BlankNodeOrIRI> adjustedSuggestions = new HashSet<BlankNodeOrIRI>();
    log.debug("Write Co-Mentions:");
    for (LinkedEntity comention : comentions) {
        log.debug(" > {}", comention);
        //URIs of TextAnnotations for the initial mention of this co-mention
        Collection<IRI> initialMentions = new ArrayList<IRI>(comention.getSuggestions().size());
        for (Suggestion suggestion : comention.getSuggestions()) {
            Entity entity = suggestion.getEntity();
            if (textAnnotations.contains(entity.getUri())) {
                //this is a textAnnotation
                initialMentions.add(entity.getUri());
            }
            //else TODO support also Entities!!
        }
        //create the TextAnnotations for the co-mention
        for (Occurrence occurrence : comention.getOccurrences()) {
            Literal startLiteral = literalFactory.createTypedLiteral(occurrence.getStart());
            Literal endLiteral = literalFactory.createTypedLiteral(occurrence.getEnd());
            //search for an existing TextAnnotation selecting the same span;
            //ignore this occurrence if an annotation selects a bigger span
            boolean ignore = false;
            IRI textAnnotation = null;
            //(1) TextAnnotations with the same start position
            Iterator<Triple> it = metadata.filter(null, ENHANCER_START, startLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer end = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_END, Integer.class, literalFactory);
                if (end != null && textAnnotations.contains(t.getSubject())) {
                    textAnnotation = (IRI) t.getSubject();
                    if (end > occurrence.getEnd()) {
                        //there is an other TextAnnotation selecting a bigger
                        //span, so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            //(2) TextAnnotations with the same end position
            it = metadata.filter(null, ENHANCER_END, endLiteral);
            while (it.hasNext()) {
                Triple t = it.next();
                Integer start = EnhancementEngineHelper.get(metadata, t.getSubject(), ENHANCER_START, Integer.class, literalFactory);
                if (start != null && textAnnotations.contains(t.getSubject())) {
                    textAnnotation = (IRI) t.getSubject();
                    if (start < occurrence.getStart()) {
                        //there is an other TextAnnotation selecting a bigger
                        //span, so we should ignore this Occurrence
                        ignore = true;
                    }
                }
            }
            if (!ignore) {
                //maximum confidence of suggestions of the initial mention
                Double maxConfidence = null;
                //maximum confidence of existing suggestions
                Double maxExistingConfidence = null;
                if (textAnnotation == null) {
                    //not found ... create a new TextAnnotation for the co-mention
                    textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                    //add it to the set of TextAnnotations
                    textAnnotations.add(textAnnotation);
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, startLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, endLiteral));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.getContext(), languageObject)));
                    metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occurrence.getSelectedText(), languageObject)));
                } else {
                    //if existing add this engine as contributor
                    metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
                }
                //now process initial mention(s) for the co-mention
                Set<IRI> dcTypes = new HashSet<IRI>();
                for (IRI initialMention : initialMentions) {
                    //collect the dc:type(s) of the initial mentions
                    Iterator<IRI> dcTypesIt = getReferences(metadata, initialMention, DC_TYPE);
                    while (dcTypesIt.hasNext()) {
                        dcTypes.add(dcTypesIt.next());
                    }
                    //check confidence of the initial mention (fise:TextAnnotation)
                    Double mentionConfidence = EnhancementEngineHelper.get(metadata, initialMention, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                    if (mentionConfidence != null && (maxConfidence == null || maxConfidence.compareTo(mentionConfidence) <= 0)) {
                        maxConfidence = mentionConfidence;
                    }
                    //compare the suggestions of the initial mention(s) with the
                    //existing ones: suggestion -> confidence and
                    //suggested entity -> suggestion mappings
                    Map<RDFTerm, Double> initialSuggestions = new HashMap<RDFTerm, Double>();
                    Map<RDFTerm, RDFTerm> initialSuggestedEntities = new HashMap<RDFTerm, RDFTerm>();
                    for (Iterator<Triple> suggestions = metadata.filter(null, DC_RELATION, initialMention); suggestions.hasNext(); ) {
                        //BUGFIX: the iterator MUST be advanced unconditionally
                        //and the contains() check MUST test the triple subject.
                        //The original code tested textAnnotations.contains(
                        //suggestions) - the Iterator object itself - which is
                        //always false; had it ever been true the loop would
                        //not have advanced (infinite loop).
                        BlankNodeOrIRI suggestion = suggestions.next().getSubject();
                        if (!textAnnotations.contains(suggestion)) {
                            RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, suggestion, ENHANCER_ENTITY_REFERENCE);
                            if (suggestedEntity != null) {
                                //it has a suggestion
                                Double confidence = EnhancementEngineHelper.get(metadata, suggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                //BUGFIX: compare against the suggestion's own
                                //confidence (the original code compared the
                                //misspelled 'confidnece' of the initial mention)
                                if (maxConfidence == null) {
                                    maxConfidence = confidence;
                                } else if (confidence != null && maxConfidence.compareTo(confidence) <= 0) {
                                    maxConfidence = confidence;
                                }
                                //else nothing to do
                                initialSuggestions.put(suggestion, confidence);
                                initialSuggestedEntities.put(suggestedEntity, suggestion);
                            }
                            //no suggestion (dc:relation to some other resource)
                        }
                        //else ignore dc:relation from other fise:TextAnnotations
                    }
                    //now we collect existing Suggestions for this TextAnnotation
                    //where we need to adjust the confidence
                    Map<BlankNodeOrIRI, Double> existingSuggestions = new HashMap<BlankNodeOrIRI, Double>();
                    if (maxConfidence != null && confidenceAdjustmentFactor < 1) {
                        //suggestions are defined by incoming dc:relation
                        for (Iterator<Triple> esIt = metadata.filter(null, DC_RELATION, textAnnotation); esIt.hasNext(); ) {
                            BlankNodeOrIRI existingSuggestion = esIt.next().getSubject();
                            //but not all of them are suggestions
                            if (!textAnnotations.contains(existingSuggestion)) {
                                //ignore fise:TextAnnotations
                                Double existingConfidence = EnhancementEngineHelper.get(metadata, existingSuggestion, ENHANCER_CONFIDENCE, Double.class, literalFactory);
                                //ignore suggestions also suggested for the initial mention
                                if (!initialSuggestions.containsKey(existingSuggestion)) {
                                    RDFTerm suggestedEntity = EnhancementEngineHelper.getReference(metadata, existingSuggestion, ENHANCER_ENTITY_REFERENCE);
                                    if (!initialSuggestedEntities.containsKey(suggestedEntity)) {
                                        //finally make sure that we adjust confidences only once
                                        if (!adjustedSuggestions.contains(existingSuggestion)) {
                                            existingSuggestions.put(existingSuggestion, existingConfidence);
                                        }
                                        //else confidence already adjusted
                                    } else {
                                        //different fise:EntityAnnotation, but same
                                        //referenced Entity - compare confidences
                                        //to decide what to do
                                        RDFTerm initialSuggestion = initialSuggestedEntities.get(suggestedEntity);
                                        Double initialConfidence = initialSuggestions.get(initialSuggestion);
                                        if (initialConfidence == null || (existingConfidence != null && existingConfidence.compareTo(initialConfidence) >= 0)) {
                                            //existing confidence >= initial .. keep existing
                                            initialSuggestions.remove(initialSuggestion);
                                            if (maxExistingConfidence == null) {
                                                maxExistingConfidence = existingConfidence;
                                            } else if (maxExistingConfidence.compareTo(existingConfidence) <= 0) {
                                                maxExistingConfidence = existingConfidence;
                                            }
                                        } else {
                                            //adjust this one (if not yet adjusted)
                                            if (!adjustedSuggestions.contains(existingSuggestion)) {
                                                existingSuggestions.put(existingSuggestion, existingConfidence);
                                            }
                                        }
                                    }
                                } else {
                                    //an initial mention already present - no need
                                    //to process the initial mention again
                                    initialSuggestions.remove(existingSuggestion);
                                    if (maxExistingConfidence == null) {
                                        maxExistingConfidence = existingConfidence;
                                    } else if (existingConfidence != null && maxExistingConfidence.compareTo(existingConfidence) <= 0) {
                                        maxExistingConfidence = existingConfidence;
                                    }
                                    //else maxExistingConfidence == null (undefined)
                                }
                            }
                            //else ignore dc:relations to other fise:TextAnnotations
                        }
                        //apply the confidence adjustment
                        for (Entry<BlankNodeOrIRI, Double> entry : existingSuggestions.entrySet()) {
                            if (entry.getValue() != null) {
                                double adjustedConfidence = entry.getValue() * confidenceAdjustmentFactor;
                                if (maxExistingConfidence == null || adjustedConfidence > maxExistingConfidence) {
                                    maxExistingConfidence = adjustedConfidence;
                                }
                                EnhancementEngineHelper.set(metadata, entry.getKey(), ENHANCER_CONFIDENCE, adjustedConfidence, literalFactory);
                                //mark as adjusted
                                adjustedSuggestions.add(entry.getKey());
                            }
                        }
                    }
                    //add the suggestions of the initial mention to this one
                    for (RDFTerm suggestion : initialSuggestions.keySet()) {
                        metadata.add(new TripleImpl((BlankNodeOrIRI) suggestion, DC_RELATION, textAnnotation));
                    }
                    //finally link the co-mention with the initial one
                    metadata.add(new TripleImpl(textAnnotation, DC_RELATION, initialMention));
                }
                //Adapt the dc:type values of the fise:TextAnnotation
                //- if Suggestions added by this engine do have the max confidence
                //  use the dc:type values of the initial mention
                //- if the original suggestions do have a higher confidence keep
                //  the existing
                //- in case both do have the same confidence we add all dc:types
                boolean removeExistingDcTypes = maxConfidence != null && (maxExistingConfidence == null || maxConfidence.compareTo(maxExistingConfidence) >= 0);
                //NOTE(review): '>= 1' means strictly greater; per the comment
                //above equal confidences should probably also add the co-mention
                //dc:types ('>= 0') - verify intended behavior before changing
                boolean addCoMentionDcTypes = maxExistingConfidence == null || (maxConfidence != null && maxConfidence.compareTo(maxExistingConfidence) >= 1);
                Iterator<IRI> existingDcTypesIt = getReferences(metadata, textAnnotation, DC_TYPE);
                while (existingDcTypesIt.hasNext()) {
                    if ((!dcTypes.remove(existingDcTypesIt.next()) || !addCoMentionDcTypes) && removeExistingDcTypes) {
                        //remove the dcType
                        existingDcTypesIt.remove();
                    }
                }
                if (addCoMentionDcTypes) {
                    for (IRI dcType : dcTypes) {
                        //add missing
                        metadata.add(new TripleImpl(textAnnotation, DC_TYPE, dcType));
                    }
                }
                //TODO: support also Entities
                if (maxConfidence != null) {
                    //set the confidence value (if known)
                    EnhancementEngineHelper.set(metadata, textAnnotation, ENHANCER_CONFIDENCE, maxConfidence, literalFactory);
                }
            }
            //else ignore this occurrence
        }
    }
}
Example use of org.apache.clerezza.commons.rdf.Literal in the Apache Stanbol project:
class EntityLinkingEngine, method writeEnhancements.
/**
 * Writes the Enhancements for the {@link LinkedEntity LinkedEntities}
 * extracted from the parsed ContentItem.
 * <p>
 * For every occurrence a fise:TextAnnotation is created (or an existing one
 * with the same selected span is re-used) and for every suggestion a
 * fise:EntityAnnotation linked to those TextAnnotations is written.
 * @param ci the content item the enhancements are written to
 * @param linkedEntities the linked entities to write
 * @param language the language of the analyzed text (may be null or empty)
 * @param writeRankings if entity rankings should be written as
 * fise:entity-ranking (xsd:double) values
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language, boolean writeRankings) {
    final Language contentLang = (language == null || language.isEmpty()) ? null : new Language(language);
    //tracks entities whose RDF data were already added to the metadata
    final Set<IRI> alreadyDereferenced = new HashSet<IRI>();
    final Graph metadata = ci.getMetadata();
    for (LinkedEntity linkedEntity : linkedEntities) {
        Collection<IRI> occurrenceAnnotations = new ArrayList<IRI>(linkedEntity.getOccurrences().size());
        //(1) create (or re-use) a fise:TextAnnotation per occurrence
        for (Occurrence occurrence : linkedEntity.getOccurrences()) {
            Literal start = literalFactory.createTypedLiteral(occurrence.getStart());
            Literal end = literalFactory.createTypedLiteral(occurrence.getEnd());
            //look for an existing fise:TextAnnotation selecting the same span
            IRI textAnnotation = null;
            Iterator<Triple> candidates = metadata.filter(null, ENHANCER_START, start);
            while (candidates.hasNext() && textAnnotation == null) {
                BlankNodeOrIRI candidate = candidates.next().getSubject();
                if (metadata.filter(candidate, ENHANCER_END, end).hasNext() && metadata.filter(candidate, RDF_TYPE, ENHANCER_TEXTANNOTATION).hasNext()) {
                    textAnnotation = (IRI) candidate;
                }
            }
            if (textAnnotation == null) {
                //none found ... create a new one
                textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_START, start));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_END, end));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occurrence.getContext(), contentLang)));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occurrence.getSelectedText(), contentLang)));
                metadata.add(new TripleImpl(textAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(linkedEntity.getScore())));
            } else {
                //existing annotation ... just record this engine as contributor
                metadata.add(new TripleImpl(textAnnotation, DC_CONTRIBUTOR, new PlainLiteralImpl(this.getClass().getName())));
            }
            //add dc:types (even to existing annotations)
            for (IRI dcType : linkedEntity.getTypes()) {
                metadata.add(new TripleImpl(textAnnotation, Properties.DC_TYPE, dcType));
            }
            occurrenceAnnotations.add(textAnnotation);
        }
        //(2) create a fise:EntityAnnotation per suggestion
        for (Suggestion suggestion : linkedEntity.getSuggestions()) {
            IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            //currently the label matched during linking is used (an alternative
            //would be to search the representation for the "best" label)
            Literal label = suggestion.getBestLabel(linkerConfig.getNameField(), language);
            Entity entity = suggestion.getEntity();
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_LABEL, label));
            metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entity.getUri()));
            for (Iterator<IRI> types = entity.getReferences(linkerConfig.getTypeField()); types.hasNext(); ) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_ENTITY_TYPE, types.next()));
            }
            metadata.add(new TripleImpl(entityAnnotation, Properties.ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
            //link the EntityAnnotation with all TextAnnotations of this entity
            for (IRI occurrenceAnnotation : occurrenceAnnotations) {
                metadata.add(new TripleImpl(entityAnnotation, Properties.DC_RELATION, occurrenceAnnotation));
            }
            //add origin information of the EntitySearcher
            for (Entry<IRI, Collection<RDFTerm>> origin : entitySearcher.getOriginInformation().entrySet()) {
                for (RDFTerm value : origin.getValue()) {
                    metadata.add(new TripleImpl(entityAnnotation, origin.getKey(), value));
                }
            }
            if (writeRankings) {
                Float ranking = suggestion.getEntity().getEntityRanking();
                if (ranking != null) {
                    //the float ranking is written as an xsd:double literal
                    metadata.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_RANKING, new TypedLiteralImpl(ranking.toString(), XSD_DOUBLE)));
                }
            }
            //add the RDF data for the entity (only once per entity)
            if (linkerConfig.isDereferenceEntitiesEnabled() && alreadyDereferenced.add(entity.getUri())) {
                //NOTE: only triples with the entity as subject are copied as
                //there might be other data in the graph
                Iterator<Triple> entityData = entity.getData().filter(entity.getUri(), null, null);
                while (entityData.hasNext()) {
                    metadata.add(entityData.next());
                }
            }
        }
    }
}
Example use of org.apache.clerezza.commons.rdf.Literal in the Apache Stanbol project:
class InMemoryEntityIndex, method addEntity.
/**
 * Registers the parsed Entity with this in-memory index. The entity is
 * stored under its URI and additionally indexed by the lower-cased tokens
 * of all its labels ({@code nameField} values) whose language is contained
 * in the configured index languages. Labels in other languages are ignored.
 * @param entity the entity to add to the index
 */
public void addEntity(Entity entity) {
    if (log.isDebugEnabled()) {
        log.debug(" > register {}", entity);
    }
    entities.put(entity.getUri(), entity);
    for (Iterator<Literal> labels = entity.getText(nameField); labels.hasNext(); ) {
        Literal label = labels.next();
        String lang = label.getLanguage() != null ? label.getLanguage().toString() : null;
        if (!indexLanguages.contains(lang)) {
            //labels in non-indexed languages are ignored
            continue;
        }
        for (String token : tokenizer.tokenize(label.getLexicalForm(), null)) {
            String key = token.toLowerCase(Locale.ROOT);
            Collection<Entity> indexed = index.get(key);
            if (indexed == null) {
                indexed = new ArrayList<Entity>();
                index.put(key, indexed);
            }
            indexed.add(entity);
        }
    }
}
Aggregations