use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class KeywordLinkingEngine method writeEnhancements.
/**
 * Writes the enhancements for the parsed {@link LinkedEntity LinkedEntities}
 * to the metadata of the ContentItem.
 * <p>
 * For every occurrence of a linked entity a text enhancement (start, end,
 * selection context, selected text, confidence and dc:type values) is
 * created. For every suggestion of a linked entity an entity enhancement
 * (label, reference, types, confidence, site name) is created and related
 * to all text enhancements of that linked entity.
 *
 * @param ci the content item the enhancements are added to
 * @param linkedEntities the linked entities to write
 * @param language the language used for the plain literals holding parts of
 * the content. If <code>null</code> or empty the literals are created
 * without a language
 */
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity> linkedEntities, String language) {
    final Language contentLanguage = (language == null || language.isEmpty()) ? null : new Language(language);
    final Graph graph = ci.getMetadata();
    for (LinkedEntity entity : linkedEntities) {
        Collection<IRI> occurrenceAnnotations = new ArrayList<IRI>(entity.getOccurrences().size());
        //(1) one text enhancement for each occurrence of the linked entity
        for (Occurrence occ : entity.getOccurrences()) {
            IRI occurrenceAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
            occurrenceAnnotations.add(occurrenceAnnotation);
            graph.add(new TripleImpl(occurrenceAnnotation, Properties.ENHANCER_START,
                literalFactory.createTypedLiteral(occ.getStart())));
            graph.add(new TripleImpl(occurrenceAnnotation, Properties.ENHANCER_END,
                literalFactory.createTypedLiteral(occ.getEnd())));
            graph.add(new TripleImpl(occurrenceAnnotation, Properties.ENHANCER_SELECTION_CONTEXT,
                new PlainLiteralImpl(occ.getContext(), contentLanguage)));
            graph.add(new TripleImpl(occurrenceAnnotation, Properties.ENHANCER_SELECTED_TEXT,
                new PlainLiteralImpl(occ.getSelectedText(), contentLanguage)));
            graph.add(new TripleImpl(occurrenceAnnotation, Properties.ENHANCER_CONFIDENCE,
                literalFactory.createTypedLiteral(entity.getScore())));
            for (IRI type : entity.getTypes()) {
                graph.add(new TripleImpl(occurrenceAnnotation, Properties.DC_TYPE, type));
            }
        }
        //(2) one entity enhancement for each suggestion of the linked entity
        for (Suggestion candidate : entity.getSuggestions()) {
            IRI suggestionAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            //the label that was used for the match is written (no search for
            //a better label within the representation)
            Text matchedLabel = candidate.getBestLabel(linkerConfig.getNameField(), language);
            String labelLanguage = matchedLabel.getLanguage();
            PlainLiteralImpl labelLiteral;
            if (labelLanguage == null) {
                labelLiteral = new PlainLiteralImpl(matchedLabel.getText());
            } else {
                labelLiteral = new PlainLiteralImpl(matchedLabel.getText(), new Language(labelLanguage));
            }
            graph.add(new TripleImpl(suggestionAnnotation, Properties.ENHANCER_ENTITY_LABEL, labelLiteral));
            graph.add(new TripleImpl(suggestionAnnotation, Properties.ENHANCER_ENTITY_REFERENCE,
                new IRI(candidate.getRepresentation().getId())));
            for (Iterator<Reference> types = candidate.getRepresentation().getReferences(linkerConfig.getTypeField()); types.hasNext();) {
                graph.add(new TripleImpl(suggestionAnnotation, Properties.ENHANCER_ENTITY_TYPE,
                    new IRI(types.next().getReference())));
            }
            graph.add(new TripleImpl(suggestionAnnotation, Properties.ENHANCER_CONFIDENCE,
                literalFactory.createTypedLiteral(candidate.getScore())));
            //link the suggestion with all occurrences of the linked entity
            for (IRI occurrenceAnnotation : occurrenceAnnotations) {
                graph.add(new TripleImpl(suggestionAnnotation, Properties.DC_RELATION, occurrenceAnnotation));
            }
            //the name of the ReferencedSite that provided the suggestion
            graph.add(new TripleImpl(suggestionAnnotation, new IRI(RdfResourceEnum.site.getUri()),
                new PlainLiteralImpl(referencedSiteName)));
            //optionally include the RDF data of the suggested entity
            if (dereferenceEntitiesState) {
                graph.addAll(RdfValueFactory.getInstance().toRdfRepresentation(candidate.getRepresentation()).getRdfGraph());
            }
        }
    }
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class KuromojiNlpEngine method computeEnhancements.
/**
 * Computes the enhancements for the supplied ContentItem.
 * <p/>
 * The text of the ContentItem is processed by the configured tokenizer and
 * token filter chain. For every token a Token with a POS annotation (and a
 * morpho annotation if a base form is available) is added to the
 * AnalysedText. Sentences are detected based on tokens tagged with
 * {@link Pos#Point} and named entities are detected by mapping the part of
 * speech value of tokens to NER tags. Detected named entities are added as
 * Chunks with a NER annotation to the AnalysedText.
 * <p/>
 * Finally a fise:TextAnnotation is written to the metadata of the
 * ContentItem for every detected named entity. The metadata are modified
 * while holding the write lock of the ContentItem.
 *
 * @param ci the content item to process
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 * if reading from the token stream fails
 * @throws IllegalStateException if the language of the content item is
 * neither 'ja' nor starts with 'ja-'
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
    }
    //start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    //build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    //Try to extract sentences based on POS tags ...
    //start offset of the current sentence (< 0 if the last token ended a sentence)
    int sentStartOffset = -1;
    //NER data
    List<NerData> nerList = new ArrayList<NerData>();
    //the next index where the NerData.context need to be set
    int nerSentIndex = 0;
    //the named entity currently built (null if the last token was not part of one)
    NerData ner = null;
    //stays null if the token stream does not provide any token
    OffsetAttribute offset = null;
    try {
        //required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            //Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            String pos = posAttr.getPartOfSpeech();
            PosTag posTag = POS_TAG_SET.getTag(pos);
            if (posTag == null) {
                posTag = adhocTags.get(pos);
                if (posTag == null) {
                    //unmapped tag: create (and remember) an ad-hoc PosTag so
                    //that the warning is only logged once per tag
                    posTag = new PosTag(pos);
                    adhocTags.put(pos, posTag);
                    log.warn(" ... missing PosTag mapping for {}", pos);
                }
            }
            //Sentence detection by POS tag
            if (sentStartOffset < 0) {
                //the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                //add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            //POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            //NER
            NerTag nerTag = NER_TAG_SET.getTag(pos);
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                //the current token is not part of the named entity built so
                //far: write the NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                //NOTE that the fise:TextAnnotation are written later based on the nerList
                //clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                //extend the named entity to the end of the current token
                ner.end = offset.endOffset();
            }
            //Morpho features (only written if a base form is available)
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                MorphoFeatures morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                //and add the posTag
                morpho.addPos(posTag);
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        //the token stream may end while a named entity is still open. In that
        //case no following token triggers writing of the NER annotation, so
        //it needs to be written here. Otherwise a fise:TextAnnotation would
        //be created for the entity without a corresponding NER chunk.
        if (ner != null) {
            Chunk chunk = at.addChunk(ner.start, ner.end);
            chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
            ner = null;
        }
        //we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        //and set the context of remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                //no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException ignored) {
            //best effort: a failure to close the stream is deliberately
            //ignored so that it does not hide the result (or the exception)
            //of the processing above
        }
    }
    //finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class UserResource method addRole.
/**
 * Assigns the role with the given name to the parsed user in the system
 * graph.
 * <p>
 * If {@link #getTitleNode(String)} provides a node for the role name, the
 * user is linked to this node. Otherwise a new named role node
 * (<code>rolesBase + roleName</code>) with the role name as title is
 * created and linked to the user.
 *
 * @param userNode node corresponding to user
 * @param roleName name of the role
 * @return the parsed user node
 */
private GraphNode addRole(GraphNode userNode, String roleName) {
    GraphNode existingRole = getTitleNode(roleName);
    if (existingRole != null) {
        // the role is already known: just link it with the user
        userNode.addProperty(SIOC.has_function, existingRole.getNode());
        return userNode;
    }
    // unknown role: create it as a named node before linking it
    IRI roleIri = new IRI(rolesBase + roleName);
    GraphNode createdRole = new GraphNode(roleIri, systemGraph);
    createdRole.addProperty(RDF.type, PERMISSION.Role);
    createdRole.addProperty(DC.title, new PlainLiteralImpl(roleName));
    userNode.addProperty(SIOC.has_function, roleIri);
    return userNode;
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class UserResource method createUser.
/**
 * Creates a new user with the specified user name.
 * <p>
 * The user is represented by a new blank node in the system graph, typed
 * as <code>FOAF.Agent</code>, and gets the "BasePermissionsRole" role
 * assigned.
 *
 * @param newUserName the name of the user to create
 * @return user node in system graph
 */
private GraphNode createUser(String newUserName) {
    GraphNode createdUser = new GraphNode(new BlankNode(), systemGraph);
    createdUser.addProperty(RDF.type, FOAF.Agent);
    createdUser.addProperty(PLATFORM.userName, new PlainLiteralImpl(newUserName));
    // addRole(..) returns the user node parsed to it
    return addRole(createdUser, "BasePermissionsRole");
}
use of org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl in project stanbol by apache.
the class OpenCalaisEngine method createEnhancements.
/**
 * Generates the enhancement structures for the entity occurrences received
 * from OpenCalais and adds them to the metadata of the content item.
 * <p>
 * A TextAnnotation (type, selected text, start, end, selection context and
 * - if a non-negative relevance is available - confidence) is created for
 * every occurrence. In <code>onlyNERMode</code> the TextAnnotation of the
 * first occurrence of an entity is remembered and linked via dc:relation
 * with the TextAnnotations of all further occurrences of the same entity.
 * The creation of Calais specific EntityAnnotations is currently disabled
 * (see the commented code within the method).
 *
 * @param occs a Collection of entity information
 * @param ci the content item
 */
public void createEnhancements(Collection<CalaisEntityOccurrence> occs, ContentItem ci) {
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    // language of the plain literals that represent parts of the content
    String langString = EnhancementEngineHelper.getLanguage(ci);
    final Language language = (langString == null || langString.isEmpty()) ? null : new Language(langString);
    //TODO create TextEnhancement (form, start, end, type?) and EntityAnnotation (id, name, type)
    // entity id -> annotation that further occurrences of the entity are related to
    HashMap<RDFTerm, IRI> entityAnnotationMap = new HashMap<RDFTerm, IRI>();
    for (CalaisEntityOccurrence occ : occs) {
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        Graph model = ci.getMetadata();
        model.add(new TripleImpl(textAnnotation, DC_TYPE, occ.type));
        // for autotagger use the name instead of the matched term (that might be a pronoun!)
        model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
            new PlainLiteralImpl(onlyNERMode ? occ.name : occ.exact, language)));
        model.add(new TripleImpl(textAnnotation, ENHANCER_START,
            literalFactory.createTypedLiteral(occ.offset)));
        model.add(new TripleImpl(textAnnotation, ENHANCER_END,
            literalFactory.createTypedLiteral(occ.offset + occ.length)));
        model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
            new PlainLiteralImpl(occ.context, language)));
        // the relevance is used as confidence. It might not be available
        // (NULL) or the relevance feature might not be activated (-1)
        if (occ.relevance != null && Double.compare(0d, occ.relevance) <= 0) {
            model.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE,
                literalFactory.createTypedLiteral(occ.relevance)));
        }
        // the annotation for an entity is only registered once. Further
        // occurrences just add a reference to their textAnnotation
        IRI knownAnnotation = entityAnnotationMap.get(occ.id);
        if (knownAnnotation != null) {
            model.add(new TripleImpl(knownAnnotation, DC_RELATION, textAnnotation));
        } else if (onlyNERMode) {
            // don't create Calais specific entity annotations; let the autotagger do its own
            // but add a pointer to the first text annotation with that name
            entityAnnotationMap.put(occ.id, textAnnotation);
        }
        // else: disabled creation of Calais specific entity annotations
        //     IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(ci, this);
        //     entityAnnotationMap.put(occ.id, entityAnnotation);
        //     model.add(new TripleImpl(entityAnnotation, DC_RELATION, textAnnotation));
        //     model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, occ.name));
        //     model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, occ.type));
        //     model.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, occ.id));
    }
}
Aggregations