use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.
the class EnhancementRDFUtils method writeEntityAnnotation.
/**
 * Writes an entity annotation describing the parsed suggestion to the graph.
 * <p>
 * The annotation is created via
 * <code>EnhancementEngineHelper.createEntityEnhancement(graph, engine, contentItemId)</code>
 * and is linked to all related enhancements, to the suggested entity, to the
 * best label found for the entity, to the score (if present), to the
 * <code>rdf:type</code> values of the entity and to the site managing the
 * entity (if present).
 * <p>
 * The label defaults to the matched label of the suggestion. If that label
 * does not match the requested language, the first value of
 * <code>nameField</code> that does match is used instead. If no value
 * matches, the matched label is kept.
 *
 * @param engine
 *            the engine parsed to the helper that creates the entity annotation
 * @param literalFactory
 *            the LiteralFactory used to create the typed literal for the score
 * @param graph
 *            the Graph the triples are added to
 * @param contentItemId
 *            the contentItemId the enhancement is extracted from
 * @param relatedEnhancements
 *            enhancements this entity annotation is related to
 * @param suggestion
 *            the entity suggestion
 * @param nameField
 *            the field used to extract the name
 * @param lang
 *            the preferred language to include or <code>null</code> if none.
 *            In case of <code>null</code> labels without a language are preferred.
 * @return the IRI of the created entity annotation
 */
public static IRI writeEntityAnnotation(EnhancementEngine engine, LiteralFactory literalFactory, Graph graph, IRI contentItemId, Collection<BlankNodeOrIRI> relatedEnhancements, Suggestion suggestion, String nameField, String lang) {
    Representation rep = suggestion.getEntity().getRepresentation();
    // 1. extract the "best label"
    // Start with the matched one
    Text label = suggestion.getMatchedLabel();
    // check if the matched label is in the requested language
    boolean langMatch = isLanguageMatch(label.getLanguage(), lang);
    // if not, search if a better label is available for this Entity
    if (!langMatch) {
        Iterator<Text> labels = rep.getText(nameField);
        while (labels.hasNext() && !langMatch) {
            Text actLabel = labels.next();
            langMatch = isLanguageMatch(actLabel.getLanguage(), lang);
            if (langMatch) {
                // the language matches -> override the matched label
                label = actLabel;
            }
        }
    }
    // else the matched label will be the best to use
    Literal literal;
    if (label.getLanguage() == null) {
        literal = new PlainLiteralImpl(label.getText());
    } else {
        literal = new PlainLiteralImpl(label.getText(), new Language(label.getLanguage()));
    }
    // Now create the entityAnnotation
    IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(graph, engine, contentItemId);
    // first relate this entity annotation to the text annotation(s)
    for (BlankNodeOrIRI enhancement : relatedEnhancements) {
        graph.add(new TripleImpl(entityAnnotation, DC_RELATION, enhancement));
    }
    IRI entityUri = new IRI(rep.getId());
    // add the link to the referred entity
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entityUri));
    // add the label selected above
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, literal));
    if (suggestion.getScore() != null) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
    }
    Iterator<Reference> types = rep.getReferences(RDF_TYPE.getUnicodeString());
    while (types.hasNext()) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, new IRI(types.next().getReference())));
    }
    // add the name of the ReferencedSite that manages the Entity
    if (suggestion.getEntity().getSite() != null) {
        graph.add(new TripleImpl(entityAnnotation, new IRI(RdfResourceEnum.site.getUri()), new PlainLiteralImpl(suggestion.getEntity().getSite())));
    }
    return entityAnnotation;
}

/**
 * Null-safe check if the language of a label matches the requested language.
 * Without a requested language only labels without a language match;
 * otherwise the label language needs to start with the requested one.
 *
 * @param labelLanguage
 *            the language of the label or <code>null</code> if it has none
 * @param lang
 *            the requested language or <code>null</code> if none
 * @return <code>true</code> if the label language matches the requested one
 */
private static boolean isLanguageMatch(String labelLanguage, String lang) {
    if (lang == null) {
        // String#startsWith(null) would throw a NullPointerException
        return labelLanguage == null;
    }
    return labelLanguage != null && labelLanguage.startsWith(lang);
}
use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.
the class NamedEntityTaggingEngine method computeEntityRecommentations.
/**
 * Looks up entities for the parsed named entity and converts the results
 * into ranked {@link Suggestion}s.
 * <p>
 * The name of the named entity is used as text constraint on the configured
 * name field. For persons, organisations and places the configured state
 * decides if the type is processed at all and the configured type (if any)
 * is added as <code>rdf:type</code> constraint. For every result the label
 * with the highest Levenshtein similarity to the name is selected and the
 * result score is normalised against the first (best) score.
 *
 * @param site
 *            the {@link Site} to query or <code>null</code> to use the {@link Entityhub}
 * @param namedEntity
 *            the named entity (name and type) to search suggestions for
 * @param subsumedAnnotations
 *            other text annotations for the same entity (not read by this method)
 * @param language
 *            the language of the analysed text or <code>null</code> if not available.
 * @return the suggestions for the parsed {@link NamedEntity}, sorted by the
 *         natural ordering of {@link Suggestion} and limited to the configured
 *         number of suggestions. An empty list if the type of the named entity
 *         is deactivated or the query returned no results.
 * @throws EntityhubException
 *             On any Error while looking up Entities via the Entityhub
 */
protected final List<Suggestion> computeEntityRecommentations(Site site, NamedEntity namedEntity, List<IRI> subsumedAnnotations, String language) throws EntityhubException {
    log.debug("Process {}", namedEntity);
    // if site is NULL use the Entityhub
    FieldQueryFactory queryFactory = site == null ? entityhub.getQueryFactory() : site.getQueryFactory();
    log.trace("Will use a query-factory of type [{}].", queryFactory.getClass().toString());
    FieldQuery query = queryFactory.createFieldQuery();
    Constraint labelConstraint;
    // TODO: make case sensitivity configurable
    boolean casesensitive = false;
    // Locale.ROOT keeps the lower case form independent of the default
    // locale of the JVM (e.g. no dotless 'i' with a Turkish locale)
    String namedEntityLabel = casesensitive ? namedEntity.getName() : namedEntity.getName().toLowerCase(java.util.Locale.ROOT);
    if (language != null) {
        // search labels in the language and without language
        labelConstraint = new TextConstraint(namedEntityLabel, casesensitive, language, null);
    } else {
        labelConstraint = new TextConstraint(namedEntityLabel, casesensitive);
    }
    query.setConstraint(nameField, labelConstraint);
    if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) {
        if (personState) {
            if (personType != null) {
                query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(personType));
            }
            // else no type constraint
        } else {
            // ignore people
            return Collections.emptyList();
        }
    } else if (DBPEDIA_ORGANISATION.equals(namedEntity.getType())) {
        if (orgState) {
            if (orgType != null) {
                query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(orgType));
            }
            // else no type constraint
        } else {
            // ignore organisations
            return Collections.emptyList();
        }
    } else if (OntologicalClasses.DBPEDIA_PLACE.equals(namedEntity.getType())) {
        if (this.placeState) {
            if (this.placeType != null) {
                query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(placeType));
            }
            // else no type constraint
        } else {
            // ignore places
            return Collections.emptyList();
        }
    }
    // request more results than needed, as results without a matching
    // label are dropped below
    query.setLimit(Math.max(20, this.numSuggestions * 3));
    log.trace("A query has been created of type [{}] and the following settings:\n{}", query.getClass().toString(), query.toString());
    if (null == site) {
        log.trace("A query will be sent to the entity-hub of type [{}].", entityhub.getClass());
    } else {
        log.trace("A query will be sent to a site [id :: {}][type :: {}].", site.getId(), site.getClass());
    }
    // if site is NULL use the Entityhub else the referenced site
    QueryResultList<Entity> results = site == null ? entityhub.findEntities(query) : site.findEntities(query);
    log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
    if (results.isEmpty()) {
        // no results nothing to do
        return Collections.emptyList();
    }
    // we need to normalise the confidence values from [0..1]
    // * levenshtein distance as absolute (1.0 for exact match)
    // * Solr scores * levenshtein to rank entities relative to each other
    Float maxScore = null;
    Float maxExactScore = null;
    List<Suggestion> matches = new ArrayList<Suggestion>(numSuggestions);
    // assumes entities are sorted by score
    for (Iterator<Entity> guesses = results.iterator(); guesses.hasNext(); ) {
        Suggestion match = new Suggestion(guesses.next());
        Representation rep = match.getEntity().getRepresentation();
        Float score = rep.getFirst(RdfResourceEnum.resultScore.getUri(), Float.class);
        if (score == null) {
            // NOTE(review): assumes getFirst(..) returns null for results that
            // carry no score value - confirm. Such a result can not be
            // normalised and would otherwise cause a NullPointerException.
            log.debug("No result score for Entity {} -> ignored", match.getEntity().getId());
            continue;
        }
        if (maxScore == null) {
            maxScore = score;
        }
        Iterator<Text> labels = rep.getText(nameField);
        // stop as soon as an exact match (similarity 1.0) was found
        while (labels.hasNext() && match.getLevenshtein() < 1.0) {
            Text label = labels.next();
            // accept all labels if the content language is unknown, labels
            // with no language and labels in the same language as the content
            if (language == null || label.getLanguage() == null || label.getLanguage().startsWith(language)) {
                String labelText = casesensitive ? label.getText() : label.getText().toLowerCase(java.util.Locale.ROOT);
                double actMatch = levenshtein(labelText, namedEntityLabel);
                if (actMatch > match.getLevenshtein()) {
                    match.setLevenshtein(actMatch);
                    match.setMatchedLabel(label);
                }
            }
        }
        if (match.getMatchedLabel() != null) {
            if (match.getLevenshtein() == 1.0) {
                if (maxExactScore == null) {
                    maxExactScore = score;
                }
                // normalise exact matches against the best exact score
                match.setScore(score.doubleValue() / maxExactScore.doubleValue());
            } else {
                // normalise partial matches against the best match and the
                // Levenshtein similarity with the label
                match.setScore(score.doubleValue() * match.getLevenshtein() / maxScore.doubleValue());
            }
            matches.add(match);
        } else {
            log.debug("No value of {} for Entity {}!", nameField, match.getEntity().getId());
        }
    }
    // now sort the results
    Collections.sort(matches);
    return matches.subList(0, Math.min(matches.size(), numSuggestions));
}
use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.
the class ValueFactoryTest method testText.
/**
 * Internally used to create and test {@link Text}s for the different tests.
 * Asserts that the created instance and its text are not <code>null</code>,
 * that the text equals the parsed string and that the language corresponds
 * to the parsed one.
 *
 * @param textString
 *            the natural language text as string
 * @param language
 *            the language
 * @return the created {@link Text} instance that can be used to perform further tests.
 */
private Text testText(String textString, String language) {
    ValueFactory vf = getValueFactory();
    Text text = vf.createText(textString, language);
    // check the instance first, so that a null return value is reported as
    // assertion failure instead of a NullPointerException
    assertNotNull(text);
    assertNotNull(text.getText());
    assertEquals(textString, text.getText());
    if (language == null) {
        assertTrue(text.getLanguage() == null);
    } else if (language.isEmpty()) {
        // implementations are free to change an empty language string to null
        // NOTE that it is not allowed to change NULL to an empty String!
        assertTrue(text.getLanguage() == null || text.getLanguage().isEmpty());
    } else {
        assertNotNull(text.getLanguage());
        assertEquals(language, text.getLanguage());
    }
    return text;
}
use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.
the class RepresentationTest method testNonExistingFields.
/**
 * Verifies that requesting the values of a field that does not exist yields
 * an empty Iterator. Returning <code>null</code> is not allowed in such cases.
 */
@Test
public void testNonExistingFields() {
    String missingField = "urn:this.field:does.not:exist";
    Representation representation = createRepresentation(null);
    // a new representation has no fields at all
    Iterator<String> fieldNames = representation.getFieldNames();
    assertEmptyIterator(fieldNames);
    // untyped values
    Iterator<Object> values = representation.get(missingField);
    assertEmptyIterator(values);
    // references
    Iterator<Reference> references = representation.getReferences(missingField);
    assertEmptyIterator(references);
    // texts requested with a null language array
    Iterator<Text> texts = representation.get(missingField, (String[]) null);
    assertEmptyIterator(texts);
}

/**
 * Asserts that the parsed Iterator is not <code>null</code> and has no elements.
 *
 * @param it
 *            the Iterator to check
 */
private static void assertEmptyIterator(Iterator<?> it) {
    assertNotNull(it);
    assertFalse(it.hasNext());
}
use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.
the class RepresentationTest method testRemoveAllTextsOfMultipleLanguages.
/**
 * Removes all texts of the languages "de" and "de-AT" from the test field
 * and checks that exactly the two texts of those languages are no longer
 * returned by the representation.
 */
@Test
public void testRemoveAllTextsOfMultipleLanguages() {
    String field = "urn:the.field:used.for.this.Test";
    Representation rep = initNaturalLanguageTest(field);
    // start with all test texts and drop every text still present
    Set<String> missingTexts = new HashSet<String>(NL_TEST_all);
    rep.removeAllNaturalText(field, "de", "de-AT");
    Iterator<Text> remaining = rep.getText(field);
    while (remaining.hasNext()) {
        Text present = remaining.next();
        missingTexts.remove(present.getText());
    }
    // only the two removed texts must be left over
    assertTrue(missingTexts.size() == 2);
    assertTrue(missingTexts.remove(NL_TEST_de));
    assertTrue(missingTexts.remove(NL_TEST_de_AT));
}
Aggregations