Search in sources :

Example 26 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

the class EnhancementRDFUtils method writeEntityAnnotation.

/**
 * Creates an entity annotation for the parsed suggestion and adds its triples to the graph.
 * <p>
 * The label written for the entity is the matched label of the suggestion, unless that label
 * does not fit the requested language and the entity provides another value of
 * <code>nameField</code> that does. If no fitting label exists, the matched label is used.
 *
 * @param engine
 *            the engine parsed to {@link EnhancementEngineHelper#createEntityEnhancement}
 * @param literalFactory
 *            the LiteralFactory to use
 * @param graph
 *            the Graph to use
 * @param contentItemId
 *            the contentItemId the enhancement is extracted from
 * @param relatedEnhancements
 *            enhancements this textAnnotation is related to
 * @param suggestion
 *            the entity suggestion
 * @param nameField the field used to extract the name
 * @param lang the preferred language to include or <code>null</code> if none. If
 *            <code>null</code>, labels without a language are preferred.
 * @return the IRI of the created entity annotation
 */
public static IRI writeEntityAnnotation(EnhancementEngine engine, LiteralFactory literalFactory, Graph graph, IRI contentItemId, Collection<BlankNodeOrIRI> relatedEnhancements, Suggestion suggestion, String nameField, String lang) {
    Representation rep = suggestion.getEntity().getRepresentation();
    // 1. extract the "best label"
    // Start with the matched one
    Text label = suggestion.getMatchedLabel();
    // if the matched label is not in the requested language
    // search if a better label is available for this Entity
    if (!isLanguageMatch(label, lang)) {
        Iterator<Text> labels = rep.getText(nameField);
        while (labels.hasNext()) {
            Text actLabel = labels.next();
            if (isLanguageMatch(actLabel, lang)) {
                // the language matches -> override the matched label
                // and stop at the first fitting label
                label = actLabel;
                break;
            }
        }
    }
    // else the matched label will be the best to use
    Literal literal;
    if (label.getLanguage() == null) {
        literal = new PlainLiteralImpl(label.getText());
    } else {
        literal = new PlainLiteralImpl(label.getText(), new Language(label.getLanguage()));
    }
    // Now create the entityAnnotation
    IRI entityAnnotation = EnhancementEngineHelper.createEntityEnhancement(graph, engine, contentItemId);
    // first relate this entity annotation to the text annotation(s)
    for (BlankNodeOrIRI enhancement : relatedEnhancements) {
        graph.add(new TripleImpl(entityAnnotation, DC_RELATION, enhancement));
    }
    IRI entityUri = new IRI(rep.getId());
    // add the link to the referred entity
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_REFERENCE, entityUri));
    // add the label selected above
    graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_LABEL, literal));
    // the confidence is only written if the suggestion has a score
    if (suggestion.getScore() != null) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getScore())));
    }
    // add all rdf:type values of the entity
    Iterator<Reference> types = rep.getReferences(RDF_TYPE.getUnicodeString());
    while (types.hasNext()) {
        graph.add(new TripleImpl(entityAnnotation, ENHANCER_ENTITY_TYPE, new IRI(types.next().getReference())));
    }
    // add the name of the ReferencedSite that manages the Entity
    if (suggestion.getEntity().getSite() != null) {
        graph.add(new TripleImpl(entityAnnotation, new IRI(RdfResourceEnum.site.getUri()), new PlainLiteralImpl(suggestion.getEntity().getSite())));
    }
    return entityAnnotation;
}

/**
 * Checks if the language of the parsed label fits the requested language.
 * <p>
 * A <code>null</code> request only matches labels without a language. Any other request
 * matches labels whose language starts with the requested value. This check is null-safe:
 * {@link String#startsWith(String)} is never called with a <code>null</code> prefix.
 *
 * @param label the label to check
 * @param lang the requested language or <code>null</code> if none
 * @return <code>true</code> if the language of the label fits the requested language
 */
private static boolean isLanguageMatch(Text label, String lang) {
    String labelLang = label.getLanguage();
    if (lang == null) {
        return labelLang == null;
    }
    return labelLang != null && labelLang.startsWith(lang);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Language(org.apache.clerezza.commons.rdf.Language) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Reference(org.apache.stanbol.entityhub.servicesapi.model.Reference) Literal(org.apache.clerezza.commons.rdf.Literal) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 27 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

the class NamedEntityTaggingEngine method computeEntityRecommentations.

/**
 * Queries for Entities matching the parsed named entity and converts the results to a
 * scored and sorted list of suggestions.
 * <p>
 * The name of the named entity is used as constraint on the configured name field. For
 * persons, organisations and places the per-type state decides if suggestions are computed
 * at all and the per-type value (if not <code>null</code>) is added as rdf:type constraint.
 *
 * @param site
 *            The {@link Site} to query or <code>null</code> to use the {@link Entityhub}
 * @param namedEntity
 *            the named entity (name and type) to compute suggestions for
 * @param subsumedAnnotations
 *            other text annotations for the same entity (not read by this method)
 * @param language
 *            the language of the analysed text or <code>null</code> if not available.
 * @return the suggestions for the parsed {@link NamedEntity}. An empty list if the type
 *         of the named entity is ignored or the query returned no results
 * @throws EntityhubException
 *             On any Error while looking up Entities via the Entityhub
 */
protected final List<Suggestion> computeEntityRecommentations(Site site, NamedEntity namedEntity, List<IRI> subsumedAnnotations, String language) throws EntityhubException {
    log.debug("Process {}", namedEntity);
    // no site parsed -> the Entityhub provides the query factory
    FieldQueryFactory queryFactory;
    if (site == null) {
        queryFactory = entityhub.getQueryFactory();
    } else {
        queryFactory = site.getQueryFactory();
    }
    log.trace("Will use a query-factory of type [{}].", queryFactory.getClass().toString());
    FieldQuery query = queryFactory.createFieldQuery();

    // TODO: make case sensitivity configurable
    boolean casesensitive = false;
    String namedEntityLabel = namedEntity.getName();
    if (!casesensitive) {
        namedEntityLabel = namedEntityLabel.toLowerCase();
    }
    // if the language is known search labels in that language and labels
    // without language; otherwise no language restriction is applied
    Constraint labelConstraint = language == null
            ? new TextConstraint(namedEntityLabel, casesensitive)
            : new TextConstraint(namedEntityLabel, casesensitive, language, null);
    query.setConstraint(nameField, labelConstraint);

    // per-type handling: either skip the named entity or (optionally)
    // restrict the rdf:type of the results
    if (OntologicalClasses.DBPEDIA_PERSON.equals(namedEntity.getType())) {
        if (!personState) {
            // personState is false -> ignore persons
            return Collections.emptyList();
        }
        if (personType != null) {
            query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(personType));
        } // else no type constraint
    } else if (DBPEDIA_ORGANISATION.equals(namedEntity.getType())) {
        if (!orgState) {
            // orgState is false -> ignore organisations
            return Collections.emptyList();
        }
        if (orgType != null) {
            query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(orgType));
        } // else no type constraint
    } else if (OntologicalClasses.DBPEDIA_PLACE.equals(namedEntity.getType())) {
        if (!placeState) {
            // placeState is false -> ignore places
            return Collections.emptyList();
        }
        if (placeType != null) {
            query.setConstraint(RDF_TYPE.getUnicodeString(), new ReferenceConstraint(placeType));
        } // else no type constraint
    }

    // request more results than suggestions are returned
    query.setLimit(Math.max(20, numSuggestions * 3));
    log.trace("A query has been created of type [{}] and the following settings:\n{}", query.getClass().toString(), query.toString());

    // execute the query on the Entityhub (no site) or on the parsed site
    final QueryResultList<Entity> results;
    if (site == null) {
        log.trace("A query will be sent to the entity-hub of type [{}].", entityhub.getClass());
        results = entityhub.findEntities(query);
    } else {
        log.trace("A query will be sent to a site [id :: {}][type :: {}].", site.getId(), site.getClass());
        results = site.findEntities(query);
    }
    log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
    if (results.isEmpty()) {
        // no results nothing to do
        return Collections.emptyList();
    }

    // we need to normalise the confidence values from [0..1]
    // * levenshtein distance as absolute (1.0 for exact match)
    // * Solr scores * levenshtein to rank entities relative to each other
    Float maxScore = null; // score of the first result
    Float maxExactScore = null; // score of the first result with an exact label match
    List<Suggestion> matches = new ArrayList<Suggestion>(numSuggestions);
    // assumes entities are sorted by score
    Iterator<Entity> guesses = results.iterator();
    while (guesses.hasNext()) {
        Suggestion match = new Suggestion(guesses.next());
        Representation rep = match.getEntity().getRepresentation();
        Float score = rep.getFirst(RdfResourceEnum.resultScore.getUri(), Float.class);
        if (maxScore == null) {
            maxScore = score;
        }
        // find the label with the best similarity; stop on the first exact match
        for (Iterator<Text> labels = rep.getText(nameField); labels.hasNext() && match.getLevenshtein() < 1.0; ) {
            Text label = labels.next();
            // Labels are accepted if the content language is unknown, if the
            // label has no language or if the label is in the same language
            // as the content. Skip all others.
            if (language != null && label.getLanguage() != null && !label.getLanguage().startsWith(language)) {
                continue;
            }
            String labelText = casesensitive ? label.getText() : label.getText().toLowerCase();
            double actMatch = levenshtein(labelText, namedEntityLabel);
            if (actMatch > match.getLevenshtein()) {
                match.setLevenshtein(actMatch);
                match.setMatchedLabel(label);
            }
        }
        if (match.getMatchedLabel() == null) {
            log.debug("No value of {} for Entity {}!", nameField, match.getEntity().getId());
            continue;
        }
        double normalisedScore;
        if (match.getLevenshtein() == 1.0) {
            if (maxExactScore == null) {
                maxExactScore = score;
            }
            // normalise exact matches against the best exact score
            normalisedScore = score.doubleValue() / maxExactScore.doubleValue();
        } else {
            // normalise partial matches against the best match and the
            // Levenshtein similarity with the label
            normalisedScore = score.doubleValue() * match.getLevenshtein() / maxScore.doubleValue();
        }
        match.setScore(normalisedScore);
        matches.add(match);
    }
    // now sort the results and cut them to the configured number of suggestions
    Collections.sort(matches);
    return matches.subList(0, Math.min(matches.size(), numSuggestions));
}
Also used : FieldQuery(org.apache.stanbol.entityhub.servicesapi.query.FieldQuery) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) ReferenceConstraint(org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint) Constraint(org.apache.stanbol.entityhub.servicesapi.query.Constraint) TextConstraint(org.apache.stanbol.entityhub.servicesapi.query.TextConstraint) ArrayList(java.util.ArrayList) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) FieldQueryFactory(org.apache.stanbol.entityhub.servicesapi.query.FieldQueryFactory) ReferenceConstraint(org.apache.stanbol.entityhub.servicesapi.query.ReferenceConstraint) TextConstraint(org.apache.stanbol.entityhub.servicesapi.query.TextConstraint)

Example 28 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

the class ValueFactoryTest method testText.

/**
 * Internally used to create and test {@link Text}s for the different tests.
 * <p>
 * Creates the text by using the {@link ValueFactory} returned by
 * <code>getValueFactory()</code> and asserts that the text and the language of the
 * created instance correspond to the parsed values.
 *
 * @param textString
 *            the natural language text as string
 * @param language
 *            the language
 * @return the created {@link Text} instance that can be used to perform further tests.
 */
private Text testText(String textString, String language) {
    ValueFactory vf = getValueFactory();
    Text text = vf.createText(textString, language);
    // check the instance first, so that a factory returning null is reported
    // as assertion failure and not as NullPointerException
    assertNotNull(text);
    assertNotNull(text.getText());
    // expected value first, as required by the JUnit assertEquals contract
    assertEquals(textString, text.getText());
    if (language == null) {
        assertTrue(text.getLanguage() == null);
    } else if (language.isEmpty()) {
        // implementations are free to change an empty language string to null
        // NOTE that it is not allowed to change NULL to an empty String!
        assertTrue(text.getLanguage() == null || text.getLanguage().isEmpty());
    } else {
        assertNotNull(text.getLanguage());
        assertEquals(language, text.getLanguage());
    }
    return text;
}
Also used : Text(org.apache.stanbol.entityhub.servicesapi.model.Text) ValueFactory(org.apache.stanbol.entityhub.servicesapi.model.ValueFactory)

Example 29 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

the class RepresentationTest method testNonExistingFields.

/**
 * Checks that all value accessors of a {@link Representation} return an empty Iterator
 * (and never <code>null</code>) when asked for a field that does not exist.
 */
@Test
public void testNonExistingFields() {
    String missingField = "urn:this.field:does.not:exist";
    Representation representation = createRepresentation(null);
    // Iterators MUST NOT be NULL but MUST NOT contain any element
    Iterator<String> fieldNames = representation.getFieldNames();
    assertEmptyIterator(fieldNames);
    Iterator<Object> values = representation.get(missingField);
    assertEmptyIterator(values);
    Iterator<Reference> references = representation.getReferences(missingField);
    assertEmptyIterator(references);
    Iterator<Text> texts = representation.get(missingField, (String[]) null);
    assertEmptyIterator(texts);
}

/**
 * Asserts that the parsed Iterator is not <code>null</code> and has no elements.
 *
 * @param it the Iterator to check
 */
private static void assertEmptyIterator(Iterator<?> it) {
    assertNotNull(it);
    assertFalse(it.hasNext());
}
Also used : Reference(org.apache.stanbol.entityhub.servicesapi.model.Reference) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) Test(org.junit.Test)

Example 30 with Text

use of org.apache.stanbol.entityhub.servicesapi.model.Text in project stanbol by apache.

the class RepresentationTest method testRemoveAllTextsOfMultipleLanguages.

/**
 * Removes all texts of the languages "de" and "de-AT" from the test field and checks
 * that exactly the two texts of those languages are no longer returned.
 */
@Test
public void testRemoveAllTextsOfMultipleLanguages() {
    String field = "urn:the.field:used.for.this.Test";
    Representation rep = initNaturalLanguageTest(field);
    // starts with all test texts; every text still present is removed below,
    // so only the texts deleted from the representation remain in this set
    Set<String> missingTexts = new HashSet<String>(NL_TEST_all);
    // remove all texts of multiple languages
    rep.removeAllNaturalText(field, "de", "de-AT");
    Iterator<Text> remaining = rep.getText(field);
    while (remaining.hasNext()) {
        missingTexts.remove(remaining.next().getText());
    }
    assertTrue(missingTexts.size() == 2);
    assertTrue(missingTexts.remove(NL_TEST_de));
    assertTrue(missingTexts.remove(NL_TEST_de_AT));
}
Also used : Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

Text (org.apache.stanbol.entityhub.servicesapi.model.Text)50 Representation (org.apache.stanbol.entityhub.servicesapi.model.Representation)32 Test (org.junit.Test)24 HashSet (java.util.HashSet)14 Reference (org.apache.stanbol.entityhub.servicesapi.model.Reference)12 ArrayList (java.util.ArrayList)11 IRI (org.apache.clerezza.commons.rdf.IRI)6 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)4 Entity (org.apache.stanbol.entityhub.servicesapi.model.Entity)4 ValueFactory (org.apache.stanbol.entityhub.servicesapi.model.ValueFactory)4 RepresentationTest (org.apache.stanbol.entityhub.test.model.RepresentationTest)4 Graph (org.apache.clerezza.commons.rdf.Graph)3 Language (org.apache.clerezza.commons.rdf.Language)3 Literal (org.apache.clerezza.commons.rdf.Literal)3 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)3 EntityhubException (org.apache.stanbol.entityhub.servicesapi.EntityhubException)3 FieldQuery (org.apache.stanbol.entityhub.servicesapi.query.FieldQuery)3 TextConstraint (org.apache.stanbol.entityhub.servicesapi.query.TextConstraint)3 URI (java.net.URI)2 URL (java.net.URL)2