
Example 41 with TripleImpl

use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

the class TestEntityLinkingEnhancementEngine method initContentItem.

/**
 * Creates and initialises a new content item using {@link #CONTEXT} as
 * content and adds the three text annotations consumed by this test.
 * @return the initialised content item
 * @throws IOException if the content item cannot be created
 */
private ContentItem initContentItem() throws IOException {
    ContentItem ci = ciFactory.createContentItem(new IRI("urn:iks-project:enhancer:text:content-item:person"), new StringSource(CONTEXT));
    // add three text annotations to be consumed by this test
    getTextAnnotation(ci, PERSON, CONTEXT, DBPEDIA_PERSON);
    getTextAnnotation(ci, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
    getTextAnnotation(ci, PLACE, CONTEXT, DBPEDIA_PLACE);
    // add the language
    ci.getMetadata().add(new TripleImpl(ci.getUri(), Properties.DC_LANGUAGE, new PlainLiteralImpl("en")));
    return ci;
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem)
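
A minimal companion sketch (not part of the Stanbol sources) showing how the dc:language triple added above can be read back from the content item's metadata. It relies only on APIs already used in this example plus Clerezza's Graph.filter(subject, predicate, object), where null acts as a wildcard; the helper name getLanguage is made up for illustration:

private static String getLanguage(ContentItem ci) {
    // filter for the dc:language triple added in initContentItem()
    Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), Properties.DC_LANGUAGE, null);
    if (it.hasNext()) {
        // initContentItem() adds a PlainLiteralImpl, so the object is a Literal
        return ((Literal) it.next().getObject()).getLexicalForm();
    }
    // no language triple present
    return null;
}

Additionally needed imports: java.util.Iterator, org.apache.clerezza.commons.rdf.Literal, org.apache.clerezza.commons.rdf.Triple.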

Example 42 with TripleImpl

use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

the class LocationEnhancementEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    IRI contentItemId = ci.getUri();
    Graph graph = ci.getMetadata();
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    // get all the textAnnotations
    /*
         * This Map holds the name as key and all text annotations of
         * dc:type dbpedia:Place that select this name as value.
         * It is used to avoid multiple lookups for text annotations
         * selecting the same name.
         */
    Map<String, Collection<BlankNodeOrIRI>> name2placeEnhancementMap = new HashMap<String, Collection<BlankNodeOrIRI>>();
    Iterator<Triple> iterator = graph.filter(null, DC_TYPE, DBPEDIA_PLACE);
    while (iterator.hasNext()) {
        // the enhancement annotating a place
        BlankNodeOrIRI placeEnhancement = iterator.next().getSubject();
        // this can still be a TextAnnotation or an EntityAnnotation,
        // so we need to filter for TextAnnotations
        Triple isTextAnnotation = new TripleImpl(placeEnhancement, RDF_TYPE, ENHANCER_TEXTANNOTATION);
        if (graph.contains(isTextAnnotation)) {
            // now get the name
            String name = EnhancementEngineHelper.getString(graph, placeEnhancement, ENHANCER_SELECTED_TEXT);
            if (name == null) {
                log.warn("Unable to process TextAnnotation " + placeEnhancement + " because property" + ENHANCER_SELECTED_TEXT + " is not present");
            } else {
                Collection<BlankNodeOrIRI> placeEnhancements = name2placeEnhancementMap.get(name);
                if (placeEnhancements == null) {
                    placeEnhancements = new ArrayList<BlankNodeOrIRI>();
                    name2placeEnhancementMap.put(name, placeEnhancements);
                }
                placeEnhancements.add(placeEnhancement);
            }
        } else {
        // TODO: if we also want to process EntityAnnotations with the dc:type dbpedia:Place
        // then we need to parse the name based on the enhancer:entity-name property
        }
    }
    // Now we do have all the names we need to lookup
    Map<SearchRequestPropertyEnum, Collection<String>> requestParams = new EnumMap<SearchRequestPropertyEnum, Collection<String>>(SearchRequestPropertyEnum.class);
    if (getMaxLocationEnhancements() != null) {
        requestParams.put(SearchRequestPropertyEnum.maxRows, Collections.singleton(getMaxLocationEnhancements().toString()));
    }
    for (Map.Entry<String, Collection<BlankNodeOrIRI>> entry : name2placeEnhancementMap.entrySet()) {
        List<Toponym> results;
        try {
            requestParams.put(SearchRequestPropertyEnum.name, Collections.singleton(entry.getKey()));
            results = geonamesService.searchToponyms(requestParams);
        } catch (Exception e) {
            /*
                     * TODO: Review whether it makes sense to catch per name here,
                     * or to catch around the whole loop instead.
                     * This depends on whether single requests can fail individually
                     * (e.g. because of encoding problems) or whether Exceptions are
                     * usually caused by general issues such as connection problems
                     * or service unavailability.
                     */
            throw new EngineException(this, ci, e);
        }
        if (results != null) {
            Double maxScore = results.isEmpty() ? null : results.get(0).getScore();
            for (Toponym result : results) {
                log.debug("process result {} {}", result.getGeoNameId(), result.getName());
                Double score = getToponymScore(result, maxScore);
                log.debug("  > score {}", score);
                if (score != null) {
                    if (score < minScore) {
                        // if the score is lower than the lower bound, then stop
                        break;
                    }
                } else {
                    log.warn("NULL returned as Score for " + result.getGeoNameId() + " " + result.getName());
                /*
                         * NOTE: If the score is not present, all suggestions
                         * are added as enhancements to the metadata of the
                         * content item.
                         */
                }
                // write the enhancement!
                BlankNodeOrIRI locationEnhancement = writeEntityEnhancement(contentItemId, graph, literalFactory, result, entry.getValue(), null, score);
                log.debug("  > {}  >= {}", score, minHierarchyScore);
                if (score != null && score >= minHierarchyScore) {
                    log.debug("  > getHierarchy for {} {}", result.getGeoNameId(), result.getName());
                    // get the hierarchy
                    try {
                        Iterator<Toponym> hierarchy = getHierarchy(result).iterator();
                        for (int level = 0; hierarchy.hasNext(); level++) {
                            Toponym hierarchyEntry = hierarchy.next();
                            // maybe add a configuration option
                            if (level == 0) {
                                // Mother earth -> ignore
                                continue;
                            }
                            // write it as dependent to the locationEnhancement
                            if (result.getGeoNameId() != hierarchyEntry.getGeoNameId()) {
                                // TODO: add additional checks based on possible
                                // configuration here!
                                log.debug("    - write hierarchy {} {}", hierarchyEntry.getGeoNameId(), hierarchyEntry.getName());
                                /*
                                     * The hierarchy service does not provide a score, because
                                     * it would always be 1.0, so we need to set the score
                                     * ourselves. Here it is set to 1.0 for the hierarchy entry.
                                     */
                                writeEntityEnhancement(contentItemId, graph, literalFactory, hierarchyEntry, null, Collections.singletonList(locationEnhancement), 1.0);
                            }
                        }
                    } catch (Exception e) {
                        log.warn("Unable to get Hierarchy for " + result.getGeoNameId() + " " + result.getName(), e);
                    }
                }
            }
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) EnumMap(java.util.EnumMap) SearchRequestPropertyEnum(org.apache.stanbol.enhancer.engines.geonames.impl.GeonamesAPIWrapper.SearchRequestPropertyEnum) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ConfigurationException(org.osgi.service.cm.ConfigurationException) IOException(java.io.IOException) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) Collection(java.util.Collection) Map(java.util.Map) EnumMap(java.util.EnumMap) HashMap(java.util.HashMap)
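
The get-or-create sequence that fills name2placeEnhancementMap above can be written more compactly on Java 8+ with Map.computeIfAbsent; a behaviour-equivalent sketch (the helper name is made up for illustration):

private static void addPlaceEnhancement(Map<String, Collection<BlankNodeOrIRI>> name2placeEnhancementMap, String name, BlankNodeOrIRI placeEnhancement) {
    // replaces the explicit get / null-check / put sequence in computeEnhancements()
    name2placeEnhancementMap.computeIfAbsent(name, k -> new ArrayList<BlankNodeOrIRI>()).add(placeEnhancement);
}

Both forms produce the same map; which to prefer is a style (and target-JDK) question.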

Example 43 with TripleImpl

use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

the class EntityCoReferenceEngineTest method testSpatialCoref.

@Test
public void testSpatialCoref() throws EngineException, IOException {
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph graph = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    graph.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    graph.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    graph.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText at = atFactory.createAnalysedText(ci, textBlob.getValue());
    Sentence sentence1 = at.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = sentence1.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    Sentence sentence2 = at.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStartIdx = sentence2.getSpan().indexOf("The");
    int germanStartIdx = sentence2.getSpan().indexOf("German");
    int chancellorStartIdx = sentence2.getSpan().indexOf("politician");
    Token the = sentence2.addToken(theStartIdx, theStartIdx + "The".length());
    the.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token german = sentence2.addToken(germanStartIdx, germanStartIdx + "German".length());
    german.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politician = sentence2.addToken(chancellorStartIdx, chancellorStartIdx + "politician".length());
    politician.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanChancellor = sentence2.addChunk(theStartIdx, chancellorStartIdx + "politician".length());
    theGermanChancellor.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanChancellor));
    Value<CorefFeature> subordinateCorefValue = theGermanChancellor.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertTrue(!subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 44 with TripleImpl

use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

the class EntityLinkingEngineTest method setUpServices.

@BeforeClass
public static void setUpServices() throws IOException {
    searcher = new TestSearcherImpl(TEST_REFERENCED_SITE_NAME, NAME, new SimpleLabelTokenizer());
    // add some terms to the searcher
    Graph graph = new IndexedGraph();
    IRI uri = new IRI("urn:test:PatrickMarshall");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Patrick Marshall")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PERSON));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:Geologist");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Geologist")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    graph.add(new TripleImpl(uri, REDIRECT, new IRI("urn:test:redirect:Geologist")));
    searcher.addEntity(new Entity(uri, graph));
    // a redirect
    uri = new IRI("urn:test:redirect:Geologist");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Geologe (redirect)")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:NewZealand");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("New Zealand")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:UniversityOfOtago");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University of Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:University");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University")));
    graph.add(new TripleImpl(uri, TYPE, new IRI(NamespaceEnum.skos + "Concept")));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:Otago");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    // add a 2nd Otago (Place and University)
    uri = new IRI("urn:test:Otago_Texas");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago (Texas)")));
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("Otago")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_PLACE));
    searcher.addEntity(new Entity(uri, graph));
    uri = new IRI("urn:test:UniversityOfOtago_Texas");
    graph.add(new TripleImpl(uri, NAME, new PlainLiteralImpl("University of Otago (Texas)")));
    graph.add(new TripleImpl(uri, TYPE, OntologicalClasses.DBPEDIA_ORGANISATION));
    searcher.addEntity(new Entity(uri, graph));
    TEST_ANALYSED_TEXT = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT)));
    TEST_ANALYSED_TEXT_WO = AnalysedTextFactory.getDefaultInstance().createAnalysedText(ciFactory.createBlob(new StringSource(TEST_TEXT_WO)));
    initAnalyzedText(TEST_ANALYSED_TEXT);
    TEST_ANALYSED_TEXT.addChunk(0, "Dr. Patrick Marshall".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT.addToken(4, 11).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT.addToken(12, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    initAnalyzedText(TEST_ANALYSED_TEXT_WO);
    TEST_ANALYSED_TEXT_WO.addChunk(0, "Dr. Marshall Patrick".length()).addAnnotation(PHRASE_ANNOTATION, NOUN_PHRASE);
    TEST_ANALYSED_TEXT_WO.addToken(4, 12).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
    TEST_ANALYSED_TEXT_WO.addToken(13, 20).addAnnotation(POS_ANNOTATION, Value.value(new PosTag("NP", Pos.ProperNoun), 1d));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) LinkedEntity(org.apache.stanbol.enhancer.engines.entitylinking.impl.LinkedEntity) Entity(org.apache.stanbol.enhancer.engines.entitylinking.Entity) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) SimpleLabelTokenizer(org.apache.stanbol.enhancer.engines.entitylinking.labeltokenizer.SimpleLabelTokenizer) TestSearcherImpl(org.apache.stanbol.enhancer.engines.entitylinking.impl.TestSearcherImpl) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) BeforeClass(org.junit.BeforeClass)

Example 45 with TripleImpl

use of org.apache.clerezza.commons.rdf.impl.utils.TripleImpl in project stanbol by apache.

the class NIFHelper method writePhrase.

/**
 * Writes a {@link NlpAnnotations#PHRASE_ANNOTATION} as NIF 1.0 to the
 * parsed RDF graph by using the segmentUri as subject
 * @param graph the graph
 * @param annotated the annotated element (e.g. a {@link Chunk})
 * @param segmentUri the URI of the resource representing the parsed
 * annotated element in the graph
 */
public static void writePhrase(Graph graph, Annotated annotated, IRI segmentUri) {
    Value<PhraseTag> phraseTag = annotated.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
    if (phraseTag != null) {
        IRI phraseTypeUri = LEXICAL_TYPE_TO_PHRASE_TYPE.get(phraseTag.value().getCategory());
        if (phraseTypeUri != null) {
            // add the oliaLink for the Phrase
            graph.add(new TripleImpl(segmentUri, SsoOntology.oliaLink.getUri(), phraseTypeUri));
            graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, lf.createTypedLiteral(phraseTag.probability())));
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag)
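
A hypothetical usage sketch (not part of the Stanbol sources): given a Chunk carrying a PHRASE_ANNOTATION, such as theGermanChancellor from Example 43, writePhrase can serialise it into a fresh in-memory graph. The segment URI below is made up; a real NIF segment URI would encode the chunk's character offsets:

Graph graph = new SimpleGraph();
// hypothetical NIF-style segment URI for the chunk
IRI segmentUri = new IRI("urn:test:content#char=0,21");
NIFHelper.writePhrase(graph, theGermanChancellor, segmentUri);
// if a phrase type mapping exists, graph now holds the oliaLink and confidence triples

SimpleGraph is org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph (see the Aggregations list below).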

Aggregations

TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 143
IRI (org.apache.clerezza.commons.rdf.IRI): 104
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 69
Graph (org.apache.clerezza.commons.rdf.Graph): 66
BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI): 49
Triple (org.apache.clerezza.commons.rdf.Triple): 41
RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm): 26
EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 23
HashMap (java.util.HashMap): 20
Language (org.apache.clerezza.commons.rdf.Language): 20
Literal (org.apache.clerezza.commons.rdf.Literal): 20
LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory): 20
IOException (java.io.IOException): 18
SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph): 17
Test (org.junit.Test): 16
ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem): 15
IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph): 14
HashSet (java.util.HashSet): 13
StringSource (org.apache.stanbol.enhancer.servicesapi.impl.StringSource): 13
BlankNode (org.apache.clerezza.commons.rdf.BlankNode): 11