Search in sources :

Example 51 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class EnhancementEngineHelperTest method testTextAnnotationNewModel.

/**
 * Checks the selection information written by
 * {@link EnhancementEngineHelper#setOccurrence} in three situations: a short
 * selection, a long selection with head/tail enabled and the same long
 * selection with head/tail disabled.
 */
@Test
public void testTextAnnotationNewModel() {
    final Language language = new Language("en");
    final IRI contentItemUri = new IRI("http://www.example.org/contentItem#1");
    final Graph enhancements = new IndexedGraph();

    // (1) short selection: prefix, selected text and suffix are expected
    String text = "The Stanbol Enhancer can extract Entities form parsed Text.";
    int selectionStart = text.indexOf("Stanbol");
    int selectionEnd = selectionStart + "Stanbol Enhancer".length();
    IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(enhancements, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(
            enhancements, textAnnotation, text, selectionStart, selectionEnd, language, -1, true);
    Assert.assertEquals("The ",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals("Stanbol Enhancer",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertEquals(" can extra",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(selectionStart),
            EnhancementEngineHelper.get(enhancements, textAnnotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(selectionEnd),
            EnhancementEngineHelper.get(enhancements, textAnnotation, Properties.ENHANCER_END, Integer.class, lf));
    // neither head nor tail must be written for the short selection
    Assert.assertNull(
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_TAIL));

    // (2) long selection with head/tail enabled (last argument is true)
    text = "Ich habe den Schlüssel fürs Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss verlohren.";
    selectionStart = text.indexOf("Donaudampfschi");
    selectionEnd = text.indexOf(" verlohren");
    textAnnotation = EnhancementEngineHelper.createTextEnhancement(enhancements, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(
            enhancements, textAnnotation, text, selectionStart, selectionEnd, language, -1, true);
    Assert.assertEquals("ssel fürs ",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_PREFIX));
    Assert.assertEquals(" verlohren",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_SUFFIX));
    Assert.assertEquals(Integer.valueOf(selectionStart),
            EnhancementEngineHelper.get(enhancements, textAnnotation, Properties.ENHANCER_START, Integer.class, lf));
    Assert.assertEquals(Integer.valueOf(selectionEnd),
            EnhancementEngineHelper.get(enhancements, textAnnotation, Properties.ENHANCER_END, Integer.class, lf));
    // the selected text itself is expected to be absent ...
    Assert.assertNull(
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    // ... while head and tail of the selection are expected to be present
    Assert.assertEquals("Donaudampf",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertEquals("tenschloss",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_TAIL));

    // (3) same long selection, but with head/tail disabled (last argument is false)
    textAnnotation = EnhancementEngineHelper.createTextEnhancement(enhancements, dummyEngine, contentItemUri);
    EnhancementEngineHelper.setOccurrence(
            enhancements, textAnnotation, text, selectionStart, selectionEnd, language, -1, false);
    Assert.assertEquals("Donaudampfschiffahrtsgesellschaftskapitänskajütenschloss",
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTED_TEXT));
    Assert.assertNull(
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_HEAD));
    Assert.assertNull(
            EnhancementEngineHelper.getString(enhancements, textAnnotation, Properties.ENHANCER_SELECTION_TAIL));
}
Also used : BigInteger(java.math.BigInteger) IRI(org.apache.clerezza.commons.rdf.IRI) Graph(org.apache.clerezza.commons.rdf.Graph) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Language(org.apache.clerezza.commons.rdf.Language) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Test(org.junit.Test)

Example 52 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class RdfEntityFactoryTest method testTypeStatements.

/**
 * Creates a proxy for two entity interfaces and checks that the proxy
 * implements both of them and that the {@link Rdf} ids of both interfaces
 * are among the types returned by <code>getRdfTypes(..)</code> for the node.
 */
@Test
public void testTypeStatements() throws Exception {
    final Graph model = new SimpleGraph();
    final RdfEntityFactory entityFactory = RdfEntityFactory.createInstance(model);
    final IRI subject = new IRI("urn:RdfEntityFactoryTest:TestEntity");
    final TestRdfEntity proxy = entityFactory.getProxy(
            subject, TestRdfEntity.class, new Class[] { TestRdfEntity2.class });

    // the proxy has to implement the main as well as the additional interface
    assertTrue(proxy instanceof TestRdfEntity);
    assertTrue(proxy instanceof TestRdfEntity2);

    // the @Rdf id of every interface has to show up as a type of the node
    final Set<String> rdfTypes = getRdfTypes(model, subject);
    final String firstTypeId = TestRdfEntity.class.getAnnotation(Rdf.class).id();
    assertTrue(rdfTypes.contains(firstTypeId));
    final String secondTypeId = TestRdfEntity2.class.getAnnotation(Rdf.class).id();
    assertTrue(rdfTypes.contains(secondTypeId));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) RdfEntityFactory(org.apache.stanbol.enhancer.rdfentities.RdfEntityFactory) Rdf(org.apache.stanbol.enhancer.rdfentities.Rdf) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Test(org.junit.Test)

Example 53 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class ClerezzaYard method getRepresentation.

/**
 * Internally used to create Representations for URIs.
 * <p>
 * The lock returned by <code>readLockGraph()</code> (if any) is released
 * before this method returns.
 *
 * @param uri the uri
 * @param check if <code>false</code> then there is no check if the URI
 *     refers to a RDFTerm in the graph that is of type {@link #REPRESENTATION}
 * @return the Representation, or <code>null</code> if <code>check</code> is
 *     <code>true</code> and <code>isRepresentation(uri)</code> is
 *     <code>false</code>
 */
protected final Representation getRepresentation(IRI uri, boolean check) {
    final Lock lock = readLockGraph();
    try {
        if (check && !isRepresentation(uri)) {
            // not found
            return null;
        }
        final Graph representationGraph = createRepresentationGraph(uri, graph);
        // Drop the triple that is used internally to mark an empty
        // Representation. The remove is issued unconditionally: it is a no-op
        // for non-empty Representations and checking first would take longer
        // than the call itself.
        final TripleImpl emptyMarker = new TripleImpl(uri, MANAGED_REPRESENTATION, TRUE_LITERAL);
        representationGraph.remove(emptyMarker);
        final RdfValueFactory valueFactory = (RdfValueFactory) getValueFactory();
        return valueFactory.createRdfRepresentation(uri, representationGraph);
    } finally {
        // readLockGraph() may not provide a lock; only unlock if one was returned
        if (lock != null) {
            lock.unlock();
        }
    }
}
Also used : ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Graph(org.apache.clerezza.commons.rdf.Graph) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) RdfValueFactory(org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory) Lock(java.util.concurrent.locks.Lock)

Example 54 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class DisambiguatorEngine method computeEnhancements.

/**
 * Disambiguates the suggestions of the text annotations of the parsed
 * content item.
 * <p>
 * The method works in three steps:
 * <ol>
 * <li>read the language and the disambiguation data from the content item
 * (within the read lock of the content item)</li>
 * <li>for every saved entity with more than one suggestion build a
 * disambiguation context from the other selected texts within a window
 * around the mention, run a similarity query on the site of the entity and
 * match the results with the suggestions</li>
 * <li>write the results back to the metadata (within the write lock of the
 * content item)</li>
 * </ol>
 *
 * @param ci the content item to process
 * @throws EngineException if the query on the site fails
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // the plain text stays null if no supported blob is present or if it
    // can not be read
    String textContent = null;
    Entry<IRI, Blob> plainTextBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (plainTextBlob != null) {
        try {
            textContent = ContentItemHelper.getText(plainTextBlob.getValue());
        } catch (IOException e) {
            log.warn("Unable to retieve plain text content for ContentItem " + ci.getUri(), e);
        }
    }
    Graph graph = ci.getMetadata();

    // (1) read the data from the content item
    String contentLangauge;
    DisambiguationData disData;
    ci.getLock().readLock().lock();
    try {
        contentLangauge = EnhancementEngineHelper.getLanguage(ci);
        // NOTE (rwesten): the parsing of the information from the contentItem
        // is done by a static method of the class holding those information
        // (similar as it already was for SavedEntity)
        disData = DisambiguationData.createFromContentItem(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }

    // (2) Disambiguate the SavedEntities
    for (SavedEntity savedEntity : disData.textAnnotations.values()) {
        if (savedEntity.getSuggestions().size() <= 1) {
            // nothing to disambiguate if there is at most one suggestion
            continue;
        }
        // NOTE: the site is determined from the
        // fise:TextAnnotation <-- dc:relation --
        // fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string
        // data.
        // TODO: add configuration to include/exclude Sites by name
        Site site = siteManager.getSite(savedEntity.getSite());
        // TODO: make configurable
        boolean caseSensitive = false;
        String savedEntityLabel;
        if (caseSensitive) {
            savedEntityLabel = savedEntity.getName();
        } else {
            savedEntityLabel = savedEntity.getName().toLowerCase();
        }

        // Determine the context used for disambiguation
        // TODO: make this configurable options
        // Currently used: the other selected texts that are located within a
        // moving window around the mention.
        // Alternatives that are NOT used:
        //  - the selection context of the saved entity
        //  - the entities in range of the center of the mention
        //  - a (unique) union of the name, the context and the selections
        //  - TODO: the URIs of Entities suggested for other TextAnnotations
        //    within the context
        String window = getDisambiguationContext(textContent, savedEntity.getName(), savedEntity.getStart(), 100);
        log.info("Use Window: '{}' for '{}'", window, savedEntity.getName());
        List<String> contextSelections =
                getSelectionsInContext(savedEntity.getName(), disData.allSelectedTexts, window);
        String disambiguationContext = unionString(false, contextSelections);

        // make the similarity query on the Entityhub using the collected
        // information
        log.info(" - Query '{}' for {}@{} with context '{}'",
                new Object[] { site.getId(), savedEntityLabel, contentLangauge, disambiguationContext });
        if (StringUtils.isBlank(disambiguationContext)) {
            log.debug(" - not disambiguated because of empty context!");
            continue;
        }
        QueryResultList<Entity> results;
        try {
            results = query(site, savedEntityLabel, contentLangauge, disambiguationContext);
        } catch (SiteException e) {
            // TODO we could also try to catch those errors ...
            throw new EngineException("Unable to disambiguate Mention of '" + savedEntity.getName()
                    + "' on Entityhub Site '" + site.getId() + "!", e);
        }
        log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
        // match the results with the suggestions
        disambiguateSuggestions(results, savedEntity);
    }

    // (3) Write back the Results of the Disambiguation process
    // NOTE (rwesten): In the original version of Kritarth this was done as
    // part of (2) - disambiguation. Now the disambiguation results are stored
    // in the Suggestions during (2) and only applied to the
    // EnhancementStructure here. This reduces the coverage of the write lock
    // that needs to be applied to the ContentItem.
    ci.getLock().writeLock().lock();
    try {
        applyDisambiguationResults(graph, disData);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : Site(org.apache.stanbol.entityhub.servicesapi.site.Site) IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) Graph(org.apache.clerezza.commons.rdf.Graph) SiteException(org.apache.stanbol.entityhub.servicesapi.site.SiteException)

Example 55 with Graph

use of org.apache.clerezza.commons.rdf.Graph in project stanbol by apache.

the class EntityCoMentionEngine method computeEnhancements.

/**
 * Links co-mentions of the entities that are already mentioned by text
 * annotations of the parsed content item.
 * <p>
 * The existing text annotations are registered (within the read lock of the
 * content item) with an in-memory index, the analysed text is linked against
 * this index and the co-mentions are written back within the write lock of
 * the content item.
 *
 * @param ci the content item to process
 * @throws EngineException if no LabelTokenizer is available or if the
 *     linking of the entities fails
 * @throws IllegalStateException if the language of the content item is not
 *     configured to be processed by this engine
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText analysedText = getAnalysedText(this, ci, true);
    final String language = getLanguage(this, ci, true);
    final LanguageProcessingConfig languageConfig = textProcessingConfig.getConfiguration(language);
    if (languageConfig == null) {
        throw new IllegalStateException("The language '" + language + "' is not configured "
                + "to be processed by this Engine. As this is already checked within the "
                + "canEnhance(..) method this may indicate an bug in the used "
                + "EnhanceemntJobManager implementation!");
    }
    if (log.isDebugEnabled()) {
        log.debug("compute co-mentions for ContentItem {} language {}  text={}",
                new Object[] { ci.getUri().getUnicodeString(), language,
                        StringUtils.abbreviate(analysedText.getSpan(), 100) });
    }
    final LabelTokenizer tokenizer = (LabelTokenizer) labelTokenizerTracker.getService();
    if (tokenizer == null) {
        throw new EngineException(this, ci, "No LabelTokenizer available!", null);
    }

    // the in-memory database holding the Entities mentioned in the content item
    final ContentItemMentionBuilder mentionIndex =
            new ContentItemMentionBuilder(tokenizer, language, linkerConfig.getDefaultLanguage());
    final Graph metadata = ci.getMetadata();
    final Set<IRI> textAnnotations = new HashSet<IRI>();
    ci.getLock().readLock().lock();
    try {
        // register every TextAnnotation (mention of an Entity) and remember it
        Iterator<Triple> typeStatements = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
        while (typeStatements.hasNext()) {
            IRI textAnnotation = (IRI) typeStatements.next().getSubject();
            mentionIndex.registerTextAnnotation(textAnnotation, metadata);
            textAnnotations.add(textAnnotation);
        }
    } finally {
        ci.getLock().readLock().unlock();
    }

    // the mention index is parsed twice to the linker (5th and last argument)
    final EntityLinker entityLinker = new EntityLinker(
            analysedText, language, languageConfig, mentionIndex, linkerConfig, tokenizer, mentionIndex);
    try {
        entityLinker.process();
    } catch (EntitySearcherException e) {
        final String message = "Unable to link Entities with " + entityLinker;
        log.error(message, e);
        throw new EngineException(this, ci, message, e);
    }

    // write the co-mentions while holding the write lock of the content item
    ci.getLock().writeLock().lock();
    try {
        writeComentions(ci, entityLinker.getLinkedEntities().values(), language, textAnnotations);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) LanguageProcessingConfig(org.apache.stanbol.enhancer.engines.entitylinking.config.LanguageProcessingConfig) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) EntitySearcherException(org.apache.stanbol.enhancer.engines.entitylinking.EntitySearcherException) EntityLinker(org.apache.stanbol.enhancer.engines.entitylinking.impl.EntityLinker) Triple(org.apache.clerezza.commons.rdf.Triple) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) LabelTokenizer(org.apache.stanbol.enhancer.engines.entitylinking.LabelTokenizer) ContentItemMentionBuilder(org.apache.stanbol.enhancer.engines.entitycomention.impl.ContentItemMentionBuilder) HashSet(java.util.HashSet)

Aggregations

Graph (org.apache.clerezza.commons.rdf.Graph)172 IRI (org.apache.clerezza.commons.rdf.IRI)110 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)66 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)57 Triple (org.apache.clerezza.commons.rdf.Triple)45 IndexedGraph (org.apache.stanbol.commons.indexedgraph.IndexedGraph)43 Test (org.junit.Test)38 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)36 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)34 IOException (java.io.IOException)27 ImmutableGraph (org.apache.clerezza.commons.rdf.ImmutableGraph)26 HashSet (java.util.HashSet)24 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)24 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)24 InputStream (java.io.InputStream)21 HashMap (java.util.HashMap)20 Language (org.apache.clerezza.commons.rdf.Language)17 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)17 ArrayList (java.util.ArrayList)16 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)15