Search in sources :

Example 16 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class DisambiguatorEngine method computeEnhancements.

/*
 * This function first evaluates all suggested entities (the possible
 * disambiguations) of each detected text annotation. The text of all detected
 * entities is used to build a DBpedia MLT query whose context contains all the
 * other entities. The obtained results are used to calculate new confidence
 * values which are then written back to the metadata.
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // The plain text content is only needed to compute the disambiguation
    // context window, so a missing or unreadable text blob is tolerated.
    String textContent;
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (textBlob != null) {
        try {
            textContent = ContentItemHelper.getText(textBlob.getValue());
        } catch (IOException e) {
            log.warn("Unable to retrieve plain text content for ContentItem " + ci.getUri(), e);
            textContent = null;
        }
    } else {
        textContent = null;
    }
    Graph graph = ci.getMetadata();
    // (1) read the data from the content item
    String contentLanguage;
    DisambiguationData disData;
    // Hold the read lock only while reading from the ContentItem metadata.
    ci.getLock().readLock().lock();
    try {
        contentLanguage = EnhancementEngineHelper.getLanguage(ci);
        // NOTE (rwesten): moved the parsing of the information from the
        // contentItem to static method of the Class holding those information
        // (similar as it already was for SavedEntity)
        // readEntities(loseConfidence, allEntities, textAnnotations, graph);
        disData = DisambiguationData.createFromContentItem(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // (2) Disambiguate the SavedEntities
    for (SavedEntity savedEntity : disData.textAnnotations.values()) {
        if (savedEntity.getSuggestions().size() <= 1) {
            // we need not to disambiguate if only one suggestion is present
            continue;
        }
        // NOTE: the site is determined from the
        // fise:TextAnnotation <-- dc:relation --
        // fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string
        // data.
        // TODO: add configuration to include/exclude Sites by name
        Site site = siteManager.getSite(savedEntity.getSite());
        // potential types of entities
        Collection<String> types = null;
        // TODO: make configurable
        boolean caseSensitive = false;
        String savedEntityLabel = caseSensitive ? savedEntity.getName() : savedEntity.getName().toLowerCase();
        // Determine the context used for disambiguation
        // TODO: make this configurable options
        String disambiguationContext;
        // (0.a) The easiest way is to just use the selection context
        // disambiguationContext = savedEntity.getContext();
        // (0.b) Calculate a context based on a moving window
        String window = getDisambiguationContext(textContent, savedEntity.getName(), savedEntity.getStart(), 100);
        log.info("Use Window: '{}' for '{}'", window, savedEntity.getName());
        // (1) The contextSelections:
        // All other selected text within the selection context
        List<String> contextSelections = getSelectionsInContext(savedEntity.getName(), disData.allSelectedTexts, window);
        // savedEntity.getContext());
        disambiguationContext = unionString(false, contextSelections);
        // (2) I do not understand this variant (see comment for the
        // EntitiesInRange(..) method
        // List<String> L = EntitiesInRange(disData.directoryTextAnotation,
        // (savedEntity.getStart() + savedEntity.getEnd()) / 2);
        // disambiguationContext = unionString(false,contextSelections);
        // (3) one can build a combination of the above
        // disambiguationContext = unionString(true, //unique adds
        // Collections.singleton(savedEntity.getName()), //the selected text
        // Collections.singleton(context), //the context
        // contextSelections); //other selected parsed in the context
        // or just the name of the entity AND the context
        // disambiguationContext = unionString(false,
        // Collections.singleton(savedEntity.getName()),
        // contextSelections);
        // (4) TODO: I would also like to have the possibility to disambiguate
        // using URIs of Entities suggested for other TextAnnotations
        // within the context.
        // make the similarity query on the Entityhub using the collected
        // information
        QueryResultList<Entity> results;
        log.info(" - Query '{}' for {}@{} with context '{}'", new Object[] { site.getId(), savedEntityLabel, contentLanguage, disambiguationContext });
        if (!StringUtils.isBlank(disambiguationContext)) {
            try {
                results = query(site, savedEntityLabel, contentLanguage, disambiguationContext);
            } catch (SiteException e) {
                // TODO we could also try to catch those errors ...
                throw new EngineException("Unable to disambiguate Mention of '" + savedEntity.getName() + "' on Entityhub Site '" + site.getId() + "'!", e);
            }
            log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
            // match the results with the suggestions
            disambiguateSuggestions(results, savedEntity);
        } else {
            log.debug(" - not disambiguated because of empty context!");
        }
    }
    // (3) Write back the Results of the Disambiguation process
    // NOTE (rwesten): In the original version of Kritarth this was done as
    // part of (2) - disambiguation. This is now changed as in (2) the
    // disambiguation results are stored in the Suggestions and only
    // applied to the EnhancementStructure in (3). This allows to reduce the
    // coverage of the write lock needed to be applied to the ContentItem.
    ci.getLock().writeLock().lock();
    try {
        applyDisambiguationResults(graph, disData);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : Site(org.apache.stanbol.entityhub.servicesapi.site.Site) IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) Graph(org.apache.clerezza.commons.rdf.Graph) SiteException(org.apache.stanbol.entityhub.servicesapi.site.SiteException)

Example 17 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class EntityCoReferenceEngineTest method testSpatialCoref.

@Test
public void testSpatialCoref() throws EngineException, IOException {
    // Build a ContentItem around the spatial test text and attach the
    // enhancement metadata the engine expects (language, confidence, type).
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph metadata = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> plainTextBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText analysedText = atFactory.createAnalysedText(ci, plainTextBlob.getValue());
    // Sentence 1 contains the NER mention "Angela Merkel".
    Sentence firstSentence = analysedText.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk angelaMerkel = firstSentence.addChunk(0, "Angela Merkel".length());
    angelaMerkel.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    // Sentence 2 contains the noun phrase "The German politician" that should
    // be resolved as a coreference of the NER mention above.
    Sentence secondSentence = analysedText.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1, SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theOffset = secondSentence.getSpan().indexOf("The");
    int germanOffset = secondSentence.getSpan().indexOf("German");
    int politicianOffset = secondSentence.getSpan().indexOf("politician");
    Token theToken = secondSentence.addToken(theOffset, theOffset + "The".length());
    theToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token germanToken = secondSentence.addToken(germanOffset, germanOffset + "German".length());
    germanToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politicianToken = secondSentence.addToken(politicianOffset, politicianOffset + "politician".length());
    politicianToken.addAnnotation(NlpAnnotations.POS_ANNOTATION, Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk theGermanChancellor = secondSentence.addChunk(theOffset, politicianOffset + "politician".length());
    theGermanChancellor.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION, Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    // The NER mention must be marked as the representative of the coref
    // chain and must list the noun phrase among its mentions.
    Value<CorefFeature> representativeCorefValue = angelaMerkel.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeCorefValue);
    CorefFeature representativeCoref = representativeCorefValue.value();
    Assert.assertTrue(representativeCoref.isRepresentative());
    Assert.assertTrue(representativeCoref.getMentions().contains(theGermanChancellor));
    // The noun phrase must be a non-representative member pointing back to
    // the NER mention.
    Value<CorefFeature> subordinateCorefValue = theGermanChancellor.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateCorefValue);
    CorefFeature subordinateCoref = subordinateCorefValue.value();
    Assert.assertFalse(subordinateCoref.isRepresentative());
    Assert.assertTrue(subordinateCoref.getMentions().contains(angelaMerkel));
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) CorefFeature(org.apache.stanbol.enhancer.nlp.coref.CorefFeature) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) Token(org.apache.stanbol.enhancer.nlp.model.Token) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) PhraseTag(org.apache.stanbol.enhancer.nlp.phrase.PhraseTag) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Graph(org.apache.clerezza.commons.rdf.Graph) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) StringSource(org.apache.stanbol.enhancer.servicesapi.impl.StringSource) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 18 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class BlobTest method testMimeType.

@Test
public void testMimeType() throws IOException {
    // A single charset parameter must be parsed from the mime type string.
    Blob singleParamBlob = createBlob(createContentSource("text/plain;charset=UTF-8"));
    assertTextPlainWithUtf8Charset(singleParamBlob);
    // Additional parameters must be parsed alongside the charset.
    Blob multiParamBlob = createBlob(createContentSource("text/plain;charset=UTF-8;other=test"));
    assertTextPlainWithUtf8Charset(multiParamBlob);
    Assert.assertTrue(multiParamBlob.getParameter().containsKey("other"));
    Assert.assertEquals("test", multiParamBlob.getParameter().get("other"));
}

/** Asserts a "text/plain" mime type carrying a "charset" parameter of "UTF-8". */
private void assertTextPlainWithUtf8Charset(Blob blob) {
    Assert.assertEquals("text/plain", blob.getMimeType());
    Assert.assertTrue(blob.getParameter().containsKey("charset"));
    Assert.assertEquals("UTF-8", blob.getParameter().get("charset"));
}
Also used : Blob(org.apache.stanbol.enhancer.servicesapi.Blob) Test(org.junit.Test)

Example 19 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class BlobTest method testIllegalFormatedParameter.

/**
 * Verifies that malformed mime type parameters (empty names, missing values,
 * dangling separators) are silently ignored while well-formed parameters in
 * the same mime type string are still parsed.
 */
@Test
public void testIllegalFormatedParameter() throws IOException {
    // a parameter without a name must be ignored
    assertTextPlainWithoutParameters("text/plain;=UTF-8");
    // a nameless parameter must not affect a valid charset parameter,
    // regardless of their order
    assertTextPlainWithOnlyUtf8Charset("text/plain;charset=UTF-8;=illegal");
    assertTextPlainWithOnlyUtf8Charset("text/plain;=illegal;charset=UTF-8");
    // a parameter with an empty or missing value must be ignored
    assertTextPlainWithoutParameters("text/plain;charset=");
    assertTextPlainWithoutParameters("text/plain;charset");
    // a valueless parameter must not affect a valid charset parameter
    assertTextPlainWithOnlyUtf8Charset("text/plain;charset=UTF-8;test=");
    assertTextPlainWithOnlyUtf8Charset("text/plain;charset=UTF-8;test");
    // dangling ';' and valueless parameters before the charset are ignored
    assertTextPlainWithOnlyUtf8Charset("text/plain;test;charset=UTF-8;");
}

/** Asserts the given mime type string parses to "text/plain" with no parameters. */
private void assertTextPlainWithoutParameters(String mimeTypeWithParams) throws IOException {
    Blob blob = createBlob(createContentSource(mimeTypeWithParams));
    Assert.assertEquals("text/plain", blob.getMimeType());
    Assert.assertTrue(blob.getParameter().isEmpty());
}

/**
 * Asserts the given mime type string parses to "text/plain" whose only
 * parameter is a "charset" of "UTF-8".
 */
private void assertTextPlainWithOnlyUtf8Charset(String mimeTypeWithParams) throws IOException {
    Blob blob = createBlob(createContentSource(mimeTypeWithParams));
    Assert.assertEquals("text/plain", blob.getMimeType());
    // NOTE: the original asserted assertEquals(actual, expected); JUnit
    // expects the expected value first, which matters for failure messages.
    Assert.assertEquals(1, blob.getParameter().size());
    Assert.assertTrue(blob.getParameter().containsKey("charset"));
    Assert.assertEquals("UTF-8", blob.getParameter().get("charset"));
}
Also used : Blob(org.apache.stanbol.enhancer.servicesapi.Blob) Test(org.junit.Test)

Example 20 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class BlobTest method testReadOnlyParameter.

@Test(expected = UnsupportedOperationException.class)
public void testReadOnlyParameter() throws IOException {
    // The parameter map exposed by a Blob must be unmodifiable: any write
    // attempt is expected to raise UnsupportedOperationException.
    Blob readOnlyBlob = createBlob(createContentSource("text/plain;test;charset=UTF-8"));
    readOnlyBlob.getParameter().put("test", "dummy");
}
Also used : Blob(org.apache.stanbol.enhancer.servicesapi.Blob) Test(org.junit.Test)

Aggregations

Blob (org.apache.stanbol.enhancer.servicesapi.Blob)44 IRI (org.apache.clerezza.commons.rdf.IRI)36 Test (org.junit.Test)21 IOException (java.io.IOException)20 Graph (org.apache.clerezza.commons.rdf.Graph)17 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)15 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)15 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)15 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)14 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)11 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)11 SOAPException (javax.xml.soap.SOAPException)4 Language (org.apache.clerezza.commons.rdf.Language)4 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 MediaType (javax.ws.rs.core.MediaType)3 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)3 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)3 NoSuchPartException (org.apache.stanbol.enhancer.servicesapi.NoSuchPartException)3 StreamSource (org.apache.stanbol.enhancer.servicesapi.impl.StreamSource)3