use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class DisambiguatorEngine method computeEnhancements.
/*
 * This function first evaluates all the possible disambiguations of each text
 * annotation detected. The text of all entities detected is used for making a
 * Dbpedia MLT (more-like-this) query that contains all the other entities. The
 * results obtained are used to calculate new confidence values which are then
 * updated in the metadata.
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // Plain-text content is only needed to compute the moving-window
    // disambiguation context; processing continues with a null text if it
    // cannot be read.
    String textContent;
    Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (textBlob != null) {
        try {
            textContent = ContentItemHelper.getText(textBlob.getValue());
        } catch (IOException e) {
            log.warn("Unable to retrieve plain text content for ContentItem " + ci.getUri(), e);
            textContent = null;
        }
    } else {
        textContent = null;
    }
    Graph graph = ci.getMetadata();
    // (1) read the data from the content item (under the read lock, so that
    // concurrent engines cannot modify the metadata while we parse it)
    String contentLanguage;
    DisambiguationData disData;
    ci.getLock().readLock().lock();
    try {
        contentLanguage = EnhancementEngineHelper.getLanguage(ci);
        // NOTE (rwesten): moved the parsing of the information from the
        // contentItem to static method of the Class holding those information
        // (similar as it already was for SavedEntity)
        disData = DisambiguationData.createFromContentItem(ci);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // (2) Disambiguate the SavedEntities
    for (SavedEntity savedEntity : disData.textAnnotations.values()) {
        if (savedEntity.getSuggestions().size() <= 1) {
            // we need not to disambiguate if only one suggestion is present
            continue;
        }
        // NOTE: the site is determined from the
        // fise:TextAnnotation <-- dc:relation --
        // fise:EntityAnnotation -- entityhub:ste --> "{siteName}"^^xsd:string
        // data.
        // TODO: add configuration to include/exclude Sites by name
        Site site = siteManager.getSite(savedEntity.getSite());
        // potential types of entities
        Collection<String> types = null;
        // TODO: make configurable
        boolean casesensitive = false;
        String savedEntityLabel = casesensitive ? savedEntity.getName()
                : savedEntity.getName().toLowerCase();
        // Determine the context used for disambiguation
        // TODO: make this configurable options
        String disambiguationContext;
        // (0.a) The easiest way is to just use the selection context
        // disambiguationContext = savedEntity.getContext();
        // (0.b) Calculate a context based on a moving window
        String window = getDisambiguationContext(textContent, savedEntity.getName(),
                savedEntity.getStart(), 100);
        log.info("Use Window: '{}' for '{}'", window, savedEntity.getName());
        // (1) The contextSelections:
        // All other selected text within the selection context
        List<String> contextSelections = getSelectionsInContext(savedEntity.getName(),
                disData.allSelectedTexts, window);
        disambiguationContext = unionString(false, contextSelections);
        // (2) I do not understand this variant (see comment for the
        // EntitiesInRange(..) method
        // List<String> L = EntitiesInRange(disData.directoryTextAnotation,
        // (savedEntity.getStart() + savedEntity.getEnd()) / 2);
        // disambiguationContext = unionString(false,contextSelections);
        // (3) one can build a combination of the above
        // disambiguationContext = unionString(true, //unique adds
        // Collections.singleton(savedEntity.getName()), //the selected text
        // Collections.singleton(context), //the context
        // contextSelections); //other selected parsed in the context
        // or just the name of the entity AND the context
        // disambiguationContext = unionString(false,
        // Collections.singleton(savedEntity.getName()),
        // contextSelections);
        // (4) TODO: I would also like to have the possibility to disambiguate
        // using URIs of Entities suggested for other TextAnnotations
        // within the context.
        // make the similarity query on the Entityhub using the collected
        // information
        QueryResultList<Entity> results;
        log.info(" - Query '{}' for {}@{} with context '{}'", new Object[] {
                site.getId(), savedEntityLabel, contentLanguage, disambiguationContext });
        if (!StringUtils.isBlank(disambiguationContext)) {
            try {
                results = query(site, savedEntityLabel, contentLanguage, disambiguationContext);
            } catch (SiteException e) {
                // TODO we could also try to catch those errors ...
                throw new EngineException("Unable to disambiguate Mention of '"
                        + savedEntity.getName() + "' on Entityhub Site '"
                        + site.getId() + "'!", e);
            }
            log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
            // match the results with the suggestions
            disambiguateSuggestions(results, savedEntity);
        } else {
            log.debug(" - not disambiguated because of empty context!");
        }
    }
    // (3) Write back the Results of the Disambiguation process
    // NOTE (rwesten): In the original version of Kritarth this was done as
    // part of (2) - disambiguation. This is now changed as in (2) the
    // disambiguation results are stored in the Suggestions and only
    // applied to the EnhancementStructure in (3). This allows to reduce the
    // coverage of the write lock needed to be applied to the ContentItem.
    ci.getLock().writeLock().lock();
    try {
        applyDisambiguationResults(graph, disData);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class EntityCoReferenceEngineTest method testSpatialCoref.
@Test
public void testSpatialCoref() throws EngineException, IOException {
    // Build a ContentItem around the two-sentence spatial test text and tag
    // it as English with full confidence.
    ContentItem ci = ciFactory.createContentItem(new StringSource(SPATIAL_TEXT));
    Graph metadata = ci.getMetadata();
    IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, engine);
    metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl("en")));
    metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, new PlainLiteralImpl("100.0")));
    metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
    Entry<IRI, Blob> plainTextBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
    AnalysedText analysedText = atFactory.createAnalysedText(ci, plainTextBlob.getValue());
    // Sentence 1 contains the NER-annotated mention "Angela Merkel".
    Sentence firstSentence = analysedText.addSentence(0, SPATIAL_SENTENCE_1.indexOf(".") + 1);
    Chunk merkelChunk = firstSentence.addChunk(0, "Angela Merkel".length());
    merkelChunk.addAnnotation(NlpAnnotations.NER_ANNOTATION,
            Value.value(new NerTag("Angela Merkel", OntologicalClasses.DBPEDIA_PERSON)));
    // Sentence 2 contains the noun phrase "The German politician" which
    // should be resolved as a coreference of "Angela Merkel".
    Sentence secondSentence = analysedText.addSentence(SPATIAL_SENTENCE_1.indexOf(".") + 1,
            SPATIAL_SENTENCE_1.length() + SPATIAL_SENTENCE_2.indexOf(".") + 1);
    int theStart = secondSentence.getSpan().indexOf("The");
    int germanStart = secondSentence.getSpan().indexOf("German");
    int politicianStart = secondSentence.getSpan().indexOf("politician");
    Token theToken = secondSentence.addToken(theStart, theStart + "The".length());
    theToken.addAnnotation(NlpAnnotations.POS_ANNOTATION,
            Value.value(new PosTag("The", LexicalCategory.PronounOrDeterminer, Pos.Determiner)));
    Token germanToken = secondSentence.addToken(germanStart, germanStart + "German".length());
    germanToken.addAnnotation(NlpAnnotations.POS_ANNOTATION,
            Value.value(new PosTag("German", LexicalCategory.Adjective)));
    Token politicianToken = secondSentence.addToken(politicianStart,
            politicianStart + "politician".length());
    politicianToken.addAnnotation(NlpAnnotations.POS_ANNOTATION,
            Value.value(new PosTag("politician", LexicalCategory.Noun)));
    Chunk chancellorChunk = secondSentence.addChunk(theStart,
            politicianStart + "politician".length());
    chancellorChunk.addAnnotation(NlpAnnotations.PHRASE_ANNOTATION,
            Value.value(new PhraseTag("The German politician", LexicalCategory.Noun)));
    engine.computeEnhancements(ci);
    // "Angela Merkel" must be the representative mention and reference the
    // noun-phrase chunk ...
    Value<CorefFeature> representativeValue = merkelChunk.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(representativeValue);
    CorefFeature representative = representativeValue.value();
    Assert.assertTrue(representative.isRepresentative());
    Assert.assertTrue(representative.getMentions().contains(chancellorChunk));
    // ... and the noun-phrase chunk must be a non-representative mention that
    // points back to "Angela Merkel".
    Value<CorefFeature> subordinateValue = chancellorChunk.getAnnotation(NlpAnnotations.COREF_ANNOTATION);
    Assert.assertNotNull(subordinateValue);
    CorefFeature subordinate = subordinateValue.value();
    Assert.assertTrue(!subordinate.isRepresentative());
    Assert.assertTrue(subordinate.getMentions().contains(merkelChunk));
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class BlobTest method testMimeType.
@Test
public void testMimeType() throws IOException {
    // Both sources declare a charset parameter; the second adds a further
    // "other" parameter. In both cases the base mime type and the charset
    // must be parsed identically.
    for (String source : new String[] {
            "text/plain;charset=UTF-8",
            "text/plain;charset=UTF-8;other=test" }) {
        Blob blob = createBlob(createContentSource(source));
        Assert.assertEquals("text/plain", blob.getMimeType());
        Assert.assertTrue(blob.getParameter().containsKey("charset"));
        Assert.assertEquals("UTF-8", blob.getParameter().get("charset"));
        if (source.contains("other")) {
            Assert.assertTrue(blob.getParameter().containsKey("other"));
            Assert.assertEquals("test", blob.getParameter().get("other"));
        }
    }
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class BlobTest method testIllegalFormatedParameter.
@Test
public void testIllegalFormatedParameter() throws IOException {
    // Illegal parameter fragments where parsing must yield NO parameters:
    // a value without a key, a key without a value, and a bare key.
    String[] noParameterSources = {
        "text/plain;=UTF-8",
        "text/plain;charset=",
        "text/plain;charset" };
    for (String source : noParameterSources) {
        Blob blob = createBlob(createContentSource(source));
        Assert.assertEquals("text/plain", blob.getMimeType());
        Assert.assertTrue(blob.getParameter().isEmpty());
    }
    // Sources mixing one valid charset parameter with illegal fragments:
    // only the charset must survive, regardless of its position.
    // NOTE: assertEquals arguments are ordered (expected, actual) as required
    // by the JUnit contract — the original had them swapped for size().
    String[] charsetOnlySources = {
        "text/plain;charset=UTF-8;=illegal",
        "text/plain;=illegal;charset=UTF-8",
        "text/plain;charset=UTF-8;test=",
        "text/plain;charset=UTF-8;test",
        "text/plain;test;charset=UTF-8;" };
    for (String source : charsetOnlySources) {
        Blob blob = createBlob(createContentSource(source));
        Assert.assertEquals("text/plain", blob.getMimeType());
        Assert.assertEquals(1, blob.getParameter().size());
        Assert.assertTrue(blob.getParameter().containsKey("charset"));
        Assert.assertEquals("UTF-8", blob.getParameter().get("charset"));
    }
}
use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.
the class BlobTest method testReadOnlyParameter.
@Test(expected = UnsupportedOperationException.class)
public void testReadOnlyParameter() throws IOException {
    // The parameter map exposed by a Blob must be unmodifiable: any attempt
    // to put a value is expected to throw UnsupportedOperationException.
    Blob readOnlyBlob = createBlob(createContentSource("text/plain;test;charset=UTF-8"));
    readOnlyBlob.getParameter().put("test", "dummy");
}
Aggregations