Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project Stanbol by Apache.
The class OpenNlpPosTaggingEngine, method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, true);
POSTagger posTagger = getPOSTagger(language);
if (posTagger == null) {
// the POS tagger may have become unavailable in-between the calls to
// canEnhance(..) and computeEnhancements(..)
throw new EngineException("PosTagger for langauge '" + language + "is not available.");
}
TagSet<PosTag> tagSet = tagSetRegistry.getTagSet(language);
if (tagSet == null) {
log.warn("No POS TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
// for now only created to avoid checks for tagSet == null
// TODO: in future we might want to automatically create posModels based
// on tagged texts. However this makes no sense as long as we cannot
// persist TagSets.
tagSet = new TagSet<PosTag>("dummy", language);
}
// holds PosTags created for POS tags that were not part of the posModel
// (will hold all PosTags in case an adhoc TagSet is used)
Map<String, PosTag> adhocTags = languageAdhocTags.get(language);
if (adhocTags == null) {
adhocTags = new HashMap<String, PosTag>();
languageAdhocTags.put(language, adhocTags);
}
// (1) Sentence detection
// Try to read existing Sentence Annotations
Iterator<Sentence> sentences = at.getSentences();
List<Section> sentenceList;
if (!sentences.hasNext()) {
// if none, try to detect sentences
log.trace(" > detect sentences for {}", at);
sentenceList = detectSentences(at, language);
}
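// NOTE: the Sentence iterator obtained above is expected to be weakly
// consistent, i.e. hasNext() below also sees the Sentence annotations
// that detectSentences(..) just added to the AnalysedText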
if (sentences.hasNext()) {
// check if we have detected sentences
log.trace(" > use existing Sentence annotations for {}", at);
sentenceList = new ArrayList<Section>();
AnalysedTextUtils.appandToList(sentences, sentenceList);
} else {
// no sentence detected ... treat the whole text as a single sentence
// TODO: maybe apply here a limit to the text size!
log.trace(" > unable to detect Sentences for {} (langauge: {})", at, language);
sentenceList = Collections.singletonList((Section) at);
}
// for all sentences (or the whole Text - if no sentences available)
for (Section sentence : sentenceList) {
// (2) Tokenize Sentences
List<Token> tokenList;
// check if there are already tokens
Iterator<Token> tokens = sentence.getTokens();
if (!tokens.hasNext()) {
// no tokens present -> tokenize
log.trace(" > tokenize {}", sentence);
tokenList = tokenize(sentence, language);
} else {
// use existing
log.trace(" > use existing Tokens for {}", sentence);
// ensure an ArrayList is used
tokenList = new ArrayList<Token>();
AnalysedTextUtils.appandToList(tokens, tokenList);
}
// (3) POS Tagging
posTag(tokenList, posTagger, tagSet, adhocTags, language);
}
if (log.isTraceEnabled()) {
logAnnotations(at);
}
}
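The getPOSTagger(language) and posTag(..) helpers wrap the plain OpenNLP POS tagging API. For orientation, a minimal standalone sketch of those underlying calls; the model resource name here is an assumption (Stanbol typically resolves models via its DataFileProvider), not Stanbol code:

import java.io.InputStream;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;

public class PosTaggingSketch {

    public static void main(String[] args) throws Exception {
        // hypothetical model resource name, not taken from the Stanbol code
        InputStream modelIn = PosTaggingSketch.class.getResourceAsStream("/en-pos-maxent.bin");
        POSTaggerME tagger = new POSTaggerME(new POSModel(modelIn));
        String[] tokens = { "Stanbol", "enhances", "content" };
        // one POS tag and one confidence value per token
        String[] tags = tagger.tag(tokens);
        double[] probs = tagger.probs();
        for (int i = 0; i < tokens.length; i++) {
            System.out.println(tokens[i] + "/" + tags[i] + " (" + probs[i] + ")");
        }
    }
}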
Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project Stanbol by Apache.
The class TestOpenCalaisEngine, method testEntityExtraction.
@Test
public void testEntityExtraction() throws IOException, EngineException {
String testFile = "calaisresult.owl";
String format = "application/rdf+xml";
InputStream in = this.getClass().getClassLoader().getResourceAsStream(testFile);
Assert.assertNotNull("failed to load resource " + testFile, in);
Graph model = calaisExtractor.readModel(in, format);
Assert.assertNotNull("model reader failed with format: " + format, model);
Collection<CalaisEntityOccurrence> entities;
try {
entities = calaisExtractor.queryModel(model);
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e);
return;
}
LOG.info("Found entities: {}", entities.size());
LOG.debug("Entities:\n{}", entities);
Assert.assertFalse("No entities found!", entities.isEmpty());
// test the generation of the Enhancements
ContentItem ci = wrapAsContentItem(TEST_TEXT);
calaisExtractor.createEnhancements(entities, ci);
Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(calaisExtractor.getClass().getName()));
// adding null as expected for confidence makes it a required property
expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
validateAllTextAnnotations(ci.getMetadata(), TEST_TEXT, expectedValues);
validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
}
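The validateAllTextAnnotations(..)/validateAllEntityAnnotations(..) helpers check the expectedValues map against every annotation in the metadata graph. The same kind of assertion can also be written directly against the Clerezza Graph API; a minimal sketch (additionally needs java.util.Iterator and org.apache.clerezza.commons.rdf.Triple):

// assert that at least one enhancement links back to the ContentItem
Iterator<Triple> extracted = ci.getMetadata().filter(null, Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
Assert.assertTrue("no enhancement extracted from " + ci.getUri(), extracted.hasNext());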
Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project Stanbol by Apache.
The class OpenNlpChunkingEngine, method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
AnalysedText at = getAnalysedText(this, ci, true);
String language = getLanguage(this, ci, true);
isLangaugeConfigured(this, languageConfiguration, language, true);
ChunkerME chunker = initChunker(language);
if (chunker == null) {
return;
}
// init the Phrase TagSet
TagSet<PhraseTag> tagSet = tagSetRegistry.getTagSet(language);
if (tagSet == null) {
log.warn("No Phrase TagSet registered for Language '{}'. Will build an " + "adhoc set based on encountered Tags!", language);
// for now only created to avoid checks for tagSet == null
// TODO: in future we might want to automatically create models based
// on tagged texts. However this makes no sense as long as we cannot
// persist TagSets.
tagSet = new TagSet<PhraseTag>("dummy", language);
}
// holds PhraseTags created for chunk tags that were not part of the tagSet
// (will hold all PhraseTags in case an adhoc TagSet is used)
Map<String, PhraseTag> adhocTags = languageAdhocTags.get(language);
if (adhocTags == null) {
adhocTags = new HashMap<String, PhraseTag>();
languageAdhocTags.put(language, adhocTags);
}
ci.getLock().writeLock().lock();
try {
Iterator<? extends Section> sentences = at.getSentences();
if (!sentences.hasNext()) {
// no sentences ... iterate over the whole text
sentences = Collections.singleton(at).iterator();
}
List<String> tokenTextList = new ArrayList<String>(64);
List<String> posList = new ArrayList<String>(64);
List<Token> tokenList = new ArrayList<Token>(64);
// process each sentence separately
while (sentences.hasNext()) {
// (1) get Tokens and POS information for the sentence
Section sentence = sentences.next();
Iterator<Token> tokens = sentence.getTokens();
while (tokens.hasNext()) {
Token token = tokens.next();
tokenList.add(token);
tokenTextList.add(token.getSpan());
Value<PosTag> posValue = token.getAnnotation(POS_ANNOTATION);
if (posValue == null) {
throw new EngineException("Missing POS value for Token '" + token.getSpan() + "' of ContentItem " + ci.getUri() + "(Sentence: '" + sentence.getSpan() + "'). This may " + "indicate that a POS tagging Engine is missing in " + "the EnhancementChain or that the used POS tagging " + "does not provide POS tags for each token!");
} else {
posList.add(posValue.value().getTag());
}
}
String[] tokenStrings = tokenTextList.toArray(new String[tokenTextList.size()]);
String[] tokenPos = posList.toArray(new String[posList.size()]);
if (log.isTraceEnabled()) {
log.trace("Tokens: {}" + Arrays.toString(tokenStrings));
}
// free memory
tokenTextList.clear();
// free memory
posList.clear();
// (2) Chunk the sentence
String[] chunkTags = chunker.chunk(tokenStrings, tokenPos);
double[] chunkProb = chunker.probs();
if (log.isTraceEnabled()) {
log.trace("Chunks: {}" + Arrays.toString(chunkTags));
}
// free memory
tokenStrings = null;
// free memory
tokenPos = null;
// (3) Process the results and write the Annotations
double chunkProps = 0;
int chunkTokenCount = 0;
PhraseTag tag = null;
int i;
/*
* This assumes:
* - 'B-{tag}' ... for start of a new chunk
* - 'I-{tag}' (or anything else) ... continues the current chunk
* - 'O' ... no chunk (ends current chunk)
*/
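/*
 * Example: tokens ["The", "quick", "fox", "jumps"] with chunk tags
 * ["B-NP", "I-NP", "I-NP", "B-VP"] yield the chunks "The quick fox" (NP)
 * and "jumps" (VP).
 */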
for (i = 0; i < tokenList.size(); i++) {
boolean start = chunkTags[i].charAt(0) == 'B';
boolean end = tag != null && (start || chunkTags[i].charAt(0) == 'O');
if (end) {
// add the current phrase
// add at AnalysedText level, because offsets are absolute
// NOTE we are already at the next token when we detect the end
Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
// reset the state
tag = null;
chunkTokenCount = 0;
chunkProps = 0;
}
if (start) {
// create the new tag
// skip the 'B-' prefix of the chunk tag
tag = getPhraseTag(tagSet, adhocTags, chunkTags[i].substring(2), language);
}
if (tag != null) {
// count this token for the current chunk
chunkProps = chunkProps + chunkProb[i];
chunkTokenCount++;
}
}
if (tag != null) {
Chunk chunk = at.addChunk(tokenList.get(i - chunkTokenCount).getStart(), tokenList.get(i - 1).getEnd());
chunk.addAnnotation(PHRASE_ANNOTATION, new Value<PhraseTag>(tag, chunkProps / (double) chunkTokenCount));
}
// (4) clean up
tokenList.clear();
}
} finally {
ci.getLock().writeLock().unlock();
}
if (log.isTraceEnabled()) {
logChunks(at);
}
}
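The chunker.chunk(..)/chunker.probs() pair used above is plain OpenNLP API. A minimal standalone sketch; the model resource name is an assumption, not taken from the Stanbol code:

import java.io.InputStream;
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;

public class ChunkingSketch {

    public static void main(String[] args) throws Exception {
        // hypothetical model resource name; Stanbol loads chunker models per language
        InputStream modelIn = ChunkingSketch.class.getResourceAsStream("/en-chunker.bin");
        ChunkerME chunker = new ChunkerME(new ChunkerModel(modelIn));
        String[] tokens = { "The", "quick", "fox", "jumps" };
        String[] pos = { "DT", "JJ", "NN", "VBZ" };
        // one IOB chunk tag and one probability per token
        String[] chunkTags = chunker.chunk(tokens, pos);
        double[] probs = chunker.probs();
        for (int i = 0; i < tokens.length; i++) {
            System.out.println(tokens[i] + " " + chunkTags[i] + " (" + probs[i] + ")");
        }
    }
}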
Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project Stanbol by Apache.
The class OpenCalaisEngine, method queryModel.
/**
* Extracts the relevant entity information from the Calais RDF data.
* The entities and the related information are extracted by a SPARQL query.
*
* @param model the Graph representing the Calais data
*
* @return a Collection of entity information
* @throws EngineException on a {@link ParseException} while processing the
* SPARQL query.
*/
public Collection<CalaisEntityOccurrence> queryModel(Graph model) throws EngineException {
// TODO extract also Geo info (latitude/longitude)?
String query = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " + "PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> " + "PREFIX p: <http://s.opencalais.com/1/pred/> " + "PREFIX t: <http://s.opencalais.com/1/type/em/e/> " + "SELECT DISTINCT ?id ?did ?name ?type ?dtype ?offset ?length ?exact ?context ?score WHERE { " + "?id p:name ?name ." + "?id rdf:type ?type ." + "?y p:subject ?id ." + "?y p:offset ?offset ." + "?y p:length ?length ." + "?y p:exact ?exact ." + "?y p:detection ?context ." + " OPTIONAL { ?z p:subject ?id . ?z p:relevance ?score . } " + // get disambiguated entity references if available
" OPTIONAL { ?did p:subject ?id . ?did p:name ?name . ?did rdf:type ?dtype . } " + "FILTER (" + "?type = t:Person || " + "?type = t:City || " + "?type = t:Continent || " + "?type = t:Country || " + "?type = t:ProvinceOrState || " + "?type = t:Region || " + "?type = t:Company || " + "?type = t:Facility || " + "?type = t:Organization " + ")" + "} ";
Collection<CalaisEntityOccurrence> result = new ArrayList<CalaisEntityOccurrence>();
try {
SelectQuery sQuery = (SelectQuery) QueryParser.getInstance().parse(query);
ResultSet rs = tcManager.executeSparqlQuery(sQuery, model);
while (rs.hasNext()) {
SolutionMapping row = rs.next();
CalaisEntityOccurrence occ = new CalaisEntityOccurrence();
RDFTerm disambiguated = row.get("did");
occ.id = (disambiguated == null ? row.get("id") : disambiguated);
if (onlyNERMode) {
occ.type = row.get("type");
} else {
occ.type = (disambiguated == null ? row.get("type") : row.get("dtype"));
}
if (calaisTypeMap != null) {
IRI mappedType = calaisTypeMap.get(occ.type);
if (mappedType != null) {
occ.type = mappedType;
}
}
occ.name = ((Literal) row.get("name")).getLexicalForm();
occ.exact = ((Literal) row.get("exact")).getLexicalForm();
// TODO for html the offsets might not be those of the original document but refer to a cleaned up version?
occ.offset = Integer.valueOf(((Literal) row.get("offset")).getLexicalForm());
// remove brackets
occ.context = ((Literal) row.get("context")).getLexicalForm().replaceAll("[\\[\\]]", "");
occ.length = Integer.valueOf(((Literal) row.get("length")).getLexicalForm());
if (row.get("score") != null) {
occ.relevance = Double.valueOf(((Literal) row.get("score")).getLexicalForm());
}
result.add(occ);
}
} catch (ParseException e) {
throw new EngineException("Unable to parse SPARQL query for processing OpenCalais results", e);
}
log.info("Found {} occurences", result.size());
return result;
}
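To make the triple pattern of the query concrete, here is a hand-built model that yields exactly one match. The resource URIs and literal values are invented for illustration; only the predicates and the entity type come from the query above (SimpleGraph, TripleImpl and PlainLiteralImpl are the Clerezza commons-rdf implementation utilities):

// build a minimal Graph matching the SELECT pattern (illustrative data only)
Graph model = new SimpleGraph();
IRI id = new IRI("http://example.org/entity/berlin");
IRI mention = new IRI("http://example.org/instance/1");
String p = "http://s.opencalais.com/1/pred/";
LiteralFactory lf = LiteralFactory.getInstance();
model.add(new TripleImpl(id, new IRI(p + "name"), new PlainLiteralImpl("Berlin")));
model.add(new TripleImpl(id, new IRI("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"),
    new IRI("http://s.opencalais.com/1/type/em/e/City")));
model.add(new TripleImpl(mention, new IRI(p + "subject"), id));
model.add(new TripleImpl(mention, new IRI(p + "offset"), lf.createTypedLiteral(12)));
model.add(new TripleImpl(mention, new IRI(p + "length"), lf.createTypedLiteral(6)));
model.add(new TripleImpl(mention, new IRI(p + "exact"), new PlainLiteralImpl("Berlin")));
model.add(new TripleImpl(mention, new IRI(p + "detection"), new PlainLiteralImpl("[the city of ]Berlin[ lies on the Spree]")));
// queryModel(model) should now return a single CalaisEntityOccurrence for "Berlin"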
Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project Stanbol by Apache.
The class DisambiguatorEngine, method computeEnhancements.
/*
* This method first evaluates all possible disambiguations of each detected text annotation.
* The text of all detected entities is used to build a DBpedia MoreLikeThis (MLT) query that
* contains all the other entities. The results obtained are used to calculate new confidence
* values, which are then updated in the metadata.
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
String textContent;
Entry<IRI, Blob> textBlob = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
if (textBlob != null) {
try {
textContent = ContentItemHelper.getText(textBlob.getValue());
} catch (IOException e) {
log.warn("Unable to retieve plain text content for ContentItem " + ci.getUri(), e);
textContent = null;
}
} else {
textContent = null;
}
Graph graph = ci.getMetadata();
// (1) read the data from the content item
String contentLanguage;
DisambiguationData disData;
ci.getLock().readLock().lock();
try {
contentLanguage = EnhancementEngineHelper.getLanguage(ci);
// NOTE (rwesten): moved the parsing of the information from the
// contentItem to static method of the Class holding those information
// (similar as it already was for SavedEntity)
// readEntities(loseConfidence, allEntities, textAnnotations, graph);
disData = DisambiguationData.createFromContentItem(ci);
} finally {
ci.getLock().readLock().unlock();
}
// (2) Disambiguate the SavedEntities
for (SavedEntity savedEntity : disData.textAnnotations.values()) {
if (savedEntity.getSuggestions().size() <= 1) {
// we need not to disambiguate if only one suggestion is present
continue;
}
// NOTE: the site is determined from the
// fise:TextAnnotation <-- dc:relation --
// fise:EntityAnnotation -- entityhub:site --> "{siteName}"^^xsd:string
// data.
// TODO: add configuration to include/exclude Sites by name
Site site = siteManager.getSite(savedEntity.getSite());
// potential types of entities
Collection<String> types = null;
// TODO: make configurable
boolean caseSensitive = false;
String savedEntityLabel = caseSensitive ? savedEntity.getName() : savedEntity.getName().toLowerCase();
// Determine the context used for disambiguation
// TODO: make this configurable options
String disambiguationContext;
// (0.a) The easiest way is to just use the selection context
// disambiguationContext = savedEntity.getContext();
// (0.b) Calculate a context based on a moving window
String window = getDisambiguationContext(textContent, savedEntity.getName(), savedEntity.getStart(), 100);
log.info("Use Window: '{}' for '{}'", window, savedEntity.getName());
// (1) The contextSelections:
// All other selected text within the selection context
List<String> contextSelections = getSelectionsInContext(savedEntity.getName(), disData.allSelectedTexts, window);
// savedEntity.getContext());
disambiguationContext = unionString(false, contextSelections);
// (2) I do not understand this variant (see comment for the
// EntitiesInRange(..) method
// List<String> L = EntitiesInRange(disData.directoryTextAnotation,
// (savedEntity.getStart() + savedEntity.getEnd()) / 2);
// disambiguationContext = unionString(false,contextSelections);
// (3) one can build a combination of the above
// disambiguationContext = unionString(true, //unique adds
// Collections.singleton(savedEntity.getName()), //the selected text
// Collections.singleton(context), //the context
// contextSelections); //other selected parsed in the context
// or just the name of the entity AND the context
// disambiguationContext = unionString(false,
// Collections.singleton(savedEntity.getName()),
// contextSelections);
// (4) TODO: I would also like to have the possibility to disambiguate
// using URIs of Entities suggested for other TextAnnotations
// within the context.
// make the similarity query on the Entityhub using the collected
// information
QueryResultList<Entity> results;
log.info(" - Query '{}' for {}@{} with context '{}'", new Object[] { site.getId(), savedEntityLabel, contentLangauge, disambiguationContext });
if (!StringUtils.isBlank(disambiguationContext)) {
try {
results = query(site, savedEntityLabel, contentLanguage, disambiguationContext);
} catch (SiteException e) {
// TODO we could also try to catch those errors ...
throw new EngineException("Unable to disambiguate Mention of '" + savedEntity.getName() + "' on Entityhub Site '" + site.getId() + "!", e);
}
log.debug(" - {} results returned by query {}", results.size(), results.getQuery());
// match the results with the suggestions
disambiguateSuggestions(results, savedEntity);
} else {
log.debug(" - not disambiguated because of empty context!");
}
}
// (3) Write back the Results of the Disambiguation process
// NOTE (rwesten): In the original version of Kritarth this was done as
// part of (2) - disambiguation. This is now changed as in (2) the
// disambiguation results are stored in the Suggestions and only
// applied to the EnhancementStructure in (3). This allows reducing the
// scope of the write lock that needs to be applied to the ContentItem.
ci.getLock().writeLock().lock();
try {
applyDisambiguationResults(graph, disData);
} finally {
ci.getLock().writeLock().unlock();
}
}
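The getDisambiguationContext(..) helper used for step (0.b) is not shown in this snippet. A plausible sketch of such a moving-window extraction; only the signature is taken from the call above, the implementation itself is a guess:

/**
 * Returns up to windowSize characters of content before and after the mention.
 * Hypothetical implementation, not the actual Stanbol code.
 */
private String getDisambiguationContext(String content, String name, int start, int windowSize) {
    if (content == null) {
        return null;
    }
    int begin = Math.max(0, start - windowSize);
    int end = Math.min(content.length(), start + name.length() + windowSize);
    return content.substring(begin, end);
}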