use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class UIMALocal method computeEnhancements.
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
if (contentPart == null) {
throw new IllegalStateException("No ContentPart with a supported Mimetype '" + SUPPORTED_MIMETYPES + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This " + "indicates a bug in the implementation of the " + "EnhancementJobManager!");
}
String text;
try {
text = ContentItemHelper.getText(contentPart.getValue());
} catch (IOException e) {
throw new InvalidContentException(this, ci, e);
}
JCas jcas;
try {
logger.info("Processing text with UIMA AE...");
jcas = processText(text);
} catch (ResourceInitializationException ex) {
logger.error("Error initializing UIMA AE", ex);
throw new EngineException("Error initializing UIMA AE", ex);
} catch (AnalysisEngineProcessException ex) {
logger.error("Error running UIMA AE", ex);
throw new EngineException("Error running UIMA AE", ex);
}
// just to be sure
if (jcas == null) {
return;
}
for (String typeName : uimaTypeNames) {
List<FeatureStructure> featureSetList = concertToCasLight(jcas, typeName);
IRI uimaIRI = new IRI(uimaUri);
FeatureStructureListHolder holder;
ci.getLock().writeLock().lock();
try {
holder = ci.getPart(uimaIRI, FeatureStructureListHolder.class);
} catch (NoSuchPartException e) {
holder = new FeatureStructureListHolder();
logger.info("Adding FeatureSet List Holder content part with uri:" + uimaUri);
ci.addPart(uimaIRI, holder);
logger.info(uimaUri + " content part added.");
} finally {
ci.getLock().writeLock().unlock();
}
ci.getLock().writeLock().lock();
try {
holder.addFeatureStructureList(uimaSourceName, featureSetList);
} finally {
ci.getLock().writeLock().unlock();
}
}
}
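Note that the method above acquires the write lock twice in a row (once to get or create the holder part, once to add the feature structure list). A minimal sketch of the same get-or-create pattern under a single write lock, assuming the uimaUri and uimaSourceName fields and the FeatureStructureListHolder helper used above (this is an illustrative refactoring, not the actual Stanbol implementation):
private void addFeatureStructures(ContentItem ci, List<FeatureStructure> featureSetList) {
    IRI uimaIRI = new IRI(uimaUri);
    ci.getLock().writeLock().lock();
    try {
        FeatureStructureListHolder holder;
        try {
            // reuse the holder part if a previous run already registered it
            holder = ci.getPart(uimaIRI, FeatureStructureListHolder.class);
        } catch (NoSuchPartException e) {
            // first result for this ContentItem: create and register the holder part
            holder = new FeatureStructureListHolder();
            ci.addPart(uimaIRI, holder);
        }
        holder.addFeatureStructureList(uimaSourceName, featureSetList);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}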
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class ZemantaEnhancementEngineTest method tesetBioText.
@Test
public void tesetBioText() throws EngineException, IOException {
ContentItem ci = wrapAsContentItem(BIO_DOMAIN_TEXT);
try {
zemantaEngine.computeEnhancements(ci);
} catch (EngineException e) {
RemoteServiceHelper.checkServiceUnavailable(e);
return;
}
JenaSerializerProvider serializer = new JenaSerializerProvider();
serializer.serialize(System.out, ci.getMetadata(), TURTLE);
Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(zemantaEngine.getClass().getName()));
// do not require fise:confidence values for fise:TextAnnotations, because
// the one used to group the TopicAnnotations does not have a confidence value
int textAnnoNum = validateAllTextAnnotations(ci.getMetadata(), BIO_DOMAIN_TEXT, expectedValues);
log.info(textAnnoNum + " TextAnnotations found ...");
// adding null as expected for confidence makes it a required property
expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
int entityAnnoNum = EnhancementStructureHelper.validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
log.info(entityAnnoNum + " EntityAnnotations found ...");
int topicAnnoNum = EnhancementStructureHelper.validateAllTopicAnnotations(ci.getMetadata(), expectedValues);
log.info(topicAnnoNum + " TopicAnnotations found ...");
}
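The wrapAsContentItem(..) helper is not part of the snippet; in most Stanbol engine tests it is a thin wrapper around the in-memory ContentItemFactory. A sketch under that assumption (ContentItemFactory, InMemoryContentItemFactory and StringSource come from the Stanbol enhancer modules; the actual helper in this test may differ):
private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();

private static ContentItem wrapAsContentItem(String text) throws IOException {
    // wraps the plain text as a text/plain ContentSource and builds an in-memory ContentItem
    return ciFactory.createContentItem(new StringSource(text));
}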
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class EntityDereferenceEngine method computeEnhancements.
@Override
public final void computeEnhancements(ContentItem ci) throws EngineException {
if (offline && !dereferencer.supportsOfflineMode()) {
// the entity dereferencer no longer supports offline mode
return;
}
log.debug("> dereference Entities for ContentItem {}", ci.getUri());
long start = System.nanoTime();
Map<String, Object> enhancemntProps = EnhancementEngineHelper.getEnhancementProperties(this, ci);
final DereferenceContext derefContext;
final Graph metadata = ci.getMetadata();
Set<IRI> referencedEntities = new HashSet<IRI>();
ci.getLock().readLock().lock();
try {
// (1) Create the DereferenceContext
if (filterContentLanguages) {
// parse the languages detected for the content
Set<String> contentLanguages = new HashSet<String>();
for (BlankNodeOrIRI langAnno : EnhancementEngineHelper.getLanguageAnnotations(metadata)) {
contentLanguages.add(EnhancementEngineHelper.getString(metadata, langAnno, DC_LANGUAGE));
}
enhancemntProps.put(DereferenceContext.INTERNAL_CONTENT_LANGUAGES, contentLanguages);
}
// create the dereference context and handle possible configuration exceptions
try {
derefContext = contextFactory.createContext(this, enhancemntProps);
derefContext.setOfflineMode(offline);
} catch (DereferenceConfigurationException e) {
StringBuilder message = new StringBuilder("Unsupported Dereference Configuration ");
if (e.getProperty() != null) {
message.append("for property '").append(e.getProperty()).append("' ");
}
message.append(" parsed via the EnhancementProperties of this request!");
throw new EnhancementPropertyException(this, ci, e.getProperty(), message.toString(), e);
}
// parse the referenced entities from the graph
// (2) read all Entities we need to dereference from the parsed contentItem
Set<IRI> checked = new HashSet<IRI>();
// since STANBOL-1334 the list of properties that refer to entities can be configured
for (IRI referenceProperty : derefContext.getEntityReferences()) {
Iterator<Triple> entityReferences = metadata.filter(null, referenceProperty, null);
while (entityReferences.hasNext()) {
Triple triple = entityReferences.next();
RDFTerm entityReference = triple.getObject();
if ((entityReference instanceof IRI) && // only URIs
checked.add((IRI) entityReference) && // do not check a URI twice
chekcFallbackMode((IRI) entityReference, metadata) && // fallback mode
checkURI((IRI) entityReference)) { // URI prefixes and patterns
boolean added = referencedEntities.add((IRI) entityReference);
if (added && log.isTraceEnabled()) {
log.trace(" ... schedule Entity {} (referenced-by: {})", entityReference, referenceProperty);
}
} else if (log.isTraceEnabled()) {
log.trace(" ... ignore Entity {} (referenced-by: {})", entityReference, referenceProperty);
}
}
}
} finally {
ci.getLock().readLock().unlock();
}
long schedule = System.nanoTime();
final Lock writeLock = ci.getLock().writeLock();
log.trace(" - scheduled {} Entities for dereferencing", referencedEntities.size());
// (3) dereference the Entities
ExecutorService executor = dereferencer.getExecutor();
Set<IRI> failedEntities = new HashSet<IRI>();
int dereferencedCount = 0;
List<DereferenceJob> dereferenceJobs = new ArrayList<DereferenceJob>(referencedEntities.size());
if (executor != null && !executor.isShutdown()) {
// schedule all entities to dereference
for (final IRI entity : referencedEntities) {
DereferenceJob dereferenceJob = new DereferenceJob(entity, metadata, writeLock, derefContext);
dereferenceJob.setFuture(executor.submit(dereferenceJob));
dereferenceJobs.add(dereferenceJob);
}
// wait for all entities to be dereferenced
for (DereferenceJob dereferenceJob : dereferenceJobs) {
try {
if (dereferenceJob.await()) {
dereferencedCount++;
}
} catch (InterruptedException e) {
// Restore the interrupted status
Thread.currentThread().interrupt();
throw new EngineException(this, ci, "Interupted while waiting for dereferencing Entities", e);
} catch (ExecutionException e) {
if (e.getCause() instanceof DereferenceException) {
failedEntities.add(dereferenceJob.entity);
log.debug(" ... error while dereferencing " + dereferenceJob.entity + "!", e);
} else {
// unknown error
throw new EngineException(this, ci, "Unchecked Error while " + "dereferencing Entity " + dereferenceJob.entity + "!", e);
}
}
}
} else {
// dereference using the current thread
for (IRI entity : referencedEntities) {
try {
log.trace(" ... dereference {}", entity);
if (dereferencer.dereference(entity, metadata, writeLock, derefContext)) {
dereferencedCount++;
log.trace(" + success");
} else {
log.trace(" - not found");
}
} catch (DereferenceException e) {
log.debug(" ... error while dereferencing " + entity + "!", e);
failedEntities.add(entity);
}
}
}
long end = System.nanoTime();
float sheduleDuration = ((schedule - start) / 10000) / 100f;
float dereferenceDuration = ((end - schedule) / 10000) / 100f;
float duration = ((end - start) / 10000) / 100f;
if (!failedEntities.isEmpty()) {
log.warn(" - unable to dereference {} of {} for ContentItem {}", new Object[] { failedEntities.size(), referencedEntities.size(), ci.getUri() });
}
if (log.isDebugEnabled() && dereferencedCount > 0) {
log.debug(" - dereferenced {} of {} Entities in {}ms | schedule:{}ms | " + " dereference: {}ms ({}ms/entity)", new Object[] { dereferencedCount, referencedEntities.size(), duration, sheduleDuration, dereferenceDuration, dereferenceDuration / dereferencedCount });
}
}
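The DereferenceJob instances scheduled above come from an inner class that is not part of this snippet. Given how they are constructed, submitted and awaited, the class has to look roughly like the sketch below; field names and method bodies are assumptions, not the actual Stanbol code. As a non-static inner class of the engine it can access the dereferencer field directly:
class DereferenceJob implements Callable<Boolean> {
    final IRI entity;
    private final Graph metadata;
    private final Lock writeLock;
    private final DereferenceContext derefContext;
    private Future<Boolean> future;

    DereferenceJob(IRI entity, Graph metadata, Lock writeLock, DereferenceContext derefContext) {
        this.entity = entity;
        this.metadata = metadata;
        this.writeLock = writeLock;
        this.derefContext = derefContext;
    }

    void setFuture(Future<Boolean> future) {
        this.future = future;
    }

    @Override
    public Boolean call() throws DereferenceException {
        // delegates to the same call used in the single-threaded fallback above;
        // true means data for the entity was found and written to the metadata graph
        return dereferencer.dereference(entity, metadata, writeLock, derefContext);
    }

    boolean await() throws InterruptedException, ExecutionException {
        // blocks until the submitted job has completed; a DereferenceException thrown
        // in call() surfaces here wrapped in an ExecutionException
        return future.get();
    }
}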
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class DBPSpotlightSpotEnhancementEngine method doPostRequest.
/**
* Sends a POST request to the DBpediaSpotlight url.
*
* @param text
* a <code>String</code> with the text to be analyzed
* @param contentItemUri
* the URI of the ContentItem (only used for logging)
* @return a <code>String</code> with the server response
* @throws EngineException
* if the request cannot be sent
*/
protected Collection<SurfaceForm> doPostRequest(String text, IRI contentItemUri) throws EngineException {
// rwesten: reimplemented this so that the request content
// is written directly to the connection's output stream instead
// of being collected in an in-memory StringBuilder
HttpURLConnection connection = null;
BufferedWriter wr = null;
try {
connection = (HttpURLConnection) spotlightUrl.openConnection();
connection.setRequestMethod("POST");
connection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded");
connection.setRequestProperty("Accept", "text/xml");
// set ConnectionTimeout (if configured)
if (connectionTimeout > 0) {
connection.setConnectTimeout(connectionTimeout * 1000);
connection.setReadTimeout(connectionTimeout * 1000);
}
connection.setUseCaches(false);
connection.setDoInput(true);
connection.setDoOutput(true);
// Send request
wr = new BufferedWriter(new OutputStreamWriter(connection.getOutputStream(), UTF8));
} catch (IOException e) {
IOUtils.closeQuietly(wr);
throw new EngineException("Unable to open connection to " + spotlightUrl, e);
}
try {
if (spotlightSpotter != null && !spotlightSpotter.isEmpty()) {
wr.write("spotter=");
wr.write(URLEncoder.encode(spotlightSpotter, UTF8.name()));
wr.write('&');
}
wr.write("text=");
// now append the URL encoded text
// TODO: This will load the URLEncoded variant in-memory.
// One could avoid that by encoding the data in smaller
// pieces, but using URLEncoding for big data is anyway
// very inefficient. So instead of fixing this issue here
// DBpedia Spotlight should support "multipart/form-data"
// instead.
// As soon as this is supported this should be re-implemented
// to support streaming.
wr.write(URLEncoder.encode(text, UTF8.name()));
} catch (UnsupportedEncodingException e) {
throw new IllegalStateException("The platform does not support encoding " + UTF8.name(), e);
} catch (IOException e) {
throw new EngineException("Unable to write 'plain/text' content " + "for ContentItem " + contentItemUri + " to " + spotlightUrl, e);
} finally {
IOUtils.closeQuietly(wr);
}
// rwesten: reimplemented this to read the XML
// Document directly from the response
InputStream is = null;
Document xmlDoc;
try {
// Get Response
is = connection.getInputStream();
xmlDoc = loadXMLFromInputStream(is);
} catch (IOException e) {
throw new EngineException("Unable to spot Entities with" + "Dbpedia Spotlight Spot RESTful Serice running at " + spotlightUrl, e);
} catch (SAXException e) {
throw new EngineException("Unable to parse Response from " + "Dbpedia Spotlight Spot RESTful Serice running at " + spotlightUrl, e);
} finally {
IOUtils.closeQuietly(is);
}
// connection.disconnect();
return SurfaceForm.parseSurfaceForm(xmlDoc);
}
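loadXMLFromInputStream(..) is not included in the snippet; a minimal sketch using the JDK DOM parser (the real helper may configure the parser differently, e.g. for namespace handling or security hardening):
private static Document loadXMLFromInputStream(InputStream in) throws SAXException, IOException {
    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
    factory.setNamespaceAware(true);
    try {
        // parses the Spotlight XML response into a DOM Document
        return factory.newDocumentBuilder().parse(in);
    } catch (ParserConfigurationException e) {
        throw new IllegalStateException("Unable to create a DOM DocumentBuilder", e);
    }
}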
use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.
the class KuromojiNlpEngine method computeEnhancements.
/**
* Compute enhancements for supplied ContentItem. The results of the process
* are expected to be stored in the metadata of the content item.
* <p/>
* The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
* persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
* <p/>
* This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
* stores it as a new part in the content item. The metadata is not changed.
*
* @throws org.apache.stanbol.enhancer.servicesapi.EngineException
* if the underlying process failed to work as
* expected
*/
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
String language = getLanguage(this, ci, false);
if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates an Bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@apache.stanbol.org or create an " + "JIRA issue about this.");
}
// start with the Tokenizer
TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
// build the analyzing chain by adding all TokenFilters
for (TokenFilterFactory filterFactory : filterFactories) {
tokenStream = filterFactory.create(tokenStream);
}
// Try to extract sentences based on POS tags ...
int sentStartOffset = -1;
// NER data
List<NerData> nerList = new ArrayList<NerData>();
// the next index where the NerData.context need to be set
int nerSentIndex = 0;
NerData ner = null;
OffsetAttribute offset = null;
try {
// reset() is required by the TokenStream contract since Lucene/Solr 4
tokenStream.reset();
while (tokenStream.incrementToken()) {
offset = tokenStream.addAttribute(OffsetAttribute.class);
Token token = at.addToken(offset.startOffset(), offset.endOffset());
// Get the POS attribute and init the PosTag
PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = adhocTags.get(posAttr.getPartOfSpeech());
if (posTag == null) {
posTag = new PosTag(posAttr.getPartOfSpeech());
adhocTags.put(posAttr.getPartOfSpeech(), posTag);
log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
}
}
// Sentence detection by POS tag
if (sentStartOffset < 0) {
// no sentence is currently open (the previous token ended one), so this token starts a new sentence
sentStartOffset = offset.startOffset();
}
if (posTag.hasPos(Pos.Point)) {
Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
// add the sentence as context to the NerData instances
while (nerSentIndex < nerList.size()) {
nerList.get(nerSentIndex).context = sent.getSpan();
nerSentIndex++;
}
sentStartOffset = -1;
}
// POS
token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
// NER
NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
// write NER annotation
Chunk chunk = at.addChunk(ner.start, ner.end);
chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
// NOTE that the fise:TextAnnotation are written later based on the nerList
// clean up
ner = null;
}
if (nerTag != null) {
if (ner == null) {
ner = new NerData(nerTag, offset.startOffset());
nerList.add(ner);
}
ner.end = offset.endOffset();
}
BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
MorphoFeatures morpho = null;
if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
// and add the posTag
morpho.addPos(posTag);
}
InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
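// the inflection form/type are read from the attribute but currently not mapped to any annotation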
inflectionAttr.getInflectionForm();
inflectionAttr.getInflectionType();
if (morpho != null) {
// if present add the morpho
token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
}
}
// we still need to write the last sentence
Sentence lastSent = null;
if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
lastSent = at.addSentence(sentStartOffset, offset.endOffset());
}
// and set the context of the remaining named entities
while (nerSentIndex < nerList.size()) {
if (lastSent != null) {
nerList.get(nerSentIndex).context = lastSent.getSpan();
} else {
// no sentence detected
nerList.get(nerSentIndex).context = at.getSpan();
}
nerSentIndex++;
}
} catch (IOException e) {
throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
} finally {
try {
tokenStream.close();
} catch (IOException e) {
/* ignore */
}
}
// finally write the NER annotations to the metadata of the ContentItem
final Graph metadata = ci.getMetadata();
ci.getLock().writeLock().lock();
try {
Language lang = new Language("ja");
for (NerData nerData : nerList) {
IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
}
} finally {
ci.getLock().writeLock().unlock();
}
}
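NerData is a small value holder that is not shown above; its shape follows from how the loop uses it (tag, start, end, context). A sketch under that assumption (the real class may differ in detail):
class NerData {
    final NerTag tag;
    final int start;
    int end;
    String context;

    NerData(NerTag tag, int start) {
        this.tag = tag;
        this.start = start;
        // end is extended while consecutive tokens carry the same NER type
        this.end = start;
    }
}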