
Example 51 with EngineException

Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class KuromojiNlpEngine, method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    final AnalysedText at = initAnalysedText(this, analysedTextFactory, ci);
    String language = getLanguage(this, ci, false);
    if (!("ja".equals(language) || (language != null && language.startsWith("ja-")))) {
        throw new IllegalStateException("The detected language is NOT 'ja'! " + "As this is also checked within the #canEnhance(..) method this " + "indicates a bug in the used EnhancementJobManager implementation. " + "Please report this on the dev@stanbol.apache.org list or create a " + "JIRA issue about this.");
    }
    //start with the Tokenizer
    TokenStream tokenStream = tokenizerFactory.create(new CharSequenceReader(at.getText()));
    //build the analyzing chain by adding all TokenFilters
    for (TokenFilterFactory filterFactory : filterFactories) {
        tokenStream = filterFactory.create(tokenStream);
    }
    //Try to extract sentences based on POS tags ...
    int sentStartOffset = -1;
    //NER data
    List<NerData> nerList = new ArrayList<NerData>();
    //the next index where the NerData.context needs to be set
    int nerSentIndex = 0;
    NerData ner = null;
    OffsetAttribute offset = null;
    try {
        //required with Solr 4
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            offset = tokenStream.addAttribute(OffsetAttribute.class);
            Token token = at.addToken(offset.startOffset(), offset.endOffset());
            //Get the POS attribute and init the PosTag
            PartOfSpeechAttribute posAttr = tokenStream.addAttribute(PartOfSpeechAttribute.class);
            PosTag posTag = POS_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (posTag == null) {
                posTag = adhocTags.get(posAttr.getPartOfSpeech());
                if (posTag == null) {
                    posTag = new PosTag(posAttr.getPartOfSpeech());
                    adhocTags.put(posAttr.getPartOfSpeech(), posTag);
                    log.warn(" ... missing PosTag mapping for {}", posAttr.getPartOfSpeech());
                }
            }
            //Sentence detection by POS tag
            if (sentStartOffset < 0) {
                //the last token was a sentence ending
                sentStartOffset = offset.startOffset();
            }
            if (posTag.hasPos(Pos.Point)) {
                Sentence sent = at.addSentence(sentStartOffset, offset.startOffset());
                //add the sentence as context to the NerData instances
                while (nerSentIndex < nerList.size()) {
                    nerList.get(nerSentIndex).context = sent.getSpan();
                    nerSentIndex++;
                }
                sentStartOffset = -1;
            }
            //POS
            token.addAnnotation(POS_ANNOTATION, Value.value(posTag));
            //NER
            NerTag nerTag = NER_TAG_SET.getTag(posAttr.getPartOfSpeech());
            if (ner != null && (nerTag == null || !ner.tag.getType().equals(nerTag.getType()))) {
                //write NER annotation
                Chunk chunk = at.addChunk(ner.start, ner.end);
                chunk.addAnnotation(NlpAnnotations.NER_ANNOTATION, Value.value(ner.tag));
                //NOTE that the fise:TextAnnotations are written later based on the nerList
                //clean up
                ner = null;
            }
            if (nerTag != null) {
                if (ner == null) {
                    ner = new NerData(nerTag, offset.startOffset());
                    nerList.add(ner);
                }
                ner.end = offset.endOffset();
            }
            BaseFormAttribute baseFormAttr = tokenStream.addAttribute(BaseFormAttribute.class);
            MorphoFeatures morpho = null;
            if (baseFormAttr != null && baseFormAttr.getBaseForm() != null) {
                morpho = new MorphoFeatures(baseFormAttr.getBaseForm());
                //and add the posTag
                morpho.addPos(posTag);
            }
            InflectionAttribute inflectionAttr = tokenStream.addAttribute(InflectionAttribute.class);
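            //NOTE: the inflection form and type are read here but their return values are currently unused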
            inflectionAttr.getInflectionForm();
            inflectionAttr.getInflectionType();
            if (morpho != null) {
                //if present add the morpho
                token.addAnnotation(MORPHO_ANNOTATION, Value.value(morpho));
            }
        }
        //we still need to write the last sentence
        Sentence lastSent = null;
        if (offset != null && sentStartOffset >= 0 && offset.endOffset() > sentStartOffset) {
            lastSent = at.addSentence(sentStartOffset, offset.endOffset());
        }
        //and set the context of remaining named entities
        while (nerSentIndex < nerList.size()) {
            if (lastSent != null) {
                nerList.get(nerSentIndex).context = lastSent.getSpan();
            } else {
                //no sentence detected
                nerList.get(nerSentIndex).context = at.getSpan();
            }
            nerSentIndex++;
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, "Exception while reading from " + "AnalyzedText contentpart", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            /* ignore */
        }
    }
    //finally write the NER annotations to the metadata of the ContentItem
    final Graph metadata = ci.getMetadata();
    ci.getLock().writeLock().lock();
    try {
        Language lang = new Language("ja");
        for (NerData nerData : nerList) {
            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(at.getSpan().substring(nerData.start, nerData.end), lang)));
            metadata.add(new TripleImpl(ta, DC_TYPE, nerData.tag.getType()));
            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(nerData.start)));
            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(nerData.end)));
            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(nerData.context, lang)));
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IRI(org.apache.clerezza.commons.rdf.IRI) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Token(org.apache.stanbol.enhancer.nlp.model.Token) NlpEngineHelper.initAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.initAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) CharSequenceReader(org.apache.commons.io.input.CharSequenceReader) PosTag(org.apache.stanbol.enhancer.nlp.pos.PosTag) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) BaseFormAttribute(org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) MorphoFeatures(org.apache.stanbol.enhancer.nlp.morpho.MorphoFeatures) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) InflectionAttribute(org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) PartOfSpeechAttribute(org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute) IOException(java.io.IOException) Chunk(org.apache.stanbol.enhancer.nlp.model.Chunk) TokenFilterFactory(org.apache.lucene.analysis.util.TokenFilterFactory) Graph(org.apache.clerezza.commons.rdf.Graph) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute)
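
The token loop above follows the standard Lucene TokenStream contract: register attributes up front, call reset() before the first incrementToken() (mandatory since Lucene/Solr 4, hence the comment in the engine), call end() after the loop and close() in a finally block. A minimal standalone sketch of that contract, assuming the lucene-analyzers-kuromoji module is on the classpath (JapaneseAnalyzer constructor signatures vary between Lucene versions):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ja.JapaneseAnalyzer;
import org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenStreamContractSketch {

    public static void main(String[] args) throws IOException {
        //JapaneseAnalyzer bundles the Kuromoji tokenizer with its default filter chain
        JapaneseAnalyzer analyzer = new JapaneseAnalyzer();
        TokenStream ts = analyzer.tokenStream("text", new StringReader("今日は良い天気です。"));
        //attributes are registered once and updated in place on every increment
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        PartOfSpeechAttribute pos = ts.addAttribute(PartOfSpeechAttribute.class);
        try {
            ts.reset(); //mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(offset.startOffset() + ".." + offset.endOffset() + " : " + pos.getPartOfSpeech());
            }
            ts.end(); //finalises the offset state after the last token
        } finally {
            ts.close();
            analyzer.close();
        }
    }
}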

Example 52 with EngineException

Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class TestLocationEnhancementEngine, method testLocationEnhancementEngine.

@Test
public void testLocationEnhancementEngine() throws IOException, EngineException {
    //create a content item
    ContentItem ci = getContentItem("urn:org.apache:stanbol.enhancer:text:content-item:person", CONTEXT);
    //add three text annotations to be consumed by this test
    getTextAnnotation(ci, PERSON, CONTEXT, DBPEDIA_PERSON);
    getTextAnnotation(ci, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
    getTextAnnotation(ci, PLACE, CONTEXT, DBPEDIA_PLACE);
    //perform the computation of the enhancements
    try {
        locationEnhancementEngine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e, "overloaded with requests");
        return;
    }
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(locationEnhancementEngine.getClass().getName()));
    //adding null as expected for confidence makes it a required property
    expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
    /*
         * Note:
         *  - Expected results depend on the geonames.org data. So if the test
         *    fails it may also mean that the data provided by geonames.org have
         *    changed
         */
    int entityAnnotationCount = validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
    //two suggestions for New Zealand and one hierarchy entry for the first
    //suggestion
    //NOTE 2012-10-10: changed expected value back to "3" as geonames.org
    //   again returns "Oceania" as parent for "New Zealand"
    //NOTE 2012-11-12: deactivated this check, because the fact that
    //   "Oceania" is returned as parent for "New Zealand" changes every
    //   few weeks
    //assertEquals(3, entityAnnotationCount);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) HashMap(java.util.HashMap) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)
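
The expectedValues map drives a generic validator: mapping a property to null marks it as required without pinning its value. A hedged sketch of how such a check could look; validateRequiredProperty is an illustrative helper name, not the actual test-helper API:

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.RDFTerm;
import org.apache.clerezza.commons.rdf.Triple;

public final class AnnotationChecks {

    /** Illustrative helper: a null expected value only asserts that the property is present. */
    public static void validateRequiredProperty(Graph metadata, IRI annotation, IRI property, RDFTerm expected) {
        //filter(subject, predicate, object) matches triples; null acts as a wildcard
        Iterator<Triple> it = metadata.filter(annotation, property, null);
        assertTrue("missing required property " + property, it.hasNext());
        if (expected != null) {
            assertEquals(expected, it.next().getObject());
        }
    }
}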

Example 53 with EngineException

Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class HtmlExtractorEngine, method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
    Graph model = new SimpleGraph();
    ci.getLock().readLock().lock();
    try {
        extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(), null, ci.getMimeType(), model);
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with HtmlExtractor", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    ClerezzaRDFUtils.urifyBlankNodes(model);
    // make the model single rooted
    if (singleRootRdf) {
        ClerezzaRDFUtils.makeConnected(model, ci.getUri(), new IRI(NIE_NS + "contains"));
    }
    //add the extracted triples to the metadata of the ContentItem
    ci.getLock().writeLock().lock();
    try {
        LOG.info("Model: {}", model);
        ci.getMetadata().addAll(model);
        model = null;
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) ExtractorException(org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) HtmlExtractor(org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor)
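
The engine above shows the ContentItem locking discipline used throughout Stanbol: hold the read lock while consuming the content stream, build the new triples in a local graph, and take the write lock only for the final addAll. A condensed sketch of that pattern, with parseInto standing in for an application-specific extraction step (illustrative, not Stanbol API):

import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;

public final class LockDiscipline {

    /** Condensed sketch of the read/parse/write-lock pattern used by HtmlExtractorEngine. */
    public static void enhance(ContentItem ci) throws Exception {
        Graph model = new SimpleGraph();
        ci.getLock().readLock().lock();
        try {
            //read-only access to the content stream while holding the read lock
            parseInto(model, ci);
        } finally {
            ci.getLock().readLock().unlock();
        }
        //post-process the local graph without holding any lock
        ci.getLock().writeLock().lock();
        try {
            //mutating the ContentItem metadata requires the write lock
            ci.getMetadata().addAll(model);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }

    //illustrative placeholder for an application-specific extraction step,
    //e.g. reading ci.getStream() with ci.getMimeType() and adding triples to model
    private static void parseInto(Graph model, ContentItem ci) throws Exception {
    }
}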

Example 54 with EngineException

Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class NamedEntityTaggingEngine, method computeEnhancements.

public void computeEnhancements(ContentItem ci) throws EngineException {
    final Site site;
    if (referencedSiteID != null) {
        // lookup the referenced site
        site = siteManager.getSite(referencedSiteID);
        // ensure that it is present
        if (site == null) {
            String msg = String.format("Unable to enhance %s because Referenced Site %s is currently not active!", ci.getUri().getUnicodeString(), referencedSiteID);
            log.warn(msg);
            // throw new EngineException(msg);
            return;
        }
        // and that it supports offline mode if required
        if (isOfflineMode() && !site.supportsLocalMode()) {
            log.warn("Unable to enhance ci {} because OfflineMode is not supported by ReferencedSite {}.", ci.getUri().getUnicodeString(), site.getId());
            return;
        }
    } else {
        // null indicates to use the Entityhub to lookup Entities
        site = null;
    }
    Graph graph = ci.getMetadata();
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    // Retrieve the existing text annotations (requires read lock)
    Map<NamedEntity, List<IRI>> textAnnotations = new HashMap<NamedEntity, List<IRI>>();
    // the language extracted for the parsed content, or null if not available
    String contentLanguage;
    ci.getLock().readLock().lock();
    try {
        contentLanguage = EnhancementEngineHelper.getLanguage(ci);
        for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it.hasNext(); ) {
            IRI uri = (IRI) it.next().getSubject();
            if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
                // skip
                continue;
            }
            NamedEntity namedEntity = NamedEntity.createFromTextAnnotation(graph, uri);
            if (namedEntity != null) {
                // This is a first occurrence, collect any subsumed
                // annotations
                List<IRI> subsumed = new ArrayList<IRI>();
                for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2.hasNext(); ) {
                    subsumed.add((IRI) it2.next().getSubject());
                }
                textAnnotations.put(namedEntity, subsumed);
            }
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
    // search the suggestions
    Map<NamedEntity, List<Suggestion>> suggestions = new HashMap<NamedEntity, List<Suggestion>>(textAnnotations.size());
    for (Entry<NamedEntity, List<IRI>> entry : textAnnotations.entrySet()) {
        try {
            List<Suggestion> entitySuggestions = computeEntityRecommentations(site, entry.getKey(), entry.getValue(), contentLanguage);
            if (entitySuggestions != null && !entitySuggestions.isEmpty()) {
                suggestions.put(entry.getKey(), entitySuggestions);
            }
        } catch (EntityhubException e) {
            throw new EngineException(this, ci, e);
        }
    }
    // now write the results (requires write lock)
    ci.getLock().writeLock().lock();
    try {
        RdfValueFactory factory = RdfValueFactory.getInstance();
        Map<String, Representation> entityData = new HashMap<String, Representation>();
        for (Entry<NamedEntity, List<Suggestion>> entitySuggestions : suggestions.entrySet()) {
            List<IRI> subsumed = textAnnotations.get(entitySuggestions.getKey());
            List<BlankNodeOrIRI> annotationsToRelate = new ArrayList<BlankNodeOrIRI>(subsumed);
            annotationsToRelate.add(entitySuggestions.getKey().getEntity());
            for (Suggestion suggestion : entitySuggestions.getValue()) {
                log.debug("Add Suggestion {} for {}", suggestion.getEntity().getId(), entitySuggestions.getKey());
                EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, ci.getUri(), annotationsToRelate, suggestion, nameField, contentLanguage == null ? DEFAULT_LANGUAGE : contentLanguage);
                if (dereferenceEntities) {
                    entityData.put(suggestion.getEntity().getId(), suggestion.getEntity().getRepresentation());
                }
            }
        }
        // if dereferenceEntities was enabled, entityData now holds the Representations to add; otherwise it is empty
        for (Representation rep : entityData.values()) {
            graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph());
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : Site(org.apache.stanbol.entityhub.servicesapi.site.Site) IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) List(java.util.List) ArrayList(java.util.ArrayList) QueryResultList(org.apache.stanbol.entityhub.servicesapi.query.QueryResultList) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) RdfValueFactory(org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory)
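
The non-obvious step above is the subsumption filter: a fise:TextAnnotation that carries a dc:relation to another annotation is a repeated mention of an earlier occurrence and is skipped, while its URI is collected so suggestions can later be related to it. A minimal sketch of that filter using the Clerezza Graph API; isFirstOccurrence is an illustrative helper name, not Stanbol API:

import java.util.Iterator;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.Triple;

public final class SubsumptionFilter {

    /** Illustrative helper: true if the annotation does not point to another one via dc:relation. */
    public static boolean isFirstOccurrence(Graph graph, IRI textAnnotation, IRI dcRelation) {
        //an annotation with an outgoing dc:relation is subsumed by an earlier occurrence
        Iterator<Triple> it = graph.filter(textAnnotation, dcRelation, null);
        return !it.hasNext();
    }
}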

Example 55 with EngineException

Use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class NlpEngineHelper, method initAnalysedText.

/**
     * Retrieves - or if not present - creates the {@link AnalysedText} content
     * part for the parsed {@link ContentItem}. If no {@link Blob} with the
     * mime type '<code>text/plain</code>' is present this method
     * throws an {@link IllegalStateException} (this method internally uses
     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
     * <code>true</code> as third parameter). Users should call
     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
     * <code>false</code> as third parameter in their
     * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
     * <i>NOTE:</i> This method is intended for Engines that want to create an
     * empty {@link AnalysedText} content part. Engines that assume that this
     * content part is already present (e.g. if they consume already existing
     * annotations) should use the 
     * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
     * method instead.
     * @param engine the EnhancementEngine calling this method (used for logging)
     * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
     * the {@link AnalysedText} instance (if not present).
     * @param ci the {@link ContentItem}
     * @return the AnalysedText
     * @throws EngineException on any exception while accessing the 
     * '<code>text/plain</code>' Blob
     * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
     * present as content part of the parsed {@link ContentItem} or the parsed
     * {@link AnalysedTextFactory} is <code>null</code>. <i>NOTE</i> that 
     * {@link IllegalStateException} are only thrown if the {@link AnalysedText}
     * ContentPart is not yet present in the parsed {@link ContentItem}
     */
public static AnalysedText initAnalysedText(EnhancementEngine engine, AnalysedTextFactory analysedTextFactory, ContentItem ci) throws EngineException {
    AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
    if (at == null) {
        if (analysedTextFactory == null) {
            throw new IllegalStateException("Unable to initialise AnalysedText" + "ContentPart because the parsed AnalysedTextFactory is NULL");
        }
        Entry<IRI, Blob> textBlob = getPlainText(engine, ci, true);
        //we need to create the AnalysedText content part
        ci.getLock().writeLock().lock();
        try {
            //try again to retrieve (maybe a concurrent thread has created
            //the content part in the meantime)
            at = AnalysedTextUtils.getAnalysedText(ci);
            if (at == null) {
                log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
                at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
            }
        } catch (IOException e) {
            throw new EngineException("Unable to create AnalysetText instance for Blob " + textBlob.getKey() + " of ContentItem " + ci.getUri() + "!", e);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } else {
        log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
    }
    return at;
}
Also used : AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException)
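
initAnalysedText is a textbook double-checked initialisation against the ContentItem write lock: the cheap lookup runs unlocked and is repeated after acquiring the lock, because another engine may have created the content part in between. The same pattern reduced to its skeleton; lookup and create are illustrative placeholders for AnalysedTextUtils.getAnalysedText(ci) and analysedTextFactory.createAnalysedText(...):

import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;

public final class DoubleCheckedInit {

    /** Skeleton of the double-checked content-part initialisation used above. */
    public static AnalysedText init(ContentItem ci) {
        AnalysedText at = lookup(ci); //cheap, unlocked first check
        if (at == null) {
            ci.getLock().writeLock().lock();
            try {
                //re-check under the lock: a concurrent engine may have created it meanwhile
                at = lookup(ci);
                if (at == null) {
                    at = create(ci);
                }
            } finally {
                ci.getLock().writeLock().unlock();
            }
        }
        return at;
    }

    //illustrative placeholders standing in for AnalysedTextUtils.getAnalysedText(ci)
    //and analysedTextFactory.createAnalysedText(...)
    private static AnalysedText lookup(ContentItem ci) { return null; }
    private static AnalysedText create(ContentItem ci) { throw new UnsupportedOperationException(); }
}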

Aggregations

EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException): 55
IRI (org.apache.clerezza.commons.rdf.IRI): 37
IOException (java.io.IOException): 33
Graph (org.apache.clerezza.commons.rdf.Graph): 24
TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl): 23
PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl): 20
AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText): 15
Blob (org.apache.stanbol.enhancer.servicesapi.Blob): 15
HashMap (java.util.HashMap): 13
RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm): 13
ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem): 12
BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI): 10
InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException): 10
Test (org.junit.Test): 10
Triple (org.apache.clerezza.commons.rdf.Triple): 9
InputStream (java.io.InputStream): 8
SOAPException (javax.xml.soap.SOAPException): 8
Token (org.apache.stanbol.enhancer.nlp.model.Token): 8
Language (org.apache.clerezza.commons.rdf.Language): 7
LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory): 7