Search in sources :

Example 41 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class EntityLinker method lookupEntities.

/**
 * Searches for Entities in the {@link #entitySearcher} corresponding to the
 * {@link Token#getText() words} of the current {@link #state position} in
 * the text.
 * @param searchStrings the list of {@link Token#getText() words} to search
 * entities for.
 * @return The sorted list with the suggestions.
 * If there are no suggestions an empty list will be returned.
 */
private List<Suggestion> lookupEntities(List<String> searchStrings) throws EngineException {
    Collection<? extends Representation> results;
    try {
        results = entitySearcher.lookup(config.getNameField(), config.getSelectedFields(), searchStrings, state.getSentence().getLanguage(), config.getDefaultLanguage());
    } catch (RuntimeException e) {
        throw new EngineException(e.getMessage(), e);
    }
    List<Suggestion> suggestions = new ArrayList<Suggestion>();
    for (Representation result : results) {
        Suggestion match = matchLabels(result);
        if (match.getMatch() != MATCH.NONE) {
            suggestions.add(match);
        }
    }
    // sort the suggestions
    if (suggestions.size() > 1) {
        Collections.sort(suggestions, Suggestion.DEFAULT_SUGGESTION_COMPARATOR);
    }
    // remove all elements > config.getMaxSuggestions()
    return suggestions;
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ArrayList(java.util.ArrayList) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation)

Example 42 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class HtmlExtractorEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    HtmlExtractor extractor = new HtmlExtractor(htmlExtractorRegistry, htmlParser);
    Graph model = new SimpleGraph();
    ci.getLock().readLock().lock();
    try {
        extractor.extract(ci.getUri().getUnicodeString(), ci.getStream(), null, ci.getMimeType(), model);
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with HtmlExtractor", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    ClerezzaRDFUtils.urifyBlankNodes(model);
    // make the model single rooted
    if (singleRootRdf) {
        ClerezzaRDFUtils.makeConnected(model, ci.getUri(), new IRI(NIE_NS + "contains"));
    }
    // add the extracted triples to the metadata of the ContentItem
    ci.getLock().writeLock().lock();
    try {
        LOG.info("Model: {}", model);
        ci.getMetadata().addAll(model);
        model = null;
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) ExtractorException(org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) HtmlExtractor(org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor)

Example 43 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class NamedEntityTaggingEngine method computeEnhancements.

public void computeEnhancements(ContentItem ci) throws EngineException {
    final Site site;
    if (referencedSiteID != null) {
        // lookup the referenced site
        site = siteManager.getSite(referencedSiteID);
        // ensure that it is present
        if (site == null) {
            String msg = String.format("Unable to enhance %s because Referenced Site %s is currently not active!", ci.getUri().getUnicodeString(), referencedSiteID);
            log.warn(msg);
            // throw new EngineException(msg);
            return;
        }
        // and that it supports offline mode if required
        if (isOfflineMode() && !site.supportsLocalMode()) {
            log.warn("Unable to enhance ci {} because OfflineMode is not supported by ReferencedSite {}.", ci.getUri().getUnicodeString(), site.getId());
            return;
        }
    } else {
        // null indicates to use the Entityhub to lookup Entities
        site = null;
    }
    Graph graph = ci.getMetadata();
    LiteralFactory literalFactory = LiteralFactory.getInstance();
    // Retrieve the existing text annotations (requires read lock)
    Map<NamedEntity, List<IRI>> textAnnotations = new HashMap<NamedEntity, List<IRI>>();
    // the language extracted for the parsed content or NULL if not
    // available
    String contentLangauge;
    ci.getLock().readLock().lock();
    try {
        contentLangauge = EnhancementEngineHelper.getLanguage(ci);
        for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, TechnicalClasses.ENHANCER_TEXTANNOTATION); it.hasNext(); ) {
            IRI uri = (IRI) it.next().getSubject();
            if (graph.filter(uri, Properties.DC_RELATION, null).hasNext()) {
                // skip
                continue;
            }
            NamedEntity namedEntity = NamedEntity.createFromTextAnnotation(graph, uri);
            if (namedEntity != null) {
                // This is a first occurrence, collect any subsumed
                // annotations
                List<IRI> subsumed = new ArrayList<IRI>();
                for (Iterator<Triple> it2 = graph.filter(null, Properties.DC_RELATION, uri); it2.hasNext(); ) {
                    subsumed.add((IRI) it2.next().getSubject());
                }
                textAnnotations.put(namedEntity, subsumed);
            }
        }
    } finally {
        ci.getLock().readLock().unlock();
    }
    // search the suggestions
    Map<NamedEntity, List<Suggestion>> suggestions = new HashMap<NamedEntity, List<Suggestion>>(textAnnotations.size());
    for (Entry<NamedEntity, List<IRI>> entry : textAnnotations.entrySet()) {
        try {
            List<Suggestion> entitySuggestions = computeEntityRecommentations(site, entry.getKey(), entry.getValue(), contentLangauge);
            if (entitySuggestions != null && !entitySuggestions.isEmpty()) {
                suggestions.put(entry.getKey(), entitySuggestions);
            }
        } catch (EntityhubException e) {
            throw new EngineException(this, ci, e);
        }
    }
    // now write the results (requires write lock)
    ci.getLock().writeLock().lock();
    try {
        RdfValueFactory factory = RdfValueFactory.getInstance();
        Map<String, Representation> entityData = new HashMap<String, Representation>();
        for (Entry<NamedEntity, List<Suggestion>> entitySuggestions : suggestions.entrySet()) {
            List<IRI> subsumed = textAnnotations.get(entitySuggestions.getKey());
            List<BlankNodeOrIRI> annotationsToRelate = new ArrayList<BlankNodeOrIRI>(subsumed);
            annotationsToRelate.add(entitySuggestions.getKey().getEntity());
            for (Suggestion suggestion : entitySuggestions.getValue()) {
                log.debug("Add Suggestion {} for {}", suggestion.getEntity().getId(), entitySuggestions.getKey());
                EnhancementRDFUtils.writeEntityAnnotation(this, literalFactory, graph, ci.getUri(), annotationsToRelate, suggestion, nameField, // header)?!
                contentLangauge == null ? DEFAULT_LANGUAGE : contentLangauge);
                if (dereferenceEntities) {
                    entityData.put(suggestion.getEntity().getId(), suggestion.getEntity().getRepresentation());
                }
            }
        }
        // Representations to add! If false entityData will be empty
        for (Representation rep : entityData.values()) {
            graph.addAll(factory.toRdfRepresentation(rep).getRdfGraph());
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : Site(org.apache.stanbol.entityhub.servicesapi.site.Site) IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) List(java.util.List) ArrayList(java.util.ArrayList) QueryResultList(org.apache.stanbol.entityhub.servicesapi.query.QueryResultList) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) RdfValueFactory(org.apache.stanbol.entityhub.model.clerezza.RdfValueFactory)

Example 44 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class TestLocationEnhancementEngine method testLocationEnhancementEngine.

@Test
public void testLocationEnhancementEngine() throws IOException, EngineException {
    // create a content item
    ContentItem ci = getContentItem("urn:org.apache:stanbol.enhancer:text:content-item:person", CONTEXT);
    // add three text annotations to be consumed by this test
    getTextAnnotation(ci, PERSON, CONTEXT, DBPEDIA_PERSON);
    getTextAnnotation(ci, ORGANISATION, CONTEXT, DBPEDIA_ORGANISATION);
    getTextAnnotation(ci, PLACE, CONTEXT, DBPEDIA_PLACE);
    // perform the computation of the enhancements
    try {
        locationEnhancementEngine.computeEnhancements(ci);
    } catch (EngineException e) {
        RemoteServiceHelper.checkServiceUnavailable(e, "overloaded with requests");
        return;
    }
    Map<IRI, RDFTerm> expectedValues = new HashMap<IRI, RDFTerm>();
    expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, ci.getUri());
    expectedValues.put(Properties.DC_CREATOR, LiteralFactory.getInstance().createTypedLiteral(locationEnhancementEngine.getClass().getName()));
    // adding null as expected for confidence makes it a required property
    expectedValues.put(Properties.ENHANCER_CONFIDENCE, null);
    /*
         * Note:
         *  - Expected results depend on the geonames.org data. So if the test
         *    fails it may also mean that the data provided by geonames.org have
         *    changed
         */
    int entityAnnotationCount = validateAllEntityAnnotations(ci.getMetadata(), expectedValues);
// two suggestions for New Zealand and one hierarchy entry for the first
// suggestion
// NOTE 2012-10-10: changed expected value back to "3" as geonames.org
// again returns "Oceania" as parent for "New Zealand"
// NOTE: 2012-11-12: deactivated this check, because this the fact that
// "Oceania" is returned as parent for "New Zealand" changes every
// every view weeks
// assertEquals(3, entityAnnotationCount);
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) HashMap(java.util.HashMap) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 45 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

the class RefactorEnhancementEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    // Prepare the OntoNet environment. First we create the OntoNet session in which run the whole
    final Session session;
    try {
        session = sessionManager.createSession();
    } catch (SessionLimitException e1) {
        throw new EngineException("OntoNet session quota reached. The Refactor Engine requires its own new session to execute.");
    }
    if (session == null)
        throw new EngineException("Failed to create OntoNet session. The Refactor Engine requires its own new session to execute.");
    log.debug("Refactor enhancement job will run in session '{}'.", session.getID());
    // Retrieve and filter the metadata graph for entities recognized by the engines.
    final Graph metadataGraph = ci.getMetadata(), signaturesGraph = new IndexedGraph();
    // FIXME the Stanbol Enhancer vocabulary should be retrieved from somewhere in the enhancer API.
    final IRI ENHANCER_ENTITY_REFERENCE = new IRI("http://fise.iks-project.eu/ontology/entity-reference");
    Iterator<Triple> tripleIt = metadataGraph.filter(null, ENHANCER_ENTITY_REFERENCE, null);
    while (tripleIt.hasNext()) {
        // Get the entity URI
        RDFTerm obj = tripleIt.next().getObject();
        if (!(obj instanceof IRI)) {
            log.warn("Invalid IRI for entity reference {}. Skipping.", obj);
            continue;
        }
        final String entityReference = ((IRI) obj).getUnicodeString();
        log.debug("Trying to resolve entity {}", entityReference);
        // Populate the entity signatures graph, by querying either the Entity Hub or the dereferencer.
        if (engineConfiguration.isEntityHubUsed()) {
            Graph result = populateWithEntity(entityReference, signaturesGraph);
            if (result != signaturesGraph && result != null) {
                log.warn("Entity Hub query added triples to a new graph instead of populating the supplied one!" + " New signatures will be discarded.");
            }
        } else
            try {
                OntologyInputSource<Graph> source = new GraphContentSourceWithPhysicalIRI(dereferencer.resolve(entityReference), org.semanticweb.owlapi.model.IRI.create(entityReference));
                signaturesGraph.addAll(source.getRootOntology());
            } catch (FileNotFoundException e) {
                log.error("Failed to dereference entity " + entityReference + ". Skipping.", e);
                continue;
            }
    }
    try {
        /*
             * The dedicated session for this job will store the following: (1) all the (merged) signatures
             * for all detected entities; (2) the original content metadata graph returned earlier in the
             * chain.
             * 
             * There is no chance that (2) could be null, as it was previously controlled by the JobManager
             * through the canEnhance() method and the computeEnhancement is always called iff the former
             * returns true.
             */
        session.addOntology(new GraphSource(signaturesGraph));
        session.addOntology(new GraphSource(metadataGraph));
    } catch (UnmodifiableOntologyCollectorException e1) {
        throw new EngineException("Cannot add enhancement graph to OntoNet session for refactoring", e1);
    }
    try {
        /*
             * Export the entire session (incl. entities and enhancement graph) as a single merged ontology.
             * 
             * TODO the refactorer should have methods to accommodate an OntologyCollector directly instead.
             */
        OWLOntology ontology = session.export(OWLOntology.class, true);
        log.debug("Refactoring recipe IRI is : " + engineConfiguration.getRecipeId());
        /*
             * We pass the ontology and the recipe IRI to the Refactor that returns the refactored graph
             * expressed by using the given vocabulary.
             * 
             * To perform the refactoring of the ontology to a given vocabulary we use the Stanbol Refactor.
             */
        Recipe recipe = ruleStore.getRecipe(new IRI(engineConfiguration.getRecipeId()));
        log.debug("Recipe {} contains {} rules.", recipe, recipe.getRuleList().size());
        log.debug("The ontology to be refactor is {}", ontology);
        Graph tc = refactorer.graphRefactoring(OWLAPIToClerezzaConverter.owlOntologyToClerezzaGraph(ontology), recipe);
        /*
             * The newly generated ontology is converted to Clarezza format and then added os substitued to
             * the old mGraph.
             */
        if (engineConfiguration.isInGraphAppendMode()) {
            log.debug("Metadata of the content will replace old ones.", this);
        } else {
            metadataGraph.clear();
            log.debug("Content metadata will be appended to the existing ones.", this);
        }
        metadataGraph.addAll(tc);
    } catch (RefactoringException e) {
        String msg = "Refactor engine execution failed on content item " + ci + ".";
        log.error(msg, e);
        throw new EngineException(msg, e);
    } catch (NoSuchRecipeException e) {
        String msg = "Refactor engine could not find recipe " + engineConfiguration.getRecipeId() + " to refactor content item " + ci + ".";
        log.error(msg, e);
        throw new EngineException(msg, e);
    } catch (Exception e) {
        throw new EngineException("Refactor Engine has failed.", e);
    } finally {
        /*
             * The session needs to be destroyed anyhow.
             * 
             * Clear contents before destroying (FIXME only do this until this is implemented in the
             * destroySession() method).
             */
        for (OWLOntologyID id : session.listManagedOntologies()) {
            try {
                String key = ontologyProvider.getKey(id.getOntologyIRI());
                ontologyProvider.getStore().deleteGraph(new IRI(key));
            } catch (Exception ex) {
                log.error("Failed to delete triple collection " + id, ex);
                continue;
            }
        }
        sessionManager.destroySession(session.getID());
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Recipe(org.apache.stanbol.rules.base.api.Recipe) NoSuchRecipeException(org.apache.stanbol.rules.base.api.NoSuchRecipeException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) FileNotFoundException(java.io.FileNotFoundException) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) SessionLimitException(org.apache.stanbol.ontologymanager.servicesapi.session.SessionLimitException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) OWLOntologyCreationException(org.semanticweb.owlapi.model.OWLOntologyCreationException) RecipeConstructionException(org.apache.stanbol.rules.base.api.RecipeConstructionException) ConfigurationException(org.osgi.service.cm.ConfigurationException) RefactoringException(org.apache.stanbol.rules.refactor.api.RefactoringException) FileNotFoundException(java.io.FileNotFoundException) RecipeEliminationException(org.apache.stanbol.rules.base.api.RecipeEliminationException) NoSuchRecipeException(org.apache.stanbol.rules.base.api.NoSuchRecipeException) SessionLimitException(org.apache.stanbol.ontologymanager.servicesapi.session.SessionLimitException) UnmodifiableOntologyCollectorException(org.apache.stanbol.ontologymanager.servicesapi.collector.UnmodifiableOntologyCollectorException) AlreadyExistingRecipeException(org.apache.stanbol.rules.base.api.AlreadyExistingRecipeException) DuplicateIDException(org.apache.stanbol.ontologymanager.servicesapi.collector.DuplicateIDException) IOException(java.io.IOException) Triple(org.apache.clerezza.commons.rdf.Triple) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Graph(org.apache.clerezza.commons.rdf.Graph) UnmodifiableOntologyCollectorException(org.apache.stanbol.ontologymanager.servicesapi.collector.UnmodifiableOntologyCollectorException) OWLOntology(org.semanticweb.owlapi.model.OWLOntology) OWLOntologyID(org.semanticweb.owlapi.model.OWLOntologyID) GraphSource(org.apache.stanbol.ontologymanager.sources.clerezza.GraphSource) OntologyInputSource(org.apache.stanbol.ontologymanager.servicesapi.io.OntologyInputSource) RefactoringException(org.apache.stanbol.rules.refactor.api.RefactoringException) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) Session(org.apache.stanbol.ontologymanager.servicesapi.session.Session)

Aggregations

EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)55 IRI (org.apache.clerezza.commons.rdf.IRI)37 IOException (java.io.IOException)33 Graph (org.apache.clerezza.commons.rdf.Graph)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)23 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)20 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)15 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)15 HashMap (java.util.HashMap)13 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)13 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)12 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)10 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)10 Test (org.junit.Test)10 Triple (org.apache.clerezza.commons.rdf.Triple)9 InputStream (java.io.InputStream)8 SOAPException (javax.xml.soap.SOAPException)8 Token (org.apache.stanbol.enhancer.nlp.model.Token)8 Language (org.apache.clerezza.commons.rdf.Language)7 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)7