Search in sources :

Example 6 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TikaEngineTest method testRtf.

@Test
public void testRtf() throws EngineException, IOException {
    log.info(">>> testRtf <<<");
    ContentItem ci = createContentItem("test.rtf", "application/rtf");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(contentPart);
    Blob plainTextBlob = contentPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob, "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
    //validate XHTML results
    contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(contentPart);
    Blob xhtmlBlob = contentPart.getValue();
    assertNotNull(xhtmlBlob);
    assertContentRegexp(xhtmlBlob, "<html xmlns=\"http://www.w3.org/1999/xhtml\">", "<head>", "<meta name=", "<title>", "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities", "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 7 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TikaEngineTest method testContentTypeDetection.

@Test
public void testContentTypeDetection() throws EngineException, IOException {
    log.info(">>> testContentTypeDetection <<<");
    ContentItem ci = createContentItem("test.pdf", OCTET_STREAM.toString());
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(contentPart);
    Blob plainTextBlob = contentPart.getValue();
    assertNotNull(plainTextBlob);
    assertContentRegexp(plainTextBlob, "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities");
    //validate XHTML results
    contentPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(contentPart);
    Blob xhtmlBlob = contentPart.getValue();
    assertNotNull(xhtmlBlob);
    assertContentRegexp(xhtmlBlob, "<html xmlns=\"http://www.w3.org/1999/xhtml\">", "<head>", "<meta name=", "<div class=\"page\">", "The Apache Stanbol Enhancer", "The Stanbol enhancer can detect famous cities", "</body></html>");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 8 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TikaEngineTest method testUnsupported.

@Test
public void testUnsupported() throws EngineException, IOException {
    log.info(">>> testUnsupported <<<");
    ContentItem ci = createContentItem("test.pages", "application/x-iwork-pages-sffpages");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    //it MUST NOT give an error but also not add a content part
    assertNull(contentPart);
    //only the original content
    assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 9 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class TopicClassificationEngine method computeEnhancements.

@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with a supported Mime Type" + "found for ContentItem " + ci.getUri() + "(supported: '" + SUPPORTED_MIMETYPES + "') -> this indicates that canEnhance was" + "NOT called and indicates a bug in the used EnhancementJobManager!");
    }
    String language = EnhancementEngineHelper.getLanguage(ci);
    if (!(acceptedLanguageSet.isEmpty() || acceptedLanguageSet.contains(language) || acceptedLanguageSet.contains(""))) {
        throw new IllegalStateException("The language '" + language + "' of the ContentItem is not configured as " + " active for this Engine (active: " + acceptedLanguageSet + ").");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(String.format("Unable to extract " + " textual content from ContentPart %s of ContentItem %s!", contentPart.getKey(), ci.getUri()), e);
    }
    if (text.trim().isEmpty()) {
        log.warn("ContentPart {} of ContentItem {} does not contain any " + "text to extract topics from", contentPart.getKey(), ci.getUri());
        return;
    }
    Graph metadata = ci.getMetadata();
    List<TopicSuggestion> topics;
    try {
        topics = suggestTopics(text);
        if (topics.isEmpty()) {
            return;
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    }
    IRI precision = new IRI(NamespaceEnum.fise + "classifier/precision");
    IRI recall = new IRI(NamespaceEnum.fise + "classifier/recall");
    IRI f1 = new IRI(NamespaceEnum.fise + "classifier/f1");
    LiteralFactory lf = LiteralFactory.getInstance();
    ci.getLock().writeLock().lock();
    try {
        // Global text annotation to attach all the topic annotation to it.
        IRI textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
        metadata.add(new TripleImpl(textAnnotation, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE, OntologicalClasses.SKOS_CONCEPT));
        for (TopicSuggestion topic : topics) {
            IRI enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this);
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, TechnicalClasses.ENHANCER_TOPICANNOTATION));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION, textAnnotation));
            // add link to entity
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, new IRI(topic.conceptUri)));
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE, OntologicalClasses.SKOS_CONCEPT));
            // add confidence information
            metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE, lf.createTypedLiteral(Double.valueOf(topic.score))));
            // add performance estimates of the classifier if available
            ClassificationReport perf = getPerformanceEstimates(topic.conceptUri);
            if (perf.uptodate) {
                metadata.add(new TripleImpl(enhancement, precision, lf.createTypedLiteral(Double.valueOf(perf.precision))));
                metadata.add(new TripleImpl(enhancement, recall, lf.createTypedLiteral(Double.valueOf(perf.recall))));
                metadata.add(new TripleImpl(enhancement, f1, lf.createTypedLiteral(Double.valueOf(perf.f1))));
            }
            // fetch concept label from the entityhub or a referenced site if available
            Entity entity = entityhub.getEntity(topic.conceptUri);
            if (entity == null) {
                entity = referencedSiteManager.getEntity(topic.conceptUri);
            }
            if (entity != null) {
                Representation representation = entity.getRepresentation();
                // TODO: extract all languages based on some configuration instead of hardcoding English
                Text label = representation.getFirst(NamespaceEnum.skos + "prefLabel", "en", "en-US", "en-GB");
                if (label == null) {
                    label = representation.getFirst(NamespaceEnum.rdfs + "label", "en", "en-US", "en-GB");
                }
                if (label != null) {
                    metadata.add(new TripleImpl(enhancement, org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL, new PlainLiteralImpl(label.getText())));
                }
            }
        }
    } catch (ClassifierException e) {
        throw new EngineException(e);
    } catch (IllegalArgumentException e) {
        throw new EngineException(e);
    } catch (EntityhubException e) {
        throw new EngineException(e);
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Entity(org.apache.stanbol.entityhub.servicesapi.model.Entity) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Representation(org.apache.stanbol.entityhub.servicesapi.model.Representation) Text(org.apache.stanbol.entityhub.servicesapi.model.Text) IOException(java.io.IOException) TopicSuggestion(org.apache.stanbol.enhancer.topic.api.TopicSuggestion) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) Graph(org.apache.clerezza.commons.rdf.Graph) EntityhubException(org.apache.stanbol.entityhub.servicesapi.EntityhubException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ClassificationReport(org.apache.stanbol.enhancer.topic.api.ClassificationReport) ClassifierException(org.apache.stanbol.enhancer.topic.api.ClassifierException)

Example 10 with Blob

use of org.apache.stanbol.enhancer.servicesapi.Blob in project stanbol by apache.

the class RestfulLangidentEngine method computeEnhancements.

/**
     * Compute enhancements for supplied ContentItem. The results of the process
     * are expected to be stored in the metadata of the content item.
     * <p/>
     * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
     * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
     * <p/>
     * This method creates a new POSContentPart using {@link org.apache.stanbol.enhancer.engines.pos.api.POSTaggerHelper#createContentPart} from a text/plain part and
     * stores it as a new part in the content item. The metadata is not changed.
     *
     * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
     *          if the underlying process failed to work as
     *          expected
     */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    //get the plain text Blob
    Map.Entry<IRI, Blob> textBlob = getPlainText(this, ci, false);
    Blob blob = textBlob.getValue();
    //send the text to the server
    final HttpPost request = new HttpPost(serviceUrl);
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    //execute the request
    List<LangSuggestion> detected;
    try {
        detected = AccessController.doPrivileged(new PrivilegedExceptionAction<List<LangSuggestion>>() {

            public List<LangSuggestion> run() throws ClientProtocolException, IOException {
                return httpClient.execute(request, new LangIdentResponseHandler(ci, objectMapper));
            }
        });
    } catch (PrivilegedActionException pae) {
        Exception e = pae.getException();
        if (e instanceof ClientProtocolException) {
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
        } else if (e instanceof IOException) {
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful Language Identification Service at " + serviceUrl, e);
        } else {
            throw RuntimeException.class.cast(e);
        }
    }
    Graph metadata = ci.getMetadata();
    log.debug("Detected Languages for ContentItem {} and Blob {}");
    ci.getLock().writeLock().lock();
    try {
        //write TextAnnotations for the detected languages
        for (LangSuggestion suggestion : detected) {
            // add a hypothesis
            log.debug(" > {}@{}", suggestion.getLanguage(), suggestion.hasProbability() ? suggestion.getProbability() : "-,--");
            IRI textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
            metadata.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(suggestion.getLanguage())));
            metadata.add(new TripleImpl(textEnhancement, DC_TYPE, DCTERMS_LINGUISTIC_SYSTEM));
            if (suggestion.hasProbability()) {
                metadata.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(suggestion.getProbability())));
            }
        }
    } finally {
        ci.getLock().writeLock().unlock();
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) HttpPost(org.apache.http.client.methods.HttpPost) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PrivilegedActionException(java.security.PrivilegedActionException) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) URISyntaxException(java.net.URISyntaxException) ConfigurationException(org.osgi.service.cm.ConfigurationException) HttpException(org.apache.http.HttpException) ClientProtocolException(org.apache.http.client.ClientProtocolException) PrivilegedActionException(java.security.PrivilegedActionException) HttpResponseException(org.apache.http.client.HttpResponseException) IOException(java.io.IOException) InputStreamEntity(org.apache.http.entity.InputStreamEntity) ClientProtocolException(org.apache.http.client.ClientProtocolException) Graph(org.apache.clerezza.commons.rdf.Graph) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

Blob (org.apache.stanbol.enhancer.servicesapi.Blob)44 IRI (org.apache.clerezza.commons.rdf.IRI)36 Test (org.junit.Test)21 IOException (java.io.IOException)20 Graph (org.apache.clerezza.commons.rdf.Graph)17 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)15 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)15 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)15 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)14 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)11 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)11 SOAPException (javax.xml.soap.SOAPException)4 Language (org.apache.clerezza.commons.rdf.Language)4 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 MediaType (javax.ws.rs.core.MediaType)3 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)3 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)3 NoSuchPartException (org.apache.stanbol.enhancer.servicesapi.NoSuchPartException)3 StreamSource (org.apache.stanbol.enhancer.servicesapi.impl.StreamSource)3