Search in sources:

Example 31 with Blob

Use of org.apache.stanbol.enhancer.servicesapi.Blob in the Apache Stanbol project.

From the class CeliLemmatizerEnhancementEngine, method computeEnhancements:

/**
 * Computes lemmatization (or, if configured, complete morphological analysis)
 * enhancements for the plain-text content of the parsed {@link ContentItem}.
 * <p>
 * Both the language check and the presence of a plain-text content part are
 * already validated by {@code canEnhance(..)}; failing either check here
 * therefore indicates a bug in the EnhancementJobManager and raises an
 * {@link IllegalStateException}.
 *
 * @param ci the content item to enhance
 * @throws EngineException if reading the text fails (wrapped as
 *         {@link InvalidContentException})
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    String language = EnhancementEngineHelper.getLanguage(ci);
    if (!isLangSupported(language)) {
        // fixed: closing quote after the language value and message grammar
        throw new IllegalStateException("Call to computeEnhancement with unsupported language '"
                + language + "' for ContentItem " + ci.getUri()
                + ": This is also checked in the canEnhance method! -> This indicates a bug "
                + "in the implementation of the EnhancementJobManager!");
    }
    Entry<IRI, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
    if (contentPart == null) {
        throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE
                + "' found for ContentItem " + ci.getUri()
                + ": This is also checked in the canEnhance method! -> This indicates a bug "
                + "in the implementation of the EnhancementJobManager!");
    }
    String text;
    try {
        text = ContentItemHelper.getText(contentPart.getValue());
    } catch (IOException e) {
        throw new InvalidContentException(this, ci, e);
    }
    if (text.trim().isEmpty()) {
        // Nothing to analyse; use SLF4J parameterized logging instead of concatenation.
        log.info("No text contained in ContentPart {} of ContentItem {}",
                contentPart.getKey(), ci.getUri());
        return;
    }
    Graph graph = ci.getMetadata();
    // completeMorphoAnalysis is an engine configuration switch between the two modes
    if (this.completeMorphoAnalysis) {
        this.addMorphoAnalysisEnhancement(ci, text, language, graph);
    } else {
        this.addLemmatizationEnhancement(ci, text, language, graph);
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) InvalidContentException(org.apache.stanbol.enhancer.servicesapi.InvalidContentException) Graph(org.apache.clerezza.commons.rdf.Graph) IOException(java.io.IOException)

Example 32 with Blob

Use of org.apache.stanbol.enhancer.servicesapi.Blob in the Apache Stanbol project.

From the class TextAnnotationsNewModelEngine, method computeEnhancements:

/**
 * Computes the enhancements on the provided ContentItem.
 * <p>
 * Migrates existing fise:TextAnnotations to the new selection model: for every
 * fise:TextAnnotation missing fise:selection-prefix, fise:selection-suffix or
 * fise:selected-text, the missing triples are derived from the fise:start /
 * fise:end offsets and the plain-text blob of the item. All information is
 * collected under a read lock; the new triples are added under a write lock.
 *
 * @param contentItem the content item whose metadata is migrated
 * @throws EngineException if the plain-text blob cannot be read
 */
@Override
public void computeEnhancements(ContentItem contentItem) throws EngineException {
    // Only items with a content part in a supported (plain text) mime type are processed.
    Entry<IRI, Blob> textBlob = getBlob(contentItem, supportedMimeTypes);
    if (textBlob == null) {
        return;
    }
    String language = EnhancementEngineHelper.getLanguage(contentItem);
    // language may legitimately be unknown; literals are then created without a language tag
    Language lang = language == null ? null : new Language(language);
    String text;
    try {
        text = ContentItemHelper.getText(textBlob.getValue());
    } catch (IOException e) {
        throw new EngineException(this, contentItem, "Unable to read Plain Text Blob", e);
    }
    // Collect new triples here so they can be added in a single write-locked batch below.
    Set<Triple> addedTriples = new HashSet<Triple>();
    Graph metadata = contentItem.getMetadata();
    //extract all the necessary information within a read lock
    contentItem.getLock().readLock().lock();
    try {
        Iterator<Triple> it = metadata.filter(null, RDF_TYPE, ENHANCER_TEXTANNOTATION);
        while (it.hasNext()) {
            BlankNodeOrIRI ta = it.next().getSubject();
            boolean hasPrefix = metadata.filter(ta, ENHANCER_SELECTION_PREFIX, null).hasNext();
            boolean hasSuffix = metadata.filter(ta, ENHANCER_SELECTION_SUFFIX, null).hasNext();
            boolean hasSelected = metadata.filter(ta, ENHANCER_SELECTED_TEXT, null).hasNext();
            if (hasPrefix && hasSuffix && hasSelected) {
                //this TextAnnotation already uses the new model
                continue;
            }
            // start/end stay null when the corresponding triple is already present
            // or when the offsets are missing/invalid; null suppresses the additions below.
            Integer start;
            if (!hasPrefix) {
                start = EnhancementEngineHelper.get(metadata, ta, ENHANCER_START, Integer.class, lf);
                if (start == null) {
                    log.debug("unable to add fise:selection-prefix to TextAnnotation {} " + "because fise:start is not present", ta);
                } else if (start < 0) {
                    // negative start offsets are clamped to 0 (see warning)
                    log.warn("fise:start {} of TextAnnotation {} < 0! " + "Will not transform this TextAnnotation", start, ta);
                    start = 0;
                }
            } else {
                start = null;
            }
            Integer end;
            if (!hasSuffix) {
                end = EnhancementEngineHelper.get(metadata, ta, ENHANCER_END, Integer.class, lf);
                if (end == null) {
                    log.debug("unable to add fise:selection-suffix to TextAnnotation {} " + "because fise:end is not present", ta);
                } else if (end > text.length()) {
                    // end beyond the content length: invalidate end only
                    log.warn("fise:end {} of TextAnnotation {} > as the content length {}! " + "Will not transform this TextAnnotation", end, ta, text.length());
                    end = null;
                } else if (start != null && end < start) {
                    // inconsistent offsets: invalidate both so nothing is added
                    log.warn("fise:end {} < fise:start {} of TextAnnotation {}! " + "Will not transform this TextAnnotation", end, start, ta);
                    end = null;
                    start = null;
                }
            } else {
                end = null;
            }
            // prefix: up to prefixSuffixSize chars directly before the selection
            if (!hasPrefix && start != null) {
                addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_PREFIX, new PlainLiteralImpl(text.substring(Math.max(0, start - prefixSuffixSize), start), lang)));
            }
            // suffix: up to prefixSuffixSize chars directly after the selection
            if (!hasSuffix && end != null) {
                addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_SUFFIX, new PlainLiteralImpl(text.substring(end, Math.min(text.length(), end + prefixSuffixSize)), lang)));
            }
            if (!hasSelected && start != null && end != null) {
                //This adds missing fise:selected or fise:head/fise:tail if the selected text is to long
                int length = end - start;
                if (length > 3 * prefixSuffixSize) {
                    // selection too long for fise:selected-text: store head/tail excerpts instead
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_HEAD, new PlainLiteralImpl(text.substring(start, start + prefixSuffixSize), lang)));
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_TAIL, new PlainLiteralImpl(text.substring(end - prefixSuffixSize, end), lang)));
                } else {
                    //add missing fise:selected
                    String selection = text.substring(start, end);
                    addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(selection, lang)));
                    //check if we should also add an selection context
                    if (!metadata.filter(ta, ENHANCER_SELECTION_CONTEXT, null).hasNext()) {
                        addedTriples.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(EnhancementEngineHelper.getSelectionContext(text, selection, start), lang)));
                    }
                }
            }
        }
    } finally {
        contentItem.getLock().readLock().unlock();
    }
    //finally write the prefix/suffix triples within a write lock
    if (!addedTriples.isEmpty()) {
        contentItem.getLock().writeLock().lock();
        try {
            metadata.addAll(addedTriples);
        } finally {
            contentItem.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) ContentItemHelper.getBlob(org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper.getBlob) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) IOException(java.io.IOException) Triple(org.apache.clerezza.commons.rdf.Triple) Graph(org.apache.clerezza.commons.rdf.Graph) Language(org.apache.clerezza.commons.rdf.Language) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) HashSet(java.util.HashSet)

Example 33 with Blob

Use of org.apache.stanbol.enhancer.servicesapi.Blob in the Apache Stanbol project.

From the class RestfulNlpAnalysisEngine, method computeEnhancements:

/**
 * Compute enhancements for supplied ContentItem. The results of the process
 * are expected to be stored in the metadata of the content item.
 * <p/>
 * The client (usually an {@link org.apache.stanbol.enhancer.servicesapi.EnhancementJobManager}) should take care of
 * persistent storage of the enhanced {@link org.apache.stanbol.enhancer.servicesapi.ContentItem}.
 * <p/>
 * This method sends the plain-text blob of the item to the configured RESTful
 * NLP analysis service and merges the returned analysis into the item's
 * {@link AnalysedText}. If {@code writeTextAnnotations} is enabled it also
 * creates fise:TextAnnotations for Named Entities and sentiment values found
 * in the analysed text, plus one aggregated document-sentiment annotation.
 *
 * @throws org.apache.stanbol.enhancer.servicesapi.EngineException
 *          if the underlying process failed to work as
 *          expected
 */
@Override
public void computeEnhancements(final ContentItem ci) throws EngineException {
    //validate that the service is active
    checkRESTfulNlpAnalysisService();
    //get/create the AnalysedText
    final AnalysedText at = NlpEngineHelper.initAnalysedText(this, analysedTextFactory, ci);
    final Blob blob = at.getBlob();
    //send the text to the server
    final String language = getLanguage(this, ci, true);
    final HttpPost request = new HttpPost(analysisServiceUrl);
    request.addHeader(HttpHeaders.CONTENT_LANGUAGE, language);
    // Stream the blob as the request entity, preserving mime type and charset.
    request.setEntity(new InputStreamEntity(blob.getStream(), blob.getContentLength(), ContentType.create(blob.getMimeType(), blob.getParameter().get("charset"))));
    //execute the request inside a privileged action (OSGi security)
    try {
        AccessController.doPrivileged(new PrivilegedExceptionAction<AnalysedText>() {

            public AnalysedText run() throws ClientProtocolException, IOException {
                // the response handler parses the analysis results into 'at'
                return httpClient.execute(request, new AnalysisResponseHandler(at));
            }
        });
    } catch (PrivilegedActionException pae) {
        // unwrap the checked exception thrown inside the privileged action
        Exception e = pae.getException();
        if (e instanceof ClientProtocolException) {
            //force re-initialisation upon error
            setRESTfulNlpAnalysisServiceUnavailable();
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful NLP Analysis Service at " + analysisServiceUrl, e);
        } else if (e instanceof IOException) {
            //force re-initialisation upon error
            setRESTfulNlpAnalysisServiceUnavailable();
            throw new EngineException(this, ci, "Exception while executing Request " + "on RESTful NLP Analysis Service at " + analysisServiceUrl, e);
        } else {
            // only unchecked exceptions can remain; rethrow as RuntimeException
            throw RuntimeException.class.cast(e);
        }
    }
    if (writeTextAnnotations) {
        //if enabled fise:TextAnnotations are created for Named Entities and Sentiments
        // running totals used for the aggregated document sentiment annotation
        double positiveSent = 0.0;
        int positiveCount = 0;
        double negativeSent = 0.0;
        int negativeCount = 0;
        int sentimentCount = 0;
        Iterator<Span> spans = at.getEnclosed(EnumSet.of(SpanTypeEnum.Sentence, SpanTypeEnum.Chunk));
        // 'context' tracks the sentence enclosing the current span (may stay null)
        Sentence context = null;
        Graph metadata = ci.getMetadata();
        Language lang = new Language(language);
        LiteralFactory lf = LiteralFactory.getInstance();
        ci.getLock().writeLock().lock();
        try {
            //write TextAnnotations for Named Entities
            while (spans.hasNext()) {
                Span span = spans.next();
                switch(span.getType()) {
                    case Sentence:
                        // remember the sentence as selection context for nested spans
                        context = (Sentence) span;
                    //FALLThrough intended!!
                    default:
                        Value<NerTag> nerAnno = span.getAnnotation(NER_ANNOTATION);
                        if (nerAnno != null) {
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            //add span related data
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(span.getSpan(), lang)));
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            // fall back to a computed context window when no sentence encloses the span
                            metadata.add(new TripleImpl(ta, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(context == null ? getDefaultSelectionContext(at.getSpan(), span.getSpan(), span.getStart()) : context.getSpan(), lang)));
                            //add the NER type
                            if (nerAnno.value().getType() != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, nerAnno.value().getType()));
                            }
                            // probability < 0 means "unknown" and is not written
                            if (nerAnno.probability() >= 0) {
                                metadata.add(new TripleImpl(ta, ENHANCER_CONFIDENCE, lf.createTypedLiteral(nerAnno.probability())));
                            }
                        }
                        Value<Double> sentimentAnnotation = span.getAnnotation(SENTIMENT_ANNOTATION);
                        if (sentimentAnnotation != null) {
                            //this span has a sentiment assigned
                            Double sentiment = sentimentAnnotation.value();
                            //Create a fise:TextAnnotation for the sentiment
                            IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                            metadata.add(new TripleImpl(ta, ENHANCER_START, lf.createTypedLiteral(span.getStart())));
                            metadata.add(new TripleImpl(ta, ENHANCER_END, lf.createTypedLiteral(span.getEnd())));
                            metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY, lf.createTypedLiteral(sentiment)));
                            //add the generic dc:type used for all Sentiment annotation
                            metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                            //determine the specific dc:type for the sentiment annotation
                            IRI ssoType = NIFHelper.SPAN_TYPE_TO_SSO_TYPE.get(span.getType());
                            if (ssoType != null) {
                                metadata.add(new TripleImpl(ta, DC_TYPE, ssoType));
                            }
                            //keep statistics for the overall sentiment for the Document
                            sentimentCount++;
                            if (sentiment > 0) {
                                positiveSent += sentiment;
                                positiveCount++;
                            } else if (sentiment < 0) {
                                negativeSent += sentiment;
                                negativeCount++;
                            }
                        }
                        break;
                }
            }
            //Add the annotation for the overall sentiment of the document 
            if (sentimentCount > 0) {
                IRI ta = EnhancementEngineHelper.createTextEnhancement(ci, this);
                //calculate the average sentiment for a document
                //TODO: Think on a better way to calculate a general sentiment value for a document.
                metadata.add(new TripleImpl(ta, SENTIMENT_PROPERTY, lf.createTypedLiteral((positiveSent + negativeSent) / sentimentCount)));
                if (positiveCount > 0) {
                    //average positive sentiment calculation for the document
                    metadata.add(new TripleImpl(ta, POSITIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(positiveSent / positiveCount)));
                }
                if (negativeCount > 0) {
                    //average negative sentiment calculation for the document
                    metadata.add(new TripleImpl(ta, NEGATIVE_SENTIMENT_PROPERTY, lf.createTypedLiteral(negativeSent / negativeCount)));
                }
                metadata.add(new TripleImpl(ta, DC_TYPE, SENTIMENT_TYPE));
                metadata.add(new TripleImpl(ta, DC_TYPE, DOCUMENT_SENTIMENT_TYPE));
            }
        // no sentiment annotation present ... nothing to do
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
//else do not write fise:TextAnnotations
}
Also used : HttpPost(org.apache.http.client.methods.HttpPost) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag) IRI(org.apache.clerezza.commons.rdf.IRI) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) Span(org.apache.stanbol.enhancer.nlp.model.Span) ClientProtocolException(org.apache.http.client.ClientProtocolException) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) Language(org.apache.clerezza.commons.rdf.Language) NlpEngineHelper.getLanguage(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getLanguage) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) Sentence(org.apache.stanbol.enhancer.nlp.model.Sentence) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) PrivilegedActionException(java.security.PrivilegedActionException) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) IOException(java.io.IOException) HttpException(org.apache.http.HttpException) ClientProtocolException(org.apache.http.client.ClientProtocolException) PrivilegedActionException(java.security.PrivilegedActionException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) URISyntaxException(java.net.URISyntaxException) ConfigurationException(org.osgi.service.cm.ConfigurationException) HttpResponseException(org.apache.http.client.HttpResponseException) InputStreamEntity(org.apache.http.entity.InputStreamEntity) LiteralFactory(org.apache.clerezza.rdf.core.LiteralFactory) Graph(org.apache.clerezza.commons.rdf.Graph)

Example 34 with Blob

Use of org.apache.stanbol.enhancer.servicesapi.Blob in the Apache Stanbol project.

From the class TikaEngineTest, method testEMail:

@Test
public void testEMail() throws EngineException, IOException, ParseException {
    log.info(">>> testEMail <<<");
    // Run the Tika engine on an RFC822 e-mail message.
    ContentItem ci = createContentItem("test.email.txt", "message/rfc822");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    // A plain-text content part must have been extracted.
    Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(textPart);
    Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob,
            "Julien Nioche commented on TIKA-461:",
            "I'll have a look at mime4j and try to use it in Tika",
            "> RFC822 messages not parsed",
            "Key: TIKA-461",
            "URL: https://issues.apache.org/jira/browse/TIKA-461");
    // An XHTML content part must have been extracted as well.
    Entry<IRI, Blob> xhtmlPart = ContentItemHelper.getBlob(ci, singleton("application/xhtml+xml"));
    assertNotNull(xhtmlPart);
    Blob xhtmlBlob = xhtmlPart.getValue();
    assertNotNull(xhtmlBlob);
    assertContentRegexp(xhtmlBlob,
            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
            "<title>\\[jira\\] Commented: \\(TIKA-461\\) RFC822 messages not parsed</title>",
            "<body><p>",
            "Julien Nioche commented on TIKA-461:",
            "I'll have a look at mime4j and try to use it in Tika",
            "&gt; RFC822 messages not parsed",
            "Key: TIKA-461",
            "URL: https://issues.apache.org/jira/browse/TIKA-461");
    // Now check the extracted metadata.
    String creator = "Julien Nioche (JIRA) <jira@apache.org>";
    //DC
    //STANBOL-757: dc:date no longer added by Tika 1.2 (dc:created is still present)
    //verifyValue(ci, new IRI(NamespaceEnum.dc+"date"), XSD.dateTime,"2010-09-06T09:25:34Z");
    verifyValue(ci, new IRI(NamespaceEnum.dc + "format"), null, "message/rfc822");
    //STANBOL-757: dc:subject no longer added by Tika1.2 (dc:title is used instead)
    //verifyValue(ci, new IRI(NamespaceEnum.dc+"subject"), null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
    verifyValue(ci, new IRI(NamespaceEnum.dc + "title"), null, "[jira] Commented: (TIKA-461) RFC822 messages not parsed");
    verifyValue(ci, new IRI(NamespaceEnum.dc + "creator"), null, creator);
    verifyValue(ci, new IRI(NamespaceEnum.dc + "created"), XSD.dateTime, "2010-09-06T09:25:34Z");
    //Media Ontology
    verifyValue(ci, new IRI(NamespaceEnum.media + "creationDate"), XSD.dateTime, "2010-09-06T09:25:34Z");
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasFormat"), null, "message/rfc822");
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasCreator"), null, creator);
    verifyValue(ci, new IRI(NamespaceEnum.media + "hasContributor"), null, creator);
    //STANBOL-757: This was present with Tika 1.1 because its mapping from dc:subject 
    //        verifyValue(ci, new IRI(NamespaceEnum.media+"hasKeyword"),null,"[jira] Commented: (TIKA-461) RFC822 messages not parsed");
    //Nepomuk Message
    String nmo = "http://www.semanticdesktop.org/ontologies/2007/03/22/nmo#";
    verifyValue(ci, new IRI(nmo + "from"), null, creator);
    verifyValue(ci, new IRI(nmo + "to"), null, "dev@tika.apache.org");
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 35 with Blob

Use of org.apache.stanbol.enhancer.servicesapi.Blob in the Apache Stanbol project.

From the class TikaEngineTest, method testXhtml:

@Test
public void testXhtml() throws EngineException, IOException {
    log.info(">>> testXhtml <<<");
    // Run the Tika engine on an XHTML document.
    ContentItem ci = createContentItem("test.xhtml", XHTML.toString() + "; charset=UTF-8");
    assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
    engine.computeEnhancements(ci);
    // A plain-text part with the expected phrases must be present.
    Entry<IRI, Blob> textPart = ContentItemHelper.getBlob(ci, singleton("text/plain"));
    assertNotNull(textPart);
    Blob textBlob = textPart.getValue();
    assertNotNull(textBlob);
    assertContentRegexp(textBlob,
            "The Apache Stanbol Enhancer",
            "The Stanbol enhancer can detect famous cities");
    // Exactly two parts: the original plus the extracted plain text.
    // This asserts that no XHTML part is derived from XHTML input.
    assertEquals(2, ContentItemHelper.getContentParts(ci, Blob.class).size());
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) Blob(org.apache.stanbol.enhancer.servicesapi.Blob) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Aggregations

Blob (org.apache.stanbol.enhancer.servicesapi.Blob)44 IRI (org.apache.clerezza.commons.rdf.IRI)36 Test (org.junit.Test)21 IOException (java.io.IOException)20 Graph (org.apache.clerezza.commons.rdf.Graph)17 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)15 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)15 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)15 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)14 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)11 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)11 SOAPException (javax.xml.soap.SOAPException)4 Language (org.apache.clerezza.commons.rdf.Language)4 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 MediaType (javax.ws.rs.core.MediaType)3 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)3 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)3 NoSuchPartException (org.apache.stanbol.enhancer.servicesapi.NoSuchPartException)3 StreamSource (org.apache.stanbol.enhancer.servicesapi.impl.StreamSource)3