Search in sources :

Example 31 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class XmpExtractorEngine, method computeEnhancements.

/**
 * Scans the content of the parsed {@link ContentItem} for an embedded XMP
 * packet. If one is found it is parsed as RDF/XML, the fake relative base
 * URI used inside the packet is replaced by the URI of the ContentItem and
 * the resulting triples are added to the ContentItem metadata (under a
 * write lock).
 *
 * @param ci the content item to enhance
 * @throws EngineException if reading/scanning the content fails
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    InputStream in = ci.getBlob().getStream();
    XMPPacketScanner scanner = new XMPPacketScanner();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try {
        scanner.parse(in, baos);
    } catch (IOException e) {
        throw new EngineException(e);
    } finally {
        // FIX: the blob stream was never closed (resource leak)
        try {
            in.close();
        } catch (IOException ignored) {
            // best-effort close; any scanned packet is already buffered
        }
    }
    byte[] bytes = baos.toByteArray();
    if (bytes.length > 0) {
        Graph model = new IndexedGraph();
        // the XMP packet content is RDF/XML
        parser.parse(model, new ByteArrayInputStream(bytes), "application/rdf+xml");
        // XMP packets use a relative base URI; rewrite it to the item URI
        GraphNode gn = new GraphNode(new IRI("http://relative-uri.fake/"), model);
        gn.replaceWith(ci.getUri());
        ci.getLock().writeLock().lock();
        try {
            LOG.info("Model: {}", model);
            ci.getMetadata().addAll(model);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) XMPPacketScanner(org.apache.tika.parser.image.xmp.XMPPacketScanner) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) GraphNode(org.apache.clerezza.rdf.utils.GraphNode) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException) IndexedGraph(org.apache.stanbol.commons.indexedgraph.IndexedGraph)

Example 32 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class OpenCalaisEngine, method getCalaisAnalysis.

/**
 * Retrieves the annotations from OpenCalais as RDF/XML. From that a Graph is created.
 *
 * @param text the text to send to OpenCalais
 * @param mimeType the MIME type of the parsed text; {@code "text/plain"} is
 *        mapped to the OpenCalais specific {@code "text/raw"}
 *
 * @return a Graph with all annotations
 *
 * @throws EngineException if the request to OpenCalais or parsing its
 *         response fails
 */
public Graph getCalaisAnalysis(String text, String mimeType) throws EngineException {
    if (mimeType.equals("text/plain")) {
        mimeType = "text/raw";
    }
    String calaisParams = "<c:params xmlns:c=\"http://s.opencalais.com/1/pred/\" xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\">" + "<c:processingDirectives c:contentType=\"" + mimeType + "\" " + // "c:enableMetadataType=\"GenericRelations\" "+
    "c:outputFormat=\"rdf/xml\" " + // NOTE (rw, 2012-05-29) changed to true while working on STANBOL-630
    "c:calculateRelevanceScore=\"true\" " + "c:omitOutputtingOriginalText=\"true\"" + ">" + "</c:processingDirectives>" + "</c:params>";
    Graph model = null;
    try {
        StringBuilder postParams = new StringBuilder();
        postParams.append("licenseID=").append(URLEncoder.encode(getLicenseKey(), "UTF-8")).append("&content=").append(URLEncoder.encode(text, "UTF-8")).append("&paramsXML=").append(URLEncoder.encode(calaisParams, "UTF-8"));
        // get annotations from Calais
        log.info("Calais request sent");
        String calaisResult = doPostRequest(this.getCalaisUrl(), null, postParams.toString(), "application/x-www-form-urlencoded", "UTF-8");
        log.info("Calais response received: {}", calaisResult.length());
        // FIX: the full response body was previously also logged at INFO
        // level; the complete payload is now only logged at DEBUG
        log.debug("Calais data:\n{}", calaisResult);
        // build model from Calais result
        InputStream in = new ByteArrayInputStream(calaisResult.getBytes("utf-8"));
        model = readModel(in, "application/rdf+xml");
    } catch (IOException e) {
        // FIX: UnsupportedEncodingException is a subclass of IOException, so
        // a single catch clause covers both former (identical) branches
        throw new EngineException(e.getMessage(), e);
    }
    return model;
}
Also used : ImmutableGraph(org.apache.clerezza.commons.rdf.ImmutableGraph) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) ByteArrayInputStream(java.io.ByteArrayInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException)

Example 33 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class FstLinkingEngine, method computeEnhancements.

/**
 * Tags and links entities in the content of the parsed {@link ContentItem}
 * using the FST corpus for the detected language and (optionally) a default
 * matching corpus. Collected tags are matched against entities and the
 * resulting enhancements are written to the ContentItem metadata under a
 * write lock.
 * <p>
 * NOTE(review): tagging and matching run WITHOUT holding the ContentItem
 * lock; only the final {@code writeEnhancements(..)} call acquires the
 * write lock.
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    AnalysedText at;
    if (linkingMode != LinkingModeEnum.PLAIN) {
        // require AnalysedText contentPart
        at = getAnalysedText(this, ci, true);
    } else {
        // AnalysedText is optional in this mode (original comment said
        // LinkingModeEnum.BASIC although the check is for PLAIN — presumably
        // a stale mode name; verify against the enum history)
        try {
            at = AnalysedTextUtils.getAnalysedText(ci);
        } catch (ClassCastException e) {
            // unexpected contentPart found under the URI expecting the AnalysedText
            at = null;
        }
    }
    final String content;
    if (at != null) {
        // we can get the content from the Analyzed text
        content = at.getSpan();
    } else {
        // no analyzed text ... read is from the text/plain blob
        try {
            content = ContentItemHelper.getText(NlpEngineHelper.getPlainText(this, ci, true).getValue());
        } catch (IOException e) {
            throw new EngineException(this, ci, "Unable to access plain/text content!", e);
        }
    }
    log.debug("  > AnalysedText {}", at);
    String language = getLanguage(this, ci, true);
    log.debug("  > Language {}", language);
    if (log.isDebugEnabled()) {
        log.debug("computeEnhancements for ContentItem {} language {} text={}", new Object[] { ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(content, 100) });
    }
    // TODO: we need to do the same for the the default matching language
    TaggingSession session;
    try {
        session = TaggingSession.createSession(indexConfig, language);
    } catch (CorpusException e) {
        throw new EngineException(this, ci, e);
    }
    if (!session.hasCorpus()) {
        // no corpus available for processing the request
        return;
    }
    long taggingStart = System.currentTimeMillis();
    // tags sorted by their [start,end] span within the content
    final NavigableMap<int[], Tag> tags = new TreeMap<int[], Tag>(Tag.SPAN_COMPARATOR);
    try {
        // process the language of the document
        Corpus corpus = null;
        if (session.getLanguageCorpus() != null) {
            corpus = session.getLanguageCorpus();
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, corpus, tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { corpus.getIndexedField(), System.currentTimeMillis() - t, d });
        }
        // additionally tag with the default corpus (if configured); tags
        // from both corpora accumulate in the same map
        if (session.getDefaultCorpus() != null) {
            if (corpus == null) {
                corpus = session.getDefaultCorpus();
            }
            long t = System.currentTimeMillis();
            int d = tag(content, at, session, session.getDefaultCorpus(), tags);
            log.info(" - {}: fst: {}ms (callback: {}ms)", new Object[] { session.getDefaultCorpus().getIndexedField(), System.currentTimeMillis() - t, d });
        }
        long taggingEnd = System.currentTimeMillis();
        // corpus still null means neither a language nor a default corpus
        // was available for this request
        if (corpus == null) {
            throw new EngineException(this, ci, "No FST corpus found to process contentItem " + "language '" + session.getLanguage() + "'!", null);
        } else {
            if (session.getLanguageCorpus() != null && session.getDefaultCorpus() != null) {
                log.info(" - sum fst: {} ms", taggingEnd - taggingStart);
            }
        }
        // resolve the collected tags to entity matches (per the debug log
        // below this loads/caches entity documents via the session)
        int matches = match(content, tags.values(), session.entityMentionTypes);
        log.debug(" - loaded {} ({} loaded, {} cached, {} appended) Matches in {} ms", new Object[] { matches, session.getSessionDocLoaded(), session.getSessionDocCached(), session.getSessionDocAppended(), System.currentTimeMillis() - taggingEnd });
        if (log.isDebugEnabled() && session.getDocumentCache() != null) {
            log.debug("EntityCache Statistics: {}", session.getDocumentCache().printStatistics());
        }
    } catch (IOException e) {
        throw new EngineException(this, ci, e);
    } finally {
        // always release the tagging session, also on errors
        session.close();
    }
    if (log.isTraceEnabled()) {
        log.trace("Tagged Entities:");
        for (Tag tag : tags.values()) {
            log.trace("[{},{}]: {}", new Object[] { tag.getStart(), tag.getEnd(), tag.getMatches() });
        }
    }
    // write the enhancements under the ContentItem write lock
    ci.getLock().writeLock().lock();
    try {
        writeEnhancements(ci, content, tags.values(), language, elConfig.isWriteEntityRankings());
    } finally {
        ci.getLock().writeLock().unlock();
    }
    // help the GC
    tags.clear();
}
Also used : EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) IOException(java.io.IOException) TreeMap(java.util.TreeMap) Corpus(org.apache.stanbol.enhancer.engines.lucenefstlinking.TaggingSession.Corpus) NlpEngineHelper.getAnalysedText(org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText) AnalysedText(org.apache.stanbol.enhancer.nlp.model.AnalysedText) NerTag(org.apache.stanbol.enhancer.nlp.ner.NerTag)

Example 34 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class TestOpenCalaisEngine, method testCalaisConnection.

/**
 * Integration test against the live OpenCalais service: sends the test
 * text, queries the returned model for entity occurrences and asserts
 * that at least one entity was found. Skipped when no license key is
 * configured; tolerates a temporarily unavailable remote service.
 */
@Test
public void testCalaisConnection() throws IOException, EngineException {
    // without a configured license key there is nothing to test
    Assume.assumeNotNull(calaisExtractor.getLicenseKey());
    ContentItem contentItem = wrapAsContentItem(TEST_TEXT);
    // the engine expects the language of the content to be annotated
    contentItem.getMetadata().add(new TripleImpl(contentItem.getUri(), Properties.DC_LANGUAGE, LiteralFactory.getInstance().createTypedLiteral("en")));
    Graph analysisModel;
    try {
        analysisModel = calaisExtractor.getCalaisAnalysis(TEST_TEXT, "text/plain");
    } catch (EngineException e) {
        // do not fail the build when the remote service is down
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    Assert.assertNotNull("No model", analysisModel);
    Collection<CalaisEntityOccurrence> occurrences;
    try {
        occurrences = calaisExtractor.queryModel(analysisModel);
    } catch (EngineException e) {
        // same tolerance for failures while querying the model
        RemoteServiceHelper.checkServiceUnavailable(e);
        return;
    }
    LOG.info("Found entities: {}", occurrences.size());
    LOG.debug("Entities:\n{}", occurrences);
    Assert.assertFalse("No entities found!", occurrences.isEmpty());
}
Also used : Graph(org.apache.clerezza.commons.rdf.Graph) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) ContentItem(org.apache.stanbol.enhancer.servicesapi.ContentItem) Test(org.junit.Test)

Example 35 with EngineException

use of org.apache.stanbol.enhancer.servicesapi.EngineException in project stanbol by apache.

The class MetaxaEngine, method computeEnhancements.

/**
 * Extracts metadata and plain text from the parsed ContentItem using
 * Metaxa/Aperture. Extracted RDF statements are converted to Clerezza
 * triples and added to the ContentItem metadata; extracted plain text (if
 * any) is registered as an additional "text/plain" content part.
 *
 * @param ci the content item to process
 * @throws EngineException if the Metaxa extraction fails or the plain text
 *         Blob cannot be created/written
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get model from the extraction
    URIImpl docId;
    Model m = null;
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // the extracted plain text from the model
    if (null == m) {
        // FIX: typo "preocess" -> "process" in the log message
        log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close();
        // FIX: missing space between the concatenated message parts
        throw new EngineException("Unable to initialise Blob for storing " + "the plain text content", e);
    }
    // maps RDF2Go blank nodes to their Clerezza counterparts so that a
    // blank node keeps its identity across statements
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    RDF2GoUtils.urifyBlankNodes(m);
    ClosableIterator<Statement> it = m.iterator();
    BufferedWriter out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
    // used to detect if some text was extracted
    boolean textExtracted = false;
    try {
        // first add to a temporary graph
        Graph g = new SimpleGraph();
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            // the plain text Blob!
            if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        // FIX: reuse the already computed 'text' instead of
                        // calling getObject().toString() a second time
                        out.write(text);
                    } catch (IOException e) {
                        // FIX: missing space between the concatenated message parts
                        throw new EngineException("Unable to write extracted " + "plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                // add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added " + t.toString());
                }
            }
        }
        // add the extracted triples to the metadata of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        // always release iterator, model and writer, also on errors
        it.close();
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        // add plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNode(org.apache.clerezza.commons.rdf.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) Triple(org.apache.clerezza.commons.rdf.Triple) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) Model(org.ontoware.rdf2go.model.Model) ExtractorException(org.semanticdesktop.aperture.extractor.ExtractorException) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) OutputStreamWriter(java.io.OutputStreamWriter) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Aggregations

EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)55 IRI (org.apache.clerezza.commons.rdf.IRI)37 IOException (java.io.IOException)33 Graph (org.apache.clerezza.commons.rdf.Graph)24 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)23 PlainLiteralImpl (org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl)20 AnalysedText (org.apache.stanbol.enhancer.nlp.model.AnalysedText)15 Blob (org.apache.stanbol.enhancer.servicesapi.Blob)15 HashMap (java.util.HashMap)13 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)13 ContentItem (org.apache.stanbol.enhancer.servicesapi.ContentItem)12 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)10 InvalidContentException (org.apache.stanbol.enhancer.servicesapi.InvalidContentException)10 Test (org.junit.Test)10 Triple (org.apache.clerezza.commons.rdf.Triple)9 InputStream (java.io.InputStream)8 SOAPException (javax.xml.soap.SOAPException)8 Token (org.apache.stanbol.enhancer.nlp.model.Token)8 Language (org.apache.clerezza.commons.rdf.Language)7 LiteralFactory (org.apache.clerezza.rdf.core.LiteralFactory)7