Search in sources :

Example 1 with Statement

use of org.ontoware.rdf2go.model.Statement in project stanbol by apache.

the class TestMetaxaCore method printTriples.

/**
     * Logs, at debug level, the Stanbol Enhancer triples that would be created for the
     * metadata contained in the given model. No triple is stored anywhere: each statement
     * is converted to its Clerezza representation, logged and counted.
     *
     * @param m a {@link Model} whose statements are converted
     *
     * @return an {@code int} with the number of statements for which subject, predicate
     *         and object could all be converted (i.e. the number of triples logged as
     *         "adding")
     */
private int printTriples(Model m) {
    int tripleCounter = 0;
    // shared across all statements so that the same source blank node is always
    // passed the same mapping during conversion
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    ClosableIterator<Statement> it = m.iterator();
    try {
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            BlankNodeOrIRI subject = (BlankNodeOrIRI) MetaxaEngine.asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
            IRI predicate = (IRI) MetaxaEngine.asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
            RDFTerm object = MetaxaEngine.asClerezzaResource(oneStmt.getObject(), blankNodeMap);
            if (null != subject && null != predicate && null != object) {
                Triple t = new TripleImpl(subject, predicate, object);
                LOG.debug("adding " + t);
                tripleCounter++;
            } else {
                // at least one node could not be converted
                LOG.debug("skipped " + oneStmt.toString());
            }
        }
    } finally {
        // release the iterator even if a conversion or cast fails mid-way
        it.close();
    }
    return tripleCounter;
}
Also used : Triple(org.apache.clerezza.commons.rdf.Triple) IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) BlankNode(org.apache.clerezza.commons.rdf.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 2 with Statement

use of org.ontoware.rdf2go.model.Statement in project stanbol by apache.

the class RDF2GoUtils method urifyBlankNodes.

/**
 * Replaces every blank node that occurs as subject or object of a statement in the
 * given model with a newly generated URI. The same blank node is always replaced by
 * the same URI within one call.
 * <p>
 * The changes are collected in two temporary models (statements to add and statements
 * to remove) and applied to {@code model} in a single {@code update} call after the
 * model has been fully traversed, so the model is not modified while it is iterated.
 *
 * @param model the model to rewrite in place
 */
public static void urifyBlankNodes(Model model) {
    HashMap<BlankNode, URI> nodeMap = new HashMap<BlankNode, URI>();
    Model add = RDF2Go.getModelFactory().createModel();
    add.open();
    try {
        Model remove = RDF2Go.getModelFactory().createModel();
        remove.open();
        try {
            // iterate explicitly (instead of for-each) so the iterator can be closed
            // even when the traversal is aborted by an exception
            ClosableIterator<Statement> stmts = model.iterator();
            try {
                while (stmts.hasNext()) {
                    Statement stmt = stmts.next();
                    RDFTerm subj = stmt.getSubject();
                    URI pred = stmt.getPredicate();
                    Node obj = stmt.getObject();
                    boolean match = false;
                    if (subj instanceof BlankNode) {
                        match = true;
                        URI newSubj = nodeMap.get(subj);
                        if (newSubj == null) {
                            newSubj = URIGenerator.createNewRandomUniqueURI();
                            nodeMap.put(subj.asBlankNode(), newSubj);
                        }
                        subj = newSubj;
                    }
                    if (obj instanceof BlankNode) {
                        match = true;
                        URI newObj = nodeMap.get(obj);
                        if (newObj == null) {
                            newObj = URIGenerator.createNewRandomUniqueURI();
                            nodeMap.put(obj.asBlankNode(), newObj);
                        }
                        obj = newObj;
                    }
                    if (match) {
                        // schedule the replacement: drop the original statement and
                        // add the variant that uses URIs instead of blank nodes
                        remove.addStatement(stmt);
                        add.addStatement(subj, pred, obj);
                    }
                }
            } finally {
                stmts.close();
            }
            ClosableIterator<Statement> addIt = add.iterator();
            try {
                ClosableIterator<Statement> removeIt = remove.iterator();
                try {
                    model.update(new DiffImpl(addIt, removeIt));
                } finally {
                    removeIt.close();
                }
            } finally {
                addIt.close();
            }
        } finally {
            remove.close();
        }
    } finally {
        // the temporary models are released on every path, including failures
        add.close();
    }
}
Also used : HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) Node(org.ontoware.rdf2go.model.node.Node) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) Model(org.ontoware.rdf2go.model.Model) DiffImpl(org.ontoware.rdf2go.model.impl.DiffImpl) RDFTerm(org.ontoware.rdf2go.model.node.RDFTerm) URI(org.ontoware.rdf2go.model.node.URI)

Example 3 with Statement

use of org.ontoware.rdf2go.model.Statement in project stanbol by apache.

the class MetaxaEngine method computeEnhancements.

/**
 * Extracts metadata and plain text from the given content item.
 * <p>
 * The content stream is passed to the configured extractor while holding the content
 * item's read lock. Blank nodes of the resulting model are replaced by URIs, then every
 * statement is processed:
 * <ul>
 * <li>statements about the document itself with the plain text property have their
 * object written to a newly created "text/plain" content sink (and are additionally
 * copied to the metadata if {@code includeText} is set);</li>
 * <li>all other statements are converted to Clerezza triples and collected.</li>
 * </ul>
 * The collected triples are added to the content item's metadata under its write lock.
 * If any non-empty text was written, the sink's blob is added to the content item as a
 * new part with a random {@code urn:metaxa:plain-text:} URI.
 *
 * @param ci the content item to process
 * @throws EngineException if extraction fails, the content sink cannot be created, or
 *         the plain text cannot be written to the sink
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // get model from the extraction
    URIImpl docId;
    Model m = null;
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // no model means there is nothing to extract
    if (null == m) {
        log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close();
        throw new EngineException("Unable to initialise Blob for storing the plain text content", e);
    }
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    // declared outside the try block so that the finally block can release whatever
    // has been acquired, even if one of the set-up steps below fails
    ClosableIterator<Statement> it = null;
    BufferedWriter out = null;
    //used to detect if some text was extracted
    boolean textExtracted = false;
    try {
        RDF2GoUtils.urifyBlankNodes(m);
        it = m.iterator();
        out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
        //first add to a temporary graph
        Graph g = new SimpleGraph();
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            //the plain text Blob!
            if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        out.write(text);
                    } catch (IOException e) {
                        throw new EngineException("Unable to write extracted plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                //add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added {}", t);
                }
            }
        }
        // Push buffered text to the sink now: the writer is closed quietly in the
        // finally block, which would otherwise hide a failed flush and let an
        // incomplete blob be attached to the content item.
        try {
            out.flush();
        } catch (IOException e) {
            throw new EngineException("Unable to write extracted plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
        }
        //add the extracted triples to the metadata of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
            g = null;
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        if (it != null) {
            it.close();
        }
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        //add plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNode(org.apache.clerezza.commons.rdf.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) Triple(org.apache.clerezza.commons.rdf.Triple) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) Model(org.ontoware.rdf2go.model.Model) ExtractorException(org.semanticdesktop.aperture.extractor.ExtractorException) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) OutputStreamWriter(java.io.OutputStreamWriter) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Example 4 with Statement

use of org.ontoware.rdf2go.model.Statement in project stanbol by apache.

the class MetaxaCore method getText.

/**
     * Collects the plain text of a document from its extracted metadata.
     * <p>
     * The literal values of all plain-text-content statements found in the
     * model are concatenated and the result is trimmed.
     *
     * @param model
     *            a {@link Model} holding the extracted metadata
     * @return the trimmed plain text as a {@link String}, or {@code null} if
     *         the metadata contained no (non-blank) plain text
     */
public static String getText(Model model) {
    ClosableIterator<Statement> matches = null;
    try {
        matches = model.findStatements(Variable.ANY, NIE.plainTextContent, Variable.ANY);
        StringBuilder buffer = new StringBuilder(10000);
        while (matches.hasNext()) {
            Node object = matches.next().getObject();
            // only literal objects contribute text
            if (!(object instanceof Literal)) {
                continue;
            }
            Literal literal = (Literal) object;
            buffer.append(literal.getValue());
        }
        String plainText = buffer.toString().trim();
        return plainText.length() == 0 ? null : plainText;
    } finally {
        // the iterator is still null if the lookup itself failed
        if (matches != null) {
            matches.close();
        }
    }
}
Also used : Statement(org.ontoware.rdf2go.model.Statement) Node(org.ontoware.rdf2go.model.node.Node) Literal(org.ontoware.rdf2go.model.node.Literal)

Aggregations

Statement (org.ontoware.rdf2go.model.Statement)4 HashMap (java.util.HashMap)3 BlankNode (org.ontoware.rdf2go.model.node.BlankNode)3 BlankNode (org.apache.clerezza.commons.rdf.BlankNode)2 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)2 IRI (org.apache.clerezza.commons.rdf.IRI)2 RDFTerm (org.apache.clerezza.commons.rdf.RDFTerm)2 Triple (org.apache.clerezza.commons.rdf.Triple)2 TripleImpl (org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)2 Model (org.ontoware.rdf2go.model.Model)2 Node (org.ontoware.rdf2go.model.node.Node)2 BufferedWriter (java.io.BufferedWriter)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Graph (org.apache.clerezza.commons.rdf.Graph)1 SimpleGraph (org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph)1 ContentSink (org.apache.stanbol.enhancer.servicesapi.ContentSink)1 EngineException (org.apache.stanbol.enhancer.servicesapi.EngineException)1 DiffImpl (org.ontoware.rdf2go.model.impl.DiffImpl)1 Literal (org.ontoware.rdf2go.model.node.Literal)1