Search in sources :

Example 1 with URIImpl

use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.

the class IksHtmlExtractor method main.

/**
 * Command-line entry point: runs the HTML extractor over every file named in
 * {@code args} and prints the extracted model for each one to standard output.
 *
 * @param args paths of the files to process, treated as UTF-8 encoded "text/html"
 * @throws Exception if a file cannot be opened or the extraction/serialization fails
 */
public static void main(String[] args) throws Exception {
    IksHtmlExtractor inst = new IksHtmlExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    // Identical for every input file, so resolved once up front.
    Charset charset = Charset.forName("UTF-8");
    String mimeType = "text/html";
    for (int i = 0; i < args.length; ++i) {
        File file = new File(args[i]);
        URI uri = new URIImpl(file.toURI().toString());
        InputStream input = new FileInputStream(file);
        try {
            RDFContainer container = rdfFactory.getRDFContainer(uri);
            try {
                inst.extract(uri, input, charset, mimeType, container);
                System.out.println("Model for " + args[i]);
                container.getModel().writeTo(System.out);
                System.out.println();
            } finally {
                // Release the container even when extraction or writing fails.
                container.dispose();
            }
        } finally {
            // The stream was previously left open for every processed file.
            input.close();
        }
    }
}
Also used : RDFContainer(org.semanticdesktop.aperture.rdf.RDFContainer) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Charset(java.nio.charset.Charset) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFContainerFactory(org.semanticdesktop.aperture.rdf.RDFContainerFactory) RDFContainerFactoryImpl(org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl) File(java.io.File) URI(org.ontoware.rdf2go.model.node.URI) FileInputStream(java.io.FileInputStream)

Example 2 with URIImpl

use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.

the class SimpleMailExtractor method main.

/**
 * Command-line entry point: runs the mail extractor over every file named in
 * {@code args} and writes the extracted model as RDF/XML to standard output.
 *
 * @param args paths of the files to process
 * @throws Exception if a file cannot be opened or the extraction/serialization fails
 */
public static void main(String[] args) throws Exception {
    SimpleMailExtractor extractor = new SimpleMailExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    for (int i = 0; i < args.length; ++i) {
        File file = new File(args[i]);
        URI uri = new URIImpl(file.toURI().toString());
        InputStream in = new FileInputStream(file);
        try {
            RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
            try {
                // Charset and mime type are passed as null, exactly as before.
                extractor.extract(uri, in, null, null, rdfContainer);
                rdfContainer.getModel().writeTo(System.out, Syntax.RdfXml);
            } finally {
                // Close the model even when extraction or writing fails.
                rdfContainer.getModel().close();
            }
        } finally {
            // The stream was previously left open for every processed file.
            in.close();
        }
    }
}
Also used : RDFContainer(org.semanticdesktop.aperture.rdf.RDFContainer) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Model(org.ontoware.rdf2go.model.Model) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFContainerFactory(org.semanticdesktop.aperture.rdf.RDFContainerFactory) RDFContainerFactoryImpl(org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl) File(java.io.File) URI(org.ontoware.rdf2go.model.node.URI) FileInputStream(java.io.FileInputStream)

Example 3 with URIImpl

use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.

the class TestMetaxaCore method testRdfaExtraction.

/**
 * Checks extraction from an RDFa-annotated HTML document: the extracted plain
 * text must equal the stored reference text (after {@code cleanup}) and the
 * model must yield the expected number of printed triples.
 *
 * @throws Exception if loading a resource or running the extraction fails
 */
@Test
public void testRdfaExtraction() throws Exception {
    final String inputName = "test-rdfa.html";
    final String referenceName = "rdfa-res.txt";

    // Run the extractor on the RDFa-annotated HTML document.
    InputStream source = getResourceAsStream(inputName);
    assertNotNull("failed to load resource " + inputName, source);
    URIImpl documentUri = new URIImpl("file://" + inputName);
    Model model = extractor.extract(source, documentUri, "text/html");
    String actualText = MetaxaCore.getText(model);

    // Load the reference text the extraction is compared against.
    InputStream reference = getResourceAsStream(referenceName);
    assertNotNull("failed to load resource " + referenceName, reference);
    String referenceText = IOUtils.toString(reference, "utf-8");

    // Compare the texts, then print the triples and verify their count.
    assertEquals(cleanup(referenceText), cleanup(actualText));
    assertEquals(10, this.printTriples(model));
}
Also used : InputStream(java.io.InputStream) Model(org.ontoware.rdf2go.model.Model) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) Test(org.junit.Test)

Example 4 with URIImpl

use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.

the class TestMetaxaCore method testPdfExtraction.

/**
 * Checks extraction from a PDF document: the extracted plain text must equal
 * the stored reference text (after {@code cleanup}) and the model must yield
 * the expected number of printed triples.
 *
 * @throws Exception if loading a resource or running the extraction fails
 */
@Test
public void testPdfExtraction() throws Exception {
    final String inputName = "test.pdf";
    final String referenceName = "pdf-res.txt";

    // Run the extractor on the PDF document.
    InputStream source = getResourceAsStream(inputName);
    assertNotNull("failed to load resource " + inputName, source);
    URIImpl documentUri = new URIImpl("file://" + inputName);
    Model model = extractor.extract(source, documentUri, "application/pdf");
    String actualText = MetaxaCore.getText(model);

    // Load the reference text the extraction is compared against.
    InputStream reference = getResourceAsStream(referenceName);
    assertNotNull("failed to load resource " + referenceName, reference);
    String referenceText = IOUtils.toString(reference, "utf-8");

    // Compare the texts, then print the triples and verify their count.
    assertEquals(cleanup(referenceText), cleanup(actualText));
    assertEquals(11, this.printTriples(model));
}
Also used : InputStream(java.io.InputStream) Model(org.ontoware.rdf2go.model.Model) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) Test(org.junit.Test)

Example 5 with URIImpl

use of org.ontoware.rdf2go.model.node.impl.URIImpl in project stanbol by apache.

the class MetaxaEngine method computeEnhancements.

/**
 * Extracts a model from the content item's stream, copies its statements into
 * the content item's metadata and, if any plain text was extracted, attaches
 * that text to the content item as an additional part.
 * <p>
 * Statements whose subject is the document URI and whose predicate is
 * {@code NIE_PLAINTEXT_PROPERTY} carry the plain text: their object is written
 * to a "text/plain" content sink and is only copied into the metadata when
 * {@code includeText} is set. All other statements are copied into the metadata.
 *
 * @param ci the content item to process
 * @throws EngineException if the extraction fails, the content sink cannot be
 *         created, or the extracted text cannot be written to the sink
 */
public void computeEnhancements(ContentItem ci) throws EngineException {
    // Run the extraction while holding the content item's read lock.
    URIImpl docId;
    Model m = null;
    ci.getLock().readLock().lock();
    try {
        docId = new URIImpl(ci.getUri().getUnicodeString());
        m = this.extractor.extract(ci.getStream(), docId, ci.getMimeType());
    } catch (ExtractorException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } catch (IOException e) {
        throw new EngineException("Error while processing ContentItem " + ci.getUri() + " with Metaxa", e);
    } finally {
        ci.getLock().readLock().unlock();
    }
    // Without a model there is nothing to add to the content item.
    if (null == m) {
        log.debug("Unable to process ContentItem {} (mime type {}) with Metaxa", ci.getUri(), ci.getMimeType());
        return;
    }
    ContentSink plainTextSink;
    try {
        plainTextSink = ciFactory.createContentSink("text/plain");
    } catch (IOException e) {
        m.close();
        throw new EngineException("Unable to initialise Blob for storing the plain text content", e);
    }
    HashMap<BlankNode, BlankNode> blankNodeMap = new HashMap<BlankNode, BlankNode>();
    ClosableIterator<Statement> it = null;
    BufferedWriter out = null;
    //used to detect if some text was extracted
    boolean textExtracted = false;
    try {
        // Set up inside the try block so the model is closed even if one of
        // these calls fails.
        RDF2GoUtils.urifyBlankNodes(m);
        it = m.iterator();
        out = new BufferedWriter(new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8));
        //first add to a temporary graph
        Graph g = new SimpleGraph();
        while (it.hasNext()) {
            Statement oneStmt = it.next();
            //the plain text Blob!
            if (oneStmt.getSubject().equals(docId) && oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)) {
                String text = oneStmt.getObject().toString();
                if (text != null && !text.isEmpty()) {
                    try {
                        out.write(text);
                    } catch (IOException e) {
                        throw new EngineException("Unable to write extracted plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
                    }
                    textExtracted = true;
                    if (includeText) {
                        BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                        IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                        RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                        g.add(new TripleImpl(subject, predicate, object));
                    }
                }
            } else {
                //add metadata to the metadata of the contentItem
                BlankNodeOrIRI subject = (BlankNodeOrIRI) asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                IRI predicate = (IRI) asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                RDFTerm object = asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    g.add(t);
                    log.debug("added {}", t);
                }
            }
        }
        // Flush explicitly: closeQuietly() below would hide a failed flush and
        // a truncated text blob could then be attached to the content item.
        try {
            out.flush();
        } catch (IOException e) {
            throw new EngineException("Unable to write extracted plain text to Blob (blob impl: " + plainTextSink.getBlob().getClass() + ")", e);
        }
        //add the extracted triples to the metadata of the ContentItem
        ci.getLock().writeLock().lock();
        try {
            ci.getMetadata().addAll(g);
        } finally {
            ci.getLock().writeLock().unlock();
        }
    } finally {
        if (it != null) {
            it.close();
        }
        m.close();
        IOUtils.closeQuietly(out);
    }
    if (textExtracted) {
        //add plain text to the content item
        IRI blobUri = new IRI("urn:metaxa:plain-text:" + randomUUID());
        ci.addPart(blobUri, plainTextSink.getBlob());
    }
}
Also used : IRI(org.apache.clerezza.commons.rdf.IRI) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) BlankNode(org.apache.clerezza.commons.rdf.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNodeOrIRI(org.apache.clerezza.commons.rdf.BlankNodeOrIRI) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFTerm(org.apache.clerezza.commons.rdf.RDFTerm) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) Triple(org.apache.clerezza.commons.rdf.Triple) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) Graph(org.apache.clerezza.commons.rdf.Graph) Model(org.ontoware.rdf2go.model.Model) ExtractorException(org.semanticdesktop.aperture.extractor.ExtractorException) SimpleGraph(org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph) OutputStreamWriter(java.io.OutputStreamWriter) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl)

Aggregations

URIImpl (org.ontoware.rdf2go.model.node.impl.URIImpl)7 InputStream (java.io.InputStream)6 Model (org.ontoware.rdf2go.model.Model)6 Test (org.junit.Test)4 File (java.io.File)2 FileInputStream (java.io.FileInputStream)2 URI (org.ontoware.rdf2go.model.node.URI)2 RDFContainer (org.semanticdesktop.aperture.rdf.RDFContainer)2 RDFContainerFactory (org.semanticdesktop.aperture.rdf.RDFContainerFactory)2 RDFContainerFactoryImpl (org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl)2 BufferedInputStream (java.io.BufferedInputStream)1 BufferedWriter (java.io.BufferedWriter)1 ByteArrayInputStream (java.io.ByteArrayInputStream)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Charset (java.nio.charset.Charset)1 HashMap (java.util.HashMap)1 BlankNode (org.apache.clerezza.commons.rdf.BlankNode)1 BlankNodeOrIRI (org.apache.clerezza.commons.rdf.BlankNodeOrIRI)1 Graph (org.apache.clerezza.commons.rdf.Graph)1