Search in sources :

Example 1 with Model

use of org.ontoware.rdf2go.model.Model in project stanbol by apache.

the class SimpleMailExtractor method main.

/**
 * Command-line entry point: runs a {@code SimpleMailExtractor} over every file
 * named in {@code args} and writes the RDF model extracted from each file to
 * standard output as RDF/XML.
 *
 * @param args paths of the files to process; each one is handled independently
 * @throws Exception if a file cannot be opened, or extraction or serialization fails
 */
public static void main(String[] args) throws Exception {
    SimpleMailExtractor extractor = new SimpleMailExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    for (String path : args) {
        File file = new File(path);
        URI uri = new URIImpl(file.toURI().toString());
        InputStream in = new FileInputStream(file);
        try {
            RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
            extractor.extract(uri, in, null, null, rdfContainer);
            Model model = rdfContainer.getModel();
            try {
                model.writeTo(System.out, Syntax.RdfXml);
            } finally {
                // release the model even when serialization fails
                model.close();
            }
        } finally {
            // the stream was opened here, so it is closed here regardless of the outcome
            in.close();
        }
    }
}
Also used : RDFContainer(org.semanticdesktop.aperture.rdf.RDFContainer) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Model(org.ontoware.rdf2go.model.Model) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFContainerFactory(org.semanticdesktop.aperture.rdf.RDFContainerFactory) RDFContainerFactoryImpl(org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl) File(java.io.File) URI(org.ontoware.rdf2go.model.node.URI) FileInputStream(java.io.FileInputStream)

Example 2 with Model

use of org.ontoware.rdf2go.model.Model in project stanbol by apache.

the class SimpleMailExtractor method processContent.

/**
 * Recursive worker that walks a piece of mail content.
 * <ul>
 * <li>A {@code String} is appended to {@code buffer}, followed by a single space.</li>
 * <li>A {@code BodyPart} with a file name is recorded as an attachment in {@code rdf};
 * its content is then processed recursively, with {@code text/html} string content
 * first passed through {@code extractTextFromHtml}.</li>
 * <li>A {@code Multipart} is dispatched on its MIME subtype to the matching handler.</li>
 * <li>Any other kind of object is ignored.</li>
 * </ul>
 *
 * @param content the object to process
 * @param buffer receives the collected text
 * @param rdf container that receives the attachment statements
 * @throws MessagingException if the mail API fails while inspecting a part
 * @throws IOException if the content of a part cannot be read
 * @throws ExtractorException if HTML text extraction fails
 */
protected void processContent(Object content, StringBuilder buffer, RDFContainer rdf) throws MessagingException, IOException, ExtractorException {
    if (content instanceof String) {
        buffer.append(content).append(' ');
        return;
    }

    if (content instanceof BodyPart) {
        BodyPart part = (BodyPart) content;
        DataHandler dataHandler = part.getDataHandler();
        String encoding = (dataHandler == null) ? null : MimeUtility.getEncoding(dataHandler);
        String fileName = part.getFileName();
        String contentType = part.getContentType();

        if (fileName != null) {
            // a named part is described as an attachment
            String attachmentName = fileName;
            try {
                attachmentName = MimeUtility.decodeWord(fileName);
            } catch (MessagingException e) {
                // thrown for file names that are not encoded at all; the raw name is kept
            }
            URI attachmentUri = URIGenerator.createNewRandomUniqueURI();
            rdf.add(NMO.hasAttachment, attachmentUri);
            Model model = rdf.getModel();
            model.addStatement(attachmentUri, RDF.type, NFO.Attachment);
            model.addStatement(attachmentUri, NFO.fileName, attachmentName);
            if (dataHandler != null && encoding != null) {
                model.addStatement(attachmentUri, NFO.encoding, encoding);
            }
            if (contentType != null) {
                // from here on only the base type (without parameters) is kept
                contentType = new ContentType(contentType).getBaseType();
                model.addStatement(attachmentUri, NIE.mimeType, contentType.trim());
            }
            // TODO: encoding?
        }

        // descend into the content of the part, if any
        Object partContent = part.getContent();
        if (contentType != null && partContent instanceof String) {
            contentType = contentType.toLowerCase();
            if (contentType.contains("text/html")) {
                // NOTE(review): 'encoding' comes from MimeUtility.getEncoding(handler) yet is
                // handed on as a charset name - confirm this is intended
                String charset = (encoding == null) ? null : MimeUtility.javaCharset(encoding);
                // strip the HTML markup before the text is collected
                partContent = extractTextFromHtml((String) partContent, charset, rdf);
            }
        }
        processContent(partContent, buffer, rdf);
        return;
    }

    if (content instanceof Multipart) {
        Multipart multipart = (Multipart) content;
        String rawType = multipart.getContentType();
        String subType = null;
        if (rawType != null) {
            subType = new ContentType(rawType).getSubType();
            if (subType != null) {
                subType = subType.trim().toLowerCase();
            }
        }

        if ("alternative".equals(subType)) {
            handleAlternativePart(multipart, buffer, rdf);
        } else if ("signed".equals(subType)) {
            handleProtectedPart(multipart, 0, buffer, rdf);
        } else if ("encrypted".equals(subType)) {
            handleProtectedPart(multipart, 1, buffer, rdf);
        } else {
            // multipart/mixed, /digest, /related, /parallel, /report and unknown subtypes
            handleMixedPart(multipart, buffer, rdf);
        }
    }
}
Also used : BodyPart(javax.mail.BodyPart) Multipart(javax.mail.Multipart) ContentType(javax.mail.internet.ContentType) MessagingException(javax.mail.MessagingException) Model(org.ontoware.rdf2go.model.Model) DataHandler(javax.activation.DataHandler) URI(org.ontoware.rdf2go.model.node.URI)

Example 3 with Model

use of org.ontoware.rdf2go.model.Model in project stanbol by apache.

the class SimpleMailExtractor method extractTextFromHtml.

/**
 * Extracts the textual content of an HTML string.
 * <p>
 * The HTML is run through an {@code HtmlTextExtractUtil}; title, author,
 * description, keywords and body text found in the resulting metadata are
 * concatenated into one string.
 *
 * @param string the HTML markup to process
 * @param charset name of the charset announced to the HTML extractor, may be {@code null}
 * @param rdf container whose described URI is used as the id of the temporary result container
 * @return the concatenated metadata and text
 * @throws ExtractorException if the HTML extractor cannot be initialized or extraction fails
 */
protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
    // parse the HTML and extract full-text and metadata
    HtmlTextExtractUtil extractor;
    try {
        extractor = new HtmlTextExtractUtil();
    } catch (InitializationException e) {
        ExtractorException failure = new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage());
        // keep the original exception so the root cause stays visible in stack traces
        failure.initCause(e);
        throw failure;
    }
    // Encode the text with the charset that is announced to the extractor, so bytes and
    // declared charset agree; a missing or unsupported name falls back to the platform default.
    byte[] htmlBytes = null;
    if (charset != null) {
        try {
            htmlBytes = string.getBytes(charset);
        } catch (java.io.UnsupportedEncodingException unsupported) {
            // handled by the fallback below
        }
    }
    if (htmlBytes == null) {
        htmlBytes = string.getBytes();
    }
    InputStream stream = new ByteArrayInputStream(htmlBytes);
    RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
    URI id = rdf.getDescribedUri();
    RDFContainer result = containerFactory.getRDFContainer(id);
    extractor.extract(id, charset, stream, result);
    Model meta = result.getModel();
    try {
        // append metadata and full-text to a string buffer
        StringBuilder buffer = new StringBuilder(32 * 1024);
        append(buffer, extractor.getTitle(meta), "\n");
        append(buffer, extractor.getAuthor(meta), "\n");
        append(buffer, extractor.getDescription(meta), "\n");
        List<String> keywords = extractor.getKeywords(meta);
        for (String kw : keywords) {
            append(buffer, kw, " ");
        }
        buffer.append("\n");
        append(buffer, extractor.getText(meta), " ");
        logger.debug("text extracted:\n{}", buffer);
        // return the buffer's content
        return buffer.toString();
    } finally {
        // the temporary model is released even when reading the metadata fails
        meta.close();
    }
}
Also used : RDFContainer(org.semanticdesktop.aperture.rdf.RDFContainer) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) RDFContainerFactory(org.semanticdesktop.aperture.rdf.RDFContainerFactory) InitializationException(org.apache.stanbol.enhancer.engines.metaxa.core.html.InitializationException) URI(org.ontoware.rdf2go.model.node.URI) ByteArrayInputStream(java.io.ByteArrayInputStream) HtmlTextExtractUtil(org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil) ExtractorException(org.semanticdesktop.aperture.extractor.ExtractorException) Model(org.ontoware.rdf2go.model.Model) RDFContainerFactoryImpl(org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl)

Example 4 with Model

use of org.ontoware.rdf2go.model.Model in project stanbol by apache.

the class TestMetaxaCore method testRdfaExtraction.

/**
 * Tests extraction from an RDFa annotated HTML document: the extracted text
 * must match the expected text after {@code cleanup}, and printing the model
 * must report 10 triples.
 *
 * @throws Exception if a test resource cannot be read or extraction fails
 */
@Test
public void testRdfaExtraction() throws Exception {
    String testFile = "test-rdfa.html";
    String testResultFile = "rdfa-res.txt";
    InputStream in = getResourceAsStream(testFile);
    try {
        assertNotNull("failed to load resource " + testFile, in);
        // extract text from RDFa annotated html
        Model m = extractor.extract(in, new URIImpl("file://" + testFile), "text/html");
        String text = MetaxaCore.getText(m);
        // get expected result
        String expectedText;
        InputStream in2 = getResourceAsStream(testResultFile);
        try {
            assertNotNull("failed to load resource " + testResultFile, in2);
            expectedText = IOUtils.toString(in2, "utf-8");
        } finally {
            // null-safe, and does not mask an assertion failure
            IOUtils.closeQuietly(in2);
        }
        // test
        assertEquals(cleanup(expectedText), cleanup(text));
        // show triples
        int tripleCounter = this.printTriples(m);
        assertEquals(10, tripleCounter);
    } finally {
        // closed last so that the extraction never sees a closed stream
        IOUtils.closeQuietly(in);
    }
}
Also used : InputStream(java.io.InputStream) Model(org.ontoware.rdf2go.model.Model) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) Test(org.junit.Test)

Example 5 with Model

use of org.ontoware.rdf2go.model.Model in project stanbol by apache.

the class RDF2GoUtils method urifyBlankNodes.

/**
 * Replaces the blank nodes in {@code model} with newly generated URIs.
 * <p>
 * Every statement whose subject or object is a blank node is swapped for a
 * statement that uses a URI from {@code URIGenerator.createNewRandomUniqueURI()}
 * instead. Within one call the same blank node is always mapped to the same URI.
 * Predicates are left untouched.
 * <p>
 * The changes are collected in two temporary models while iterating and are
 * applied to {@code model} afterwards as a single diff, so the model is not
 * modified while it is being iterated.
 *
 * @param model the model to rewrite in place
 */
public static void urifyBlankNodes(Model model) {
    HashMap<BlankNode, URI> nodeMap = new HashMap<BlankNode, URI>();
    Model add = RDF2Go.getModelFactory().createModel();
    add.open();
    try {
        Model remove = RDF2Go.getModelFactory().createModel();
        remove.open();
        try {
            for (Statement stmt : model) {
                RDFTerm subj = stmt.getSubject();
                URI pred = stmt.getPredicate();
                Node obj = stmt.getObject();
                boolean match = false;
                if (subj instanceof BlankNode) {
                    match = true;
                    URI newSubj = nodeMap.get(subj);
                    if (newSubj == null) {
                        // first occurrence of this blank node: mint its URI
                        newSubj = URIGenerator.createNewRandomUniqueURI();
                        nodeMap.put(subj.asBlankNode(), newSubj);
                    }
                    subj = newSubj;
                }
                if (obj instanceof BlankNode) {
                    match = true;
                    URI newObj = nodeMap.get(obj);
                    if (newObj == null) {
                        newObj = URIGenerator.createNewRandomUniqueURI();
                        nodeMap.put(obj.asBlankNode(), newObj);
                    }
                    obj = newObj;
                }
                if (match) {
                    // schedule the replacement: drop the old statement, add the rewritten one
                    remove.addStatement(stmt);
                    add.addStatement(subj, pred, obj);
                }
            }
            ClosableIterator<Statement> addIt = add.iterator();
            try {
                ClosableIterator<Statement> removeIt = remove.iterator();
                try {
                    model.update(new DiffImpl(addIt, removeIt));
                } finally {
                    removeIt.close();
                }
            } finally {
                addIt.close();
            }
        } finally {
            remove.close();
        }
    } finally {
        // the temporary models are released even when iteration or the update fails
        add.close();
    }
}
Also used : HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) Node(org.ontoware.rdf2go.model.node.Node) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) Model(org.ontoware.rdf2go.model.Model) DiffImpl(org.ontoware.rdf2go.model.impl.DiffImpl) RDFTerm(org.ontoware.rdf2go.model.node.RDFTerm) URI(org.ontoware.rdf2go.model.node.URI)

Aggregations

Model (org.ontoware.rdf2go.model.Model)11 InputStream (java.io.InputStream)6 URIImpl (org.ontoware.rdf2go.model.node.impl.URIImpl)6 Test (org.junit.Test)4 URI (org.ontoware.rdf2go.model.node.URI)4 RDFContainer (org.semanticdesktop.aperture.rdf.RDFContainer)3 RDFContainerFactory (org.semanticdesktop.aperture.rdf.RDFContainerFactory)3 RDFContainerFactoryImpl (org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl)3 ByteArrayInputStream (java.io.ByteArrayInputStream)2 FileInputStream (java.io.FileInputStream)2 HashMap (java.util.HashMap)2 Statement (org.ontoware.rdf2go.model.Statement)2 BlankNode (org.ontoware.rdf2go.model.node.BlankNode)2 RDFTerm (org.ontoware.rdf2go.model.node.RDFTerm)2 ExtractorException (org.semanticdesktop.aperture.extractor.ExtractorException)2 BufferedInputStream (java.io.BufferedInputStream)1 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1