Search in sources :

Example 1 with URI

use of org.ontoware.rdf2go.model.node.URI in project stanbol by apache.

the class IksHtmlExtractor method main.

/**
 * Command-line driver: runs the HTML extractor over every file named in
 * {@code args} and prints the resulting RDF model to standard output.
 *
 * @param args paths of the HTML files to process; each is read as UTF-8 {@code text/html}
 * @throws Exception if a file cannot be opened or extraction/serialization fails
 */
public static void main(String[] args) throws Exception {
    IksHtmlExtractor inst = new IksHtmlExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    // identical for every input file, so computed once
    Charset charset = Charset.forName("UTF-8");
    String mimeType = "text/html";
    for (String arg : args) {
        File file = new File(arg);
        URI uri = new URIImpl(file.toURI().toString());
        InputStream input = new FileInputStream(file);
        try {
            RDFContainer container = rdfFactory.getRDFContainer(uri);
            try {
                inst.extract(uri, input, charset, mimeType, container);
                System.out.println("Model for " + arg);
                container.getModel().writeTo(System.out);
                System.out.println();
            } finally {
                // release the container even when extraction or serialization fails
                container.dispose();
            }
        } finally {
            // the original never closed the file stream
            input.close();
        }
    }
}
Also used : RDFContainer(org.semanticdesktop.aperture.rdf.RDFContainer) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Charset(java.nio.charset.Charset) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFContainerFactory(org.semanticdesktop.aperture.rdf.RDFContainerFactory) RDFContainerFactoryImpl(org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl) File(java.io.File) URI(org.ontoware.rdf2go.model.node.URI) FileInputStream(java.io.FileInputStream)

Example 2 with URI

use of org.ontoware.rdf2go.model.node.URI in project stanbol by apache.

the class SimpleMailExtractor method extractTextFromHtml.

/**
 * Runs the HTML text extractor over an HTML string and returns the title,
 * author, description, keywords and body text concatenated into one string.
 *
 * @param string  the HTML markup to process
 * @param charset name of the charset handed to the extractor for decoding; may be {@code null}
 * @param rdf     container whose described URI is used as the id of the temporary result container
 * @return the extracted metadata and text, newline/space separated
 * @throws ExtractorException if the HTML extractor cannot be initialized or extraction fails
 */
protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
    // parse the HTML and extract full-text and metadata
    HtmlTextExtractUtil extractor;
    try {
        extractor = new HtmlTextExtractUtil();
    } catch (InitializationException e) {
        ExtractorException wrapped = new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage());
        // keep the original stack trace available to callers
        wrapped.initCause(e);
        throw wrapped;
    }
    // Encode with the same charset the extractor is told to decode with; the
    // platform default is only a fallback when no usable charset name is given.
    byte[] bytes;
    try {
        bytes = (charset != null) ? string.getBytes(charset) : string.getBytes();
    } catch (java.io.UnsupportedEncodingException e) {
        bytes = string.getBytes();
    }
    InputStream stream = new ByteArrayInputStream(bytes);
    RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
    URI id = rdf.getDescribedUri();
    RDFContainer result = containerFactory.getRDFContainer(id);
    try {
        extractor.extract(id, charset, stream, result);
        Model meta = result.getModel();
        // append metadata and full-text to a string buffer
        StringBuilder buffer = new StringBuilder(32 * 1024);
        append(buffer, extractor.getTitle(meta), "\n");
        append(buffer, extractor.getAuthor(meta), "\n");
        append(buffer, extractor.getDescription(meta), "\n");
        List<String> keywords = extractor.getKeywords(meta);
        for (String kw : keywords) {
            append(buffer, kw, " ");
        }
        buffer.append("\n");
        append(buffer, extractor.getText(meta), " ");
        logger.debug("text extracted:\n{}", buffer);
        // return the buffer's content
        return buffer.toString();
    } finally {
        // close the temporary model on every path, including extraction failure
        result.getModel().close();
    }
}
Also used : RDFContainer(org.semanticdesktop.aperture.rdf.RDFContainer) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) RDFContainerFactory(org.semanticdesktop.aperture.rdf.RDFContainerFactory) InitializationException(org.apache.stanbol.enhancer.engines.metaxa.core.html.InitializationException) URI(org.ontoware.rdf2go.model.node.URI) ByteArrayInputStream(java.io.ByteArrayInputStream) HtmlTextExtractUtil(org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil) ExtractorException(org.semanticdesktop.aperture.extractor.ExtractorException) Model(org.ontoware.rdf2go.model.Model) RDFContainerFactoryImpl(org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl)

Example 3 with URI

use of org.ontoware.rdf2go.model.node.URI in project stanbol by apache.

the class SimpleMailExtractor method main.

/**
 * Command-line driver: runs the mail extractor over every file named in
 * {@code args} and writes each resulting model to standard output as RDF/XML.
 *
 * @param args paths of the mail files to process
 * @throws Exception if a file cannot be opened or extraction/serialization fails
 */
public static void main(String[] args) throws Exception {
    SimpleMailExtractor extractor = new SimpleMailExtractor();
    RDFContainerFactory rdfFactory = new RDFContainerFactoryImpl();
    for (String arg : args) {
        File file = new File(arg);
        URI uri = new URIImpl(file.toURI().toString());
        InputStream in = new FileInputStream(file);
        try {
            RDFContainer rdfContainer = rdfFactory.getRDFContainer(uri);
            try {
                extractor.extract(uri, in, null, null, rdfContainer);
                rdfContainer.getModel().writeTo(System.out, Syntax.RdfXml);
            } finally {
                // close the model even when extraction or serialization fails
                rdfContainer.getModel().close();
            }
        } finally {
            // the original never closed the file stream
            in.close();
        }
    }
}
Also used : RDFContainer(org.semanticdesktop.aperture.rdf.RDFContainer) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) Model(org.ontoware.rdf2go.model.Model) URIImpl(org.ontoware.rdf2go.model.node.impl.URIImpl) RDFContainerFactory(org.semanticdesktop.aperture.rdf.RDFContainerFactory) RDFContainerFactoryImpl(org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl) File(java.io.File) URI(org.ontoware.rdf2go.model.node.URI) FileInputStream(java.io.FileInputStream)

Example 4 with URI

use of org.ontoware.rdf2go.model.node.URI in project stanbol by apache.

the class SimpleMailExtractor method processContent.

/**
 * Recursive part of the mail traversal. Plain strings are appended to
 * {@code buffer}; body parts are unwrapped (recording attachment metadata in
 * {@code rdf} when a file name is present, and stripping HTML markup from
 * {@code text/html} content) and their content is processed recursively;
 * multiparts are dispatched to a handler chosen by their MIME subtype.
 * Any other content type is ignored.
 *
 * @param content the object to process: a {@code String}, {@code BodyPart} or {@code Multipart}
 * @param buffer  receives the extracted text
 * @param rdf     receives attachment statements
 * @throws MessagingException if a mail part or one of its headers cannot be read or parsed
 * @throws IOException        if the content of a body part cannot be read
 * @throws ExtractorException if HTML text extraction fails
 */
protected void processContent(Object content, StringBuilder buffer, RDFContainer rdf) throws MessagingException, IOException, ExtractorException {
    if (content instanceof String) {
        buffer.append(content);
        buffer.append(' ');
    } else if (content instanceof BodyPart) {
        BodyPart bodyPart = (BodyPart) content;
        DataHandler handler = bodyPart.getDataHandler();
        // content-transfer-encoding (e.g. base64), NOT a character set
        String encoding = null;
        if (handler != null) {
            encoding = MimeUtility.getEncoding(handler);
        }
        String fileName = bodyPart.getFileName();
        String contentType = bodyPart.getContentType();
        // keep the full header value: contentType may be reduced to its base type below
        String rawContentType = contentType;
        if (fileName != null) {
            try {
                fileName = MimeUtility.decodeWord(fileName);
            } catch (MessagingException ignored) {
                // happens on unencoded file names! so just ignore it and leave the file name as it is
            }
            URI attachURI = URIGenerator.createNewRandomUniqueURI();
            rdf.add(NMO.hasAttachment, attachURI);
            Model m = rdf.getModel();
            m.addStatement(attachURI, RDF.type, NFO.Attachment);
            m.addStatement(attachURI, NFO.fileName, fileName);
            // encoding is only ever non-null when a handler was present
            if (encoding != null) {
                m.addStatement(attachURI, NFO.encoding, encoding);
            }
            if (contentType != null) {
                contentType = (new ContentType(contentType)).getBaseType();
                m.addStatement(attachURI, NIE.mimeType, contentType.trim());
            }
        }
        // append the content, if any
        content = bodyPart.getContent();
        // remove any html markup if necessary
        if (contentType != null && content instanceof String) {
            // MIME type names are ASCII; avoid locale-specific case mapping
            contentType = contentType.toLowerCase(java.util.Locale.ROOT);
            if (contentType.indexOf("text/html") >= 0) {
                // The charset comes from the Content-Type "charset" parameter; the
                // transfer encoding obtained above is not a charset name.
                String charset = null;
                try {
                    String mimeCharset = new ContentType(rawContentType).getParameter("charset");
                    if (mimeCharset != null) {
                        charset = MimeUtility.javaCharset(mimeCharset);
                    }
                } catch (MessagingException ignored) {
                    // unparsable Content-Type header: pass no charset rather than failing the part
                }
                content = extractTextFromHtml((String) content, charset, rdf);
            }
        }
        processContent(content, buffer, rdf);
    } else if (content instanceof Multipart) {
        Multipart multipart = (Multipart) content;
        String subType = null;
        String contentType = multipart.getContentType();
        if (contentType != null) {
            ContentType ct = new ContentType(contentType);
            subType = ct.getSubType();
            if (subType != null) {
                // Locale.ROOT: under e.g. a Turkish default locale "SIGNED" would
                // lower-case to a dotless i and miss the comparisons below
                subType = subType.trim().toLowerCase(java.util.Locale.ROOT);
            }
        }
        if ("alternative".equals(subType)) {
            handleAlternativePart(multipart, buffer, rdf);
        } else if ("signed".equals(subType)) {
            handleProtectedPart(multipart, 0, buffer, rdf);
        } else if ("encrypted".equals(subType)) {
            handleProtectedPart(multipart, 1, buffer, rdf);
        } else {
            // handles multipart/mixed, /digest, /related, /parallel, /report and unknown subtypes
            handleMixedPart(multipart, buffer, rdf);
        }
    }
}
Also used : BodyPart(javax.mail.BodyPart) Multipart(javax.mail.Multipart) ContentType(javax.mail.internet.ContentType) MessagingException(javax.mail.MessagingException) Model(org.ontoware.rdf2go.model.Model) DataHandler(javax.activation.DataHandler) URI(org.ontoware.rdf2go.model.node.URI)

Example 5 with URI

use of org.ontoware.rdf2go.model.node.URI in project stanbol by apache.

the class RDF2GoUtils method urifyBlankNodes.

/**
 * Replaces every blank node that occurs as subject or object of a statement
 * in {@code model} with a freshly generated URI. The same blank node is
 * always mapped to the same URI within one call. Affected statements are
 * collected first and applied to {@code model} afterwards as a single diff.
 *
 * @param model the model to rewrite in place
 */
public static void urifyBlankNodes(Model model) {
    // remembers the URI assigned to each blank node so repeated occurrences stay linked
    HashMap<BlankNode, URI> nodeMap = new HashMap<BlankNode, URI>();
    Model add = RDF2Go.getModelFactory().createModel();
    add.open();
    try {
        Model remove = RDF2Go.getModelFactory().createModel();
        remove.open();
        try {
            for (Statement stmt : model) {
                RDFTerm subj = stmt.getSubject();
                URI pred = stmt.getPredicate();
                Node obj = stmt.getObject();
                boolean match = false;
                if (subj instanceof BlankNode) {
                    match = true;
                    URI newSubj = nodeMap.get(subj);
                    if (newSubj == null) {
                        newSubj = URIGenerator.createNewRandomUniqueURI();
                        nodeMap.put(subj.asBlankNode(), newSubj);
                    }
                    subj = newSubj;
                }
                if (obj instanceof BlankNode) {
                    match = true;
                    URI newObj = nodeMap.get(obj);
                    if (newObj == null) {
                        newObj = URIGenerator.createNewRandomUniqueURI();
                        nodeMap.put(obj.asBlankNode(), newObj);
                    }
                    obj = newObj;
                }
                if (match) {
                    // schedule the swap; the model itself is only changed after the loop
                    remove.addStatement(stmt);
                    add.addStatement(subj, pred, obj);
                }
            }
            ClosableIterator<Statement> addIt = add.iterator();
            try {
                ClosableIterator<Statement> removeIt = remove.iterator();
                try {
                    model.update(new DiffImpl(addIt, removeIt));
                } finally {
                    removeIt.close();
                }
            } finally {
                addIt.close();
            }
        } finally {
            // temporary models are released even when iteration or the update fails
            remove.close();
        }
    } finally {
        add.close();
    }
}
Also used : HashMap(java.util.HashMap) Statement(org.ontoware.rdf2go.model.Statement) Node(org.ontoware.rdf2go.model.node.Node) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) BlankNode(org.ontoware.rdf2go.model.node.BlankNode) Model(org.ontoware.rdf2go.model.Model) DiffImpl(org.ontoware.rdf2go.model.impl.DiffImpl) RDFTerm(org.ontoware.rdf2go.model.node.RDFTerm) URI(org.ontoware.rdf2go.model.node.URI)

Aggregations

URI (org.ontoware.rdf2go.model.node.URI)5 Model (org.ontoware.rdf2go.model.Model)4 FileInputStream (java.io.FileInputStream)3 InputStream (java.io.InputStream)3 RDFContainer (org.semanticdesktop.aperture.rdf.RDFContainer)3 RDFContainerFactory (org.semanticdesktop.aperture.rdf.RDFContainerFactory)3 RDFContainerFactoryImpl (org.semanticdesktop.aperture.rdf.impl.RDFContainerFactoryImpl)3 ByteArrayInputStream (java.io.ByteArrayInputStream)2 File (java.io.File)2 URIImpl (org.ontoware.rdf2go.model.node.impl.URIImpl)2 BufferedInputStream (java.io.BufferedInputStream)1 Charset (java.nio.charset.Charset)1 HashMap (java.util.HashMap)1 DataHandler (javax.activation.DataHandler)1 BodyPart (javax.mail.BodyPart)1 MessagingException (javax.mail.MessagingException)1 Multipart (javax.mail.Multipart)1 ContentType (javax.mail.internet.ContentType)1 HtmlTextExtractUtil (org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil)1 InitializationException (org.apache.stanbol.enhancer.engines.metaxa.core.html.InitializationException)1