Search in sources :

Example 1 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class WebArchiveConverter method conversionImplementation.

/**
 * Converts an MHT or .webarchive file to plain text.
 *
 * <p>The superclass performs the initial conversion. When that result carries raw child
 * items, the text of each usable child is appended to the parent text:
 * {@code application/octet-stream} children go through a {@link DefaultConverter},
 * {@code text/html} children through a {@link TikaHTMLConverter}, and image children are
 * represented by a short placeholder. Children without a MIME type are skipped.
 *
 * <p>Alternatively, the archive could be exploded on disk and every child converted
 * separately; see the MessageConverter base and ArchiveNavigator solutions for that.
 *
 * @param in stream
 * @param doc original file
 * @return the parent document, flagged as a web archive, with child text merged in
 * @throws IOException if the parent or a child conversion fails
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    TikaHTMLConverter htmlConverter = new TikaHTMLConverter(false);
    DefaultConverter genericConverter = new DefaultConverter();

    ConvertedDocument archive = super.conversionImplementation(in, doc);
    archive.is_webArchive = true;
    if (!archive.hasRawChildren()) {
        return archive;
    }

    StringBuilder childText = new StringBuilder();
    for (Content child : archive.getRawChildren()) {
        String mime = child.mimeType;
        logger.info("{} {} {}", archive.id, child.id, mime);
        if (mime == null) {
            continue;
        }

        ConvertedDocument converted;
        if ("application/octet-stream".equalsIgnoreCase(mime)) {
            converted = genericConverter.convert(TikaInputStream.get(child.content));
        } else if (mime.startsWith("text/html")) {
            converted = htmlConverter.convert(TikaInputStream.get(child.content));
        } else {
            // Neither generic binary nor HTML: only images leave a trace, as a placeholder.
            if (mime.startsWith("image")) {
                childText.append(String.format("\n[Image: %s type='%s']  ", child.id, mime));
            }
            continue;
        }

        // Drop children that produced nothing, or whose text is web script
        // (comments, javascript, etc. that ride along in these archives).
        if (converted == null || !converted.hasText() || isWebScript(converted.getText())) {
            continue;
        }
        childText.append(converted.getText());
        childText.append("\n==================\n");
    }

    String merged;
    if (!archive.hasText()) {
        merged = childText.toString();
    } else {
        merged = archive.getText() + "\n\n==================\n\n" + childText.toString();
    }
    archive.setText(merged);
    return archive;
}
Also used : Content(org.opensextant.xtext.Content) ConvertedDocument(org.opensextant.xtext.ConvertedDocument)

Example 2 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class Decomposer method main.

/**
 * Command-line entry point: converts the file named by {@code -i} with an
 * {@link EmbeddedContentConverter} and prints the path of the resulting document.
 *
 * <p>Options parsed: {@code -i <input>}, {@code -o <output>} and {@code -e}. The output and
 * embed options are accepted, but this method does not use their values. Any other option
 * (including {@code -h}), an option parsing failure, or a missing {@code -i} prints the
 * usage text and exits with status 1.
 *
 * @param args command-line arguments
 */
public static void main(String[] args) {
    gnu.getopt.Getopt opts = new gnu.getopt.Getopt("Decomposer", args, "hei:o:");
    String input = null;
    try {
        int c;
        while ((c = opts.getopt()) != -1) {
            switch(c) {
                case 'i':
                    input = opts.getOptarg();
                    break;
                case 'o':
                    // Accepted for compatibility; the output folder is not used here.
                    break;
                case 'e':
                    System.out.println("Saving conversions to Input folder.  Output folder will be ignored.");
                    break;
                default:
                    Decomposer.usage();
                    System.exit(1);
            }
        }
    } catch (Exception err) {
        Decomposer.usage();
        System.exit(1);
    }
    // Without -i there is nothing to convert; new File(null) would throw a NullPointerException.
    if (input == null) {
        Decomposer.usage();
        System.exit(1);
        return;
    }
    EmbeddedContentConverter conv = new EmbeddedContentConverter(0x200000);
    try {
        ConvertedDocument d = conv.convert(new File(input));
        if (d == null) {
            System.err.println("No document was produced for input: " + input);
            return;
        }
        System.out.println("Found Doc:" + d.getFilepath());
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : EmbeddedContentConverter(org.opensextant.xtext.converters.EmbeddedContentConverter) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) File(java.io.File)

Example 3 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class TextTranscodingConverter method conversionImplementation.

/**
 * A converter that tries to get a decent encoding (ASCII, UTF-8 or other)
 * and then decides whether the buffer needs converting.
 *
 * <p>IF ASCII OR the configured output encoding: accept the file as is, do not convert.
 * ELSE the file must be read in and converted.
 *
 * <p>CAVEAT: If the file is short and encoding detection has low confidence, ALSO
 * do not convert. Treat it as a plain text file.
 *
 * @param in input stream; it is always closed by this method. When {@code doc} is
 *           non-null the bytes are read from the file instead of from this stream.
 * @param doc original file, or null when only a stream is available
 * @return plain-text document with its encoding, text and {@code do_convert} flag set
 * @throws IOException if no input is available, the data cannot be read, or no
 *                     character encoding can be determined or decoded
 */
@Override
protected ConvertedDocument conversionImplementation(java.io.InputStream in, java.io.File doc) throws IOException {
    ConvertedDocument textdoc = new ConvertedDocument(doc);
    byte[] data = null;
    if (in != null) {
        // Get byte data from input stream or file; close the stream even if reading fails.
        try {
            if (doc != null) {
                data = FileUtility.readBytesFrom(doc);
            } else {
                data = IOUtils.toByteArray(in);
            }
        } finally {
            in.close();
        }
    }
    if (data == null) {
        // Fail with a checked, descriptive error instead of a NullPointerException further down.
        throw new IOException("No input stream available to read text content");
    }
    // Encoding heuristics here.....
    //
    // Objective:  mark small plain text payloads with unknown character set
    //             as not worthy of conversion.  Leave them as plain/text
    //             indeed they might even be straight Unicode
    //
    // Test for ASCII only first, otherwise try to detect the best charset for the text
    //
    textdoc.is_plaintext = true;
    boolean is_ascii = TextUtils.isASCII(data);
    if (is_ascii) {
        textdoc.do_convert = false;
        textdoc.setEncoding("ASCII");
        // Decode explicitly as ASCII rather than relying on the platform default charset.
        textdoc.setText(new String(data, java.nio.charset.StandardCharsets.US_ASCII));
    } else {
        chardet.setText(data);
        CharsetMatch cs = chardet.detect();
        if (cs == null) {
            // The detector found no candidate charset at all.
            throw new IOException("Unable to detect character encoding of text content");
        }
        if (ConvertedDocument.OUTPUT_ENCODING.equalsIgnoreCase(cs.getName())) {
            textdoc.do_convert = false;
        } else if (data.length < IGNORE_THRESHOLD_SIZE && cs.getConfidence() < IGNORE_THRESHOLD_CONF) {
            // Short payload with a low-confidence guess: leave it as plain text.
            textdoc.do_convert = false;
        }
        textdoc.setEncoding(cs.getName());
        textdoc.setText(new String(data, cs.getName()));
    }
    return textdoc;
}
Also used : CharsetMatch(com.ibm.icu.text.CharsetMatch) ConvertedDocument(org.opensextant.xtext.ConvertedDocument)

Example 4 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class MessageConverter method convertMimeMessage.

/**
 * Builds a converted document from a MIME message, with or without a backing file.
 * <ul>
 *   <li>live email capture from a mailbox: you have the MimeMessage; there is no File object</li>
 *   <li>email capture from a filesystem: you retrieved the MimeMessage from a File object</li>
 * </ul>
 *
 * @param msg javamail Message obj
 * @param doc file the message came from, or null when there is none
 * @return doc conversion, likely a parent document with 1 or more child attachments
 * @throws MessagingException on err
 * @throws IOException on err
 */
public ConvertedDocument convertMimeMessage(Message msg, File doc) throws MessagingException, IOException {
    ConvertedDocument parent = new ConvertedDocument(doc);
    parent.is_RFC822_attachment = true;
    setMailAttributes(parent, msg);

    // Prefix is the file's base name when a file exists, otherwise the document id.
    String prefix;
    if (doc == null) {
        prefix = parent.id;
    } else {
        prefix = FilenameUtils.getBaseName(doc.getName());
    }

    // Walk the message, collecting plain text and attachments.
    StringBuilder body = new StringBuilder();
    parseMessage(msg, parent, body, prefix);
    parent.setText(body.toString());
    return parent;
}
Also used : ConvertedDocument(org.opensextant.xtext.ConvertedDocument)

Example 5 with ConvertedDocument

use of org.opensextant.xtext.ConvertedDocument in project Xponents by OpenSextant.

the class EmbeddedContentConverter method conversionImplementation.

/**
 * Converts a document of a supported compound type and folds the text of its embedded
 * items into the parent document's text.
 * Trivial embedded icons and other components will not be extracted.
 *
 * <p>The superclass performs the base conversion. If there is no file, or its extension is
 * not supported, that base result is returned unchanged. Otherwise the file is reopened and
 * its embedded objects are extracted. When raw children were collected, the parent text
 * becomes the original text, an "Embedded Objects" marker and the rendered child text.
 *
 * @param in stream passed through to the superclass conversion
 * @param doc original file; embedded extraction is skipped when this is null
 * @return the converted parent document
 * @throws IOException if the base conversion fails or the embedded content cannot be parsed
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument compoundDoc = super.conversionImplementation(in, doc);
    if (doc == null) {
        // No file means no extension to test and nothing to reopen for extraction.
        return compoundDoc;
    }
    String ext = FilenameUtils.getExtension(doc.getName());
    if (!isSupported(ext)) {
        // Not really compound by our standards here.
        return compoundDoc;
    }
    ParserContainerExtractor extractor = new ParserContainerExtractor();
    EmbeddedObjectExtractor objExtractor = new EmbeddedObjectExtractor(compoundDoc, true);
    // try-with-resources: the stream is closed on every path, and a failure to open it
    // no longer triggers a NullPointerException from a finally block.
    try (TikaInputStream tikaStream = TikaInputStream.get(doc.toPath())) {
        extractor.extract(tikaStream, extractor, objExtractor);
        compoundDoc.is_converted = true;
        if (compoundDoc.hasRawChildren()) {
            // Create text buffer for this compound document here.
            // If raw children should be post-processed by some other means, that is up to caller.
            // This parent document at least contains a complete text representation of the content in the original doc.
            StringBuilder completeText = new StringBuilder();
            String parentText = compoundDoc.getText();
            if (parentText != null) {
                // Avoid emitting the literal "null" when the parent has no text of its own.
                completeText.append(parentText);
            }
            completeText.append("\n==Embedded Objects==\n");
            completeText.append(renderText(compoundDoc.getRawChildren()));
            compoundDoc.setText(completeText.toString());
        }
        return compoundDoc;
    } catch (Exception e) {
        throw new IOException("Stream parsing problem", e);
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) MimeTypeException(org.apache.tika.mime.MimeTypeException)

Aggregations

ConvertedDocument (org.opensextant.xtext.ConvertedDocument)17 IOException (java.io.IOException)11 Content (org.opensextant.xtext.Content)4 File (java.io.File)3 Metadata (org.apache.tika.metadata.Metadata)3 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)3 HashMap (java.util.HashMap)2 Test (org.junit.Test)2 CharsetMatch (com.ibm.icu.text.CharsetMatch)1 StringWriter (java.io.StringWriter)1 URL (java.net.URL)1 ParseException (java.text.ParseException)1 MimeType (javax.activation.MimeType)1 MessagingException (javax.mail.MessagingException)1 PDDocument (org.apache.pdfbox.pdmodel.PDDocument)1 PDDocumentInformation (org.apache.pdfbox.pdmodel.PDDocumentInformation)1 MAPIMessage (org.apache.poi.hsmf.MAPIMessage)1 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)1 ChunkNotFoundException (org.apache.poi.hsmf.exceptions.ChunkNotFoundException)1 WordExtractor (org.apache.poi.hwpf.extractor.WordExtractor)1