Search in sources :

Example 31 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class MboxParser.

/**
 * Splits an mbox stream into individual messages and hands each one to the
 * embedded document extractor.
 *
 * <p>The stream is decoded as windows-1252. A message starts at a line beginning with
 * {@code MBOX_RECORD_DIVIDER} and runs until the next such line, the end of the stream,
 * or until the buffered message reaches {@code MAIL_MAX_SIZE} bytes; lines beyond that
 * limit are skipped up to the next divider. Every collected line (with folded
 * continuation lines joined to the line they continue) is passed to
 * {@code saveHeaderInMetadata}.
 *
 * @param stream   the mbox content; closed when parsing finishes
 * @param handler  receives the XHTML events
 * @param metadata receives the content type and encoding of the mbox container
 * @param context  used to look up the embedded document extractor
 * @throws IOException   if the stream cannot be read
 * @throws TikaException if an embedded message cannot be parsed
 * @throws SAXException  if the handler rejects an event
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    String charsetName = "windows-1252";
    metadata.set(Metadata.CONTENT_TYPE, MBOX_MIME_TYPE);
    metadata.set(Metadata.CONTENT_ENCODING, charsetName);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    InputStreamReader isr = new InputStreamReader(stream, charsetName);
    try (BufferedReader reader = new BufferedReader(isr)) {
        String curLine = reader.readLine();
        int mailItem = 0;
        // An empty stream yields null on the very first read, so the null check
        // has to happen before the line is inspected.
        while (curLine != null) {
            if (curLine.startsWith(MBOX_RECORD_DIVIDER)) {
                Metadata mailMetadata = new Metadata();
                // Kept as LinkedList so the most recently added line can be taken back off the tail.
                LinkedList<String> multiline = new LinkedList<String>();
                mailMetadata.add(EMAIL_FROMLINE_METADATA, curLine.substring(MBOX_RECORD_DIVIDER.length()));
                mailMetadata.set(Metadata.CONTENT_TYPE, "message/rfc822");
                curLine = reader.readLine();
                if (curLine == null) {
                    // Divider line with nothing after it: there is no message to emit.
                    break;
                }
                ByteArrayOutputStream message = new ByteArrayOutputStream(100000);
                do {
                    if (curLine.startsWith(" ") || curLine.startsWith("\t")) {
                        // Continuation line: join it to the line it continues, i.e. the
                        // latest one, not the oldest one at the head of the list.
                        String previous = multiline.pollLast();
                        String folded = curLine.trim();
                        multiline.add(previous == null ? folded : previous + " " + folded);
                    } else {
                        multiline.add(curLine);
                    }
                    message.write(curLine.getBytes(charsetName));
                    message.write(0x0A);
                    curLine = reader.readLine();
                } while (curLine != null && !curLine.startsWith(MBOX_RECORD_DIVIDER) && message.size() < MAIL_MAX_SIZE);
                for (String item : multiline) {
                    saveHeaderInMetadata(mailMetadata, item);
                }
                ByteArrayInputStream messageStream = new ByteArrayInputStream(message.toByteArray());
                // Drop the reference so the buffer can be reclaimed while the message is parsed.
                message = null;
                if (extractor.shouldParseEmbedded(mailMetadata)) {
                    extractor.parseEmbedded(messageStream, xhtml, mailMetadata, true);
                }
                if (tracking) {
                    getTrackingMetadata().put(mailItem++, mailMetadata);
                }
            } else {
                curLine = reader.readLine();
            }
            // Stop between lines/messages once the thread has been interrupted.
            if (Thread.currentThread().isInterrupted()) {
                break;
            }
        }
    }
    xhtml.endDocument();
}
Also used : EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) InputStreamReader(java.io.InputStreamReader) ByteArrayInputStream(java.io.ByteArrayInputStream) BufferedReader(java.io.BufferedReader) Metadata(org.apache.tika.metadata.Metadata) ByteArrayOutputStream(java.io.ByteArrayOutputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) LinkedList(java.util.LinkedList)

Example 32 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class OutlookPSTParser.

/**
 * Opens the incoming stream as a PST file and walks its root folder, emitting XHTML
 * through the given handler.
 *
 * <p>Records the content type, the length of the underlying file and an
 * {@code isValid} flag in {@code metadata}. The folder tree is only traversed when the
 * file descriptor reports itself as valid. Any exception raised while the PST file is
 * being processed is rethrown wrapped in a {@link TikaException}.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Extractor that the folder walk uses for the documents it encounters
    EmbeddedDocumentExtractor attachmentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    metadata.set(Metadata.CONTENT_TYPE, MS_OUTLOOK_PST_MIMETYPE.toString());

    XHTMLContentHandler body = new XHTMLContentHandler(handler, metadata);
    body.startDocument();

    TikaInputStream tis = TikaInputStream.get(stream);
    PSTFile pst = null;
    try {
        // PSTFile is opened by path, so the stream is accessed through its backing file.
        String pstPath = tis.getFile().getPath();
        pst = new PSTFile(pstPath);

        metadata.set(Metadata.CONTENT_LENGTH, valueOf(pst.getFileHandle().length()));

        boolean usable = pst.getFileHandle().getFD().valid();
        metadata.set("isValid", valueOf(usable));
        if (usable) {
            parseFolder(body, pst.getRootFolder(), attachmentExtractor);
        }
    } catch (Exception cause) {
        throw new TikaException(cause.getMessage(), cause);
    } finally {
        if (pst != null && pst.getFileHandle() != null) {
            try {
                pst.getFileHandle().close();
            } catch (IOException ignored) {
                // best-effort close: a failure here must not mask the parse outcome
            }
        }
    }
    body.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) PSTFile(com.pff.PSTFile) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) PSTException(com.pff.PSTException) SAXException(org.xml.sax.SAXException)

Example 33 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class EMFParser.

/**
 * Extracts text and embedded content from an EMF stream.
 *
 * <p>Text records are accumulated into a line buffer; a new {@code <p>} element is
 * written whenever the Y coordinate changes, and a space is inserted when the X
 * coordinate jumps by more than a fixed threshold. Public comment records carrying
 * multi-format or WMF payloads are forwarded to {@code handleMultiFormats} /
 * {@code handleWMF}. Runtime failures from the record reader are rethrown as
 * {@link TikaException}.
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Looked up on first use only, i.e. when an embedded payload actually shows up.
    EmbeddedDocumentExtractor embedded = null;
    XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();
    try {
        HemfExtractor records = new HemfExtractor(stream);
        long prevY = -1;
        long prevX = -1;
        //derive this from the font or frame/bounds information
        long maxGapX = 1000;
        StringBuilder line = new StringBuilder();
        for (HemfRecord rec : records) {
            if (rec.getRecordType() == HemfRecordType.comment) {
                AbstractHemfComment comment = ((HemfCommentRecord) rec).getComment();
                boolean isMultiFormats = comment instanceof HemfCommentPublic.MultiFormats;
                boolean isWmf = !isMultiFormats && comment instanceof HemfCommentPublic.WindowsMetafile;
                if (isMultiFormats || isWmf) {
                    if (embedded == null) {
                        embedded = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                    }
                    if (isMultiFormats) {
                        handleMultiFormats((HemfCommentPublic.MultiFormats) comment, out, embedded);
                    } else {
                        handleWMF((HemfCommentPublic.WindowsMetafile) comment, out, embedded);
                    }
                }
                continue;
            }
            if (!rec.getRecordType().equals(HemfRecordType.exttextoutw)) {
                continue;
            }
            HemfText.ExtTextOutW text = (HemfText.ExtTextOutW) rec;
            // A different Y means a new line: flush what has been gathered so far.
            if (prevY > -1 && prevY != text.getY()) {
                out.startElement("p");
                out.characters(line.toString());
                out.endElement("p");
                line.setLength(0);
                prevX = -1;
            }
            // A large horizontal jump on the same line is treated as a word gap.
            if (prevX > -1 && text.getX() - prevX > maxGapX) {
                line.append(" ");
            }
            line.append(text.getText());
            prevY = text.getY();
            prevX = text.getX();
        }
        // Emit whatever is left over from the final line.
        if (line.length() > 0) {
            out.startElement("p");
            out.characters(line.toString());
            out.endElement("p");
        }
    } catch (RecordFormatException rfe) {
        //POI's hemfparser can throw these for "parse exceptions"
        throw new TikaException(rfe.getMessage(), rfe);
    } catch (RuntimeException rte) {
        //treat any other runtime failure as a parse failure too
        throw new TikaException(rte.getMessage(), rte);
    }
    out.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) HemfRecord(org.apache.poi.hemf.record.HemfRecord) HemfCommentRecord(org.apache.poi.hemf.record.HemfCommentRecord) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) HemfText(org.apache.poi.hemf.record.HemfText) RecordFormatException(org.apache.poi.util.RecordFormatException) AbstractHemfComment(org.apache.poi.hemf.record.AbstractHemfComment) HemfCommentPublic(org.apache.poi.hemf.record.HemfCommentPublic) HemfExtractor(org.apache.poi.hemf.extractor.HemfExtractor)

Example 34 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The getXHTML method of the class AbstractOOXMLExtractor.

/**
 * Writes the document as XHTML: the main content first, then the embedded parts,
 * then the thumbnail, all between a single start/end document pair.
 *
 * @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(ContentHandler, Metadata, ParseContext)
 */
public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context) throws SAXException, XmlException, IOException, TikaException {
    XHTMLContentHandler document = new XHTMLContentHandler(handler, metadata);
    document.startDocument();

    // Main body of the document
    buildXHTML(document);

    // Embedded parts and the thumbnail are given the caller's handler, not the XHTML wrapper
    handleEmbeddedParts(handler, metadata);
    handleThumbnail(handler);

    document.endDocument();
}
Also used : XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler)

Example 35 with XHTMLContentHandler

use of org.apache.tika.sax.XHTMLContentHandler in project tika by apache.

The parse method of the class WMFParser.

/**
 * Extracts the text of a WMF picture, writing one {@code <p>} element per text record.
 *
 * <p>Text is decoded with the charset of the most recent {@code createFontIndirect}
 * record that declares one, falling back to CP-1252. Runtime exceptions and assertion
 * errors raised while reading the records are rethrown as {@link TikaException}.
 *
 * @param stream   the WMF content
 * @param handler  receives the XHTML events
 * @param metadata passed through to the XHTML handler
 * @param context  not used by this method
 * @throws IOException   if the stream cannot be read
 * @throws SAXException  if the handler rejects an event
 * @throws TikaException if the WMF records cannot be parsed
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        HwmfPicture picture = new HwmfPicture(stream);
        // Declared outside the loop on purpose: a font record and a text record are
        // different records, so the charset picked up from a font record has to
        // survive into later iterations to have any effect on the text.
        Charset charset = LocaleUtil.CHARSET_1252;
        for (HwmfRecord record : picture.getRecords()) {
            //This fix should be done within POI
            if (record.getRecordType().equals(HwmfRecordType.createFontIndirect)) {
                HwmfFont font = ((HwmfText.WmfCreateFontIndirect) record).getFont();
                charset = (font.getCharSet() == null || font.getCharSet().getCharset() == null) ? LocaleUtil.CHARSET_1252 : font.getCharSet().getCharset();
            }
            if (record.getRecordType().equals(HwmfRecordType.extTextOut)) {
                HwmfText.WmfExtTextOut textOut = (HwmfText.WmfExtTextOut) record;
                xhtml.startElement("p");
                xhtml.characters(textOut.getText(charset));
                xhtml.endElement("p");
            } else if (record.getRecordType().equals(HwmfRecordType.textOut)) {
                HwmfText.WmfTextOut textOut = (HwmfText.WmfTextOut) record;
                xhtml.startElement("p");
                xhtml.characters(textOut.getText(charset));
                xhtml.endElement("p");
            }
        }
    } catch (RecordFormatException e) {
        //POI's hwmfparser can throw these for "parse exceptions"
        throw new TikaException(e.getMessage(), e);
    } catch (RuntimeException e) {
        //treat any other runtime failure as a parse failure too
        throw new TikaException(e.getMessage(), e);
    } catch (AssertionError e) {
        //POI's hwmfparser can throw these for parse exceptions
        throw new TikaException(e.getMessage(), e);
    }
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) HwmfRecord(org.apache.poi.hwmf.record.HwmfRecord) Charset(java.nio.charset.Charset) HwmfText(org.apache.poi.hwmf.record.HwmfText) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) HwmfFont(org.apache.poi.hwmf.record.HwmfFont) HwmfPicture(org.apache.poi.hwmf.usermodel.HwmfPicture) RecordFormatException(org.apache.poi.util.RecordFormatException)

Aggregations

XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)72 TikaException (org.apache.tika.exception.TikaException)26 TikaInputStream (org.apache.tika.io.TikaInputStream)22 TemporaryResources (org.apache.tika.io.TemporaryResources)14 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)13 IOException (java.io.IOException)12 SAXException (org.xml.sax.SAXException)9 File (java.io.File)6 EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor)6 Metadata (org.apache.tika.metadata.Metadata)6 BufferedInputStream (java.io.BufferedInputStream)5 InputStream (java.io.InputStream)5 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 Charset (java.nio.charset.Charset)4 ArrayList (java.util.ArrayList)4 Map (java.util.Map)4 MediaType (org.apache.tika.mime.MediaType)4 OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler)4 InputStreamReader (java.io.InputStreamReader)3