Search in sources :

Example 1 with AbstractHemfComment

Use of org.apache.poi.hemf.record.AbstractHemfComment in the Apache POI project.

Class HemfExtractorTest, method testBasicMac.

/**
 * Checks that the multi-format comment data embedded in the sample file
 * "SimpleEMF_mac.emf" is extracted as a complete PDF byte array, and that
 * the number of iterated records matches the count stored in the header.
 *
 * @throws Exception if the sample cannot be opened or parsed
 */
@Test
public void testBasicMac() throws Exception {
    InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("SimpleEMF_mac.emf");
    // try/finally guarantees the sample stream is released even when an
    // assertion fails or the extractor throws part-way through the file.
    try {
        HemfExtractor ex = new HemfExtractor(is);
        HemfHeader header = ex.getHeader();
        int records = 0;
        boolean extractedData = false;
        for (HemfRecord record : ex) {
            if (record.getRecordType() == HemfRecordType.comment) {
                AbstractHemfComment comment = ((HemfCommentRecord) record).getComment();
                if (comment instanceof HemfCommentPublic.MultiFormats) {
                    for (HemfCommentPublic.HemfMultiFormatsData d : ((HemfCommentPublic.MultiFormats) comment).getData()) {
                        byte[] data = d.getData();
                        // the payload must begin with the "%PDF" signature at offset 0
                        assertEquals('%', data[0]);
                        assertEquals('P', data[1]);
                        assertEquals('D', data[2]);
                        assertEquals('F', data[3]);
                        // ...and the byte array must end exactly at "EOF\n"
                        assertEquals('E', data[data.length - 4]);
                        assertEquals('O', data[data.length - 3]);
                        assertEquals('F', data[data.length - 2]);
                        assertEquals('\n', data[data.length - 1]);
                        extractedData = true;
                    }
                }
            }
            records++;
        }
        // at least one multi-format payload must have been seen
        assertTrue(extractedData);
        // NOTE(review): the "- 1" presumably accounts for the header record not
        // being returned by the iterator -- confirm against HemfExtractor.
        assertEquals(header.getRecords() - 1, records);
    } finally {
        is.close();
    }
}
Also used : InputStream(java.io.InputStream) AbstractHemfComment(org.apache.poi.hemf.record.AbstractHemfComment) HemfCommentPublic(org.apache.poi.hemf.record.HemfCommentPublic) HemfRecord(org.apache.poi.hemf.record.HemfRecord) HemfCommentRecord(org.apache.poi.hemf.record.HemfCommentRecord) HemfHeader(org.apache.poi.hemf.record.HemfHeader) Test(org.junit.Test)

Example 2 with AbstractHemfComment

Use of org.apache.poi.hemf.record.AbstractHemfComment in the Apache Tika project.

Class EMFParser, method parse.

/**
 * Iterates over the records of an EMF stream and writes the result as XHTML.
 * <p>
 * Comment records carrying multi-format or Windows-metafile data are passed,
 * together with the embedded document extractor, to {@code handleMultiFormats}
 * or {@code handleWMF}. Text from {@code exttextoutw} records is accumulated
 * into a line buffer: a change in Y flushes the buffer as a {@code <p>}
 * element, and a horizontal jump larger than a fixed threshold inserts a
 * single space.
 *
 * @param stream   the EMF input
 * @param handler  receiver of the generated XHTML events
 * @param metadata metadata passed to the XHTML handler
 * @param context  parse context used to look up the embedded document extractor
 * @throws IOException   declared by the parser contract
 * @throws SAXException  if the XHTML handler rejects an event
 * @throws TikaException if reading the EMF records fails with an unchecked exception
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    // created lazily, the first time a comment record actually needs it
    EmbeddedDocumentExtractor embeddedDocumentExtractor = null;
    xhtml.startDocument();
    try {
        HemfExtractor emf = new HemfExtractor(stream);
        // -1 means "no text record seen yet" (for X: none on the current line)
        long prevY = -1;
        long prevX = -1;
        // hard-coded for now; should be derived from the font or frame/bounds information
        long gapThreshold = 1000;
        StringBuilder line = new StringBuilder();
        for (HemfRecord record : emf) {
            if (record.getRecordType() == HemfRecordType.comment) {
                AbstractHemfComment comment = ((HemfCommentRecord) record).getComment();
                boolean isMultiFormats = comment instanceof HemfCommentPublic.MultiFormats;
                boolean isWmf = !isMultiFormats && comment instanceof HemfCommentPublic.WindowsMetafile;
                if (isMultiFormats || isWmf) {
                    if (embeddedDocumentExtractor == null) {
                        embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
                    }
                    if (isMultiFormats) {
                        handleMultiFormats((HemfCommentPublic.MultiFormats) comment, xhtml, embeddedDocumentExtractor);
                    } else {
                        handleWMF((HemfCommentPublic.WindowsMetafile) comment, xhtml, embeddedDocumentExtractor);
                    }
                }
                continue;
            }
            if (!record.getRecordType().equals(HemfRecordType.exttextoutw)) {
                // neither a comment nor a text record: nothing to extract
                continue;
            }
            HemfText.ExtTextOutW textRecord = (HemfText.ExtTextOutW) record;
            boolean startsNewLine = prevY > -1 && prevY != textRecord.getY();
            if (startsNewLine) {
                // Y changed: emit the finished line and start a new one
                xhtml.startElement("p");
                xhtml.characters(line.toString());
                xhtml.endElement("p");
                line.setLength(0);
                prevX = -1;
            }
            boolean wideGap = prevX > -1 && textRecord.getX() - prevX > gapThreshold;
            if (wideGap) {
                line.append(" ");
            }
            line.append(textRecord.getText());
            prevY = textRecord.getY();
            prevX = textRecord.getX();
        }
        // flush whatever is left of the last line
        if (line.length() != 0) {
            xhtml.startElement("p");
            xhtml.characters(line.toString());
            xhtml.endElement("p");
        }
    } catch (RecordFormatException e) {
        // POI's HEMF parser signals malformed input this way
        throw new TikaException(e.getMessage(), e);
    } catch (RuntimeException e) {
        // any other unchecked failure is likewise reported as a TikaException
        throw new TikaException(e.getMessage(), e);
    }
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) HemfRecord(org.apache.poi.hemf.record.HemfRecord) HemfCommentRecord(org.apache.poi.hemf.record.HemfCommentRecord) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) HemfText(org.apache.poi.hemf.record.HemfText) RecordFormatException(org.apache.poi.util.RecordFormatException) AbstractHemfComment(org.apache.poi.hemf.record.AbstractHemfComment) HemfCommentPublic(org.apache.poi.hemf.record.HemfCommentPublic) HemfExtractor(org.apache.poi.hemf.extractor.HemfExtractor)

Aggregations

AbstractHemfComment (org.apache.poi.hemf.record.AbstractHemfComment): 2; HemfCommentPublic (org.apache.poi.hemf.record.HemfCommentPublic): 2; HemfCommentRecord (org.apache.poi.hemf.record.HemfCommentRecord): 2; HemfRecord (org.apache.poi.hemf.record.HemfRecord): 2; InputStream (java.io.InputStream): 1; HemfExtractor (org.apache.poi.hemf.extractor.HemfExtractor): 1; HemfHeader (org.apache.poi.hemf.record.HemfHeader): 1; HemfText (org.apache.poi.hemf.record.HemfText): 1; RecordFormatException (org.apache.poi.util.RecordFormatException): 1; TikaException (org.apache.tika.exception.TikaException): 1; EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor): 1; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 1; Test (org.junit.Test): 1