Search in sources :

Example 21 with DirectoryEntry

use of org.apache.poi.poifs.filesystem.DirectoryEntry in project tika by apache.

the class AbstractPOIFSExtractor method handleEmbeddedOfficeDoc.

/**
 * Handle an office document that's embedded at the POIFS level.
 * <p>
 * If the directory contains a "Package" entry it is treated as OOXML and
 * handed to {@code handleEmbeddedResource} after container detection.
 * Otherwise the directory is treated as regular OLE2: its type is detected
 * via {@code POIFSDocumentType.detectType}, metadata is populated, and, if
 * {@code embeddedDocumentUtil.shouldParseEmbedded} agrees, the embedded
 * document is parsed.
 * <p>
 * Failures while reading or detecting the embedded stream are recorded on
 * {@code parentMetadata} rather than being propagated.
 *
 * @param dir          the POIFS directory holding the embedded document
 * @param resourceName name to use for the embedded resource; when
 *                     {@code null}, {@code dir.getName()} is used instead
 *                     (the OOXML branch always uses {@code dir.getName()})
 * @param xhtml        handler that receives the embedded document's output
 * @throws IOException   declared; may come from the OOXML branch or from closing streams
 * @throws SAXException  declared; raised by the downstream handlers
 * @throws TikaException declared; raised by the downstream handlers
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");
        try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = null;
            try {
                //if there's a stream error while detecting...
                type = detector.detect(stream, new Metadata());
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
            handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
            return;
        }
    }
    // It's regular OLE2:
    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    if (dir.getStorageClsid() != null) {
        metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
    }
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;
    String rName = (resourceName == null) ? dir.getName() : resourceName;
    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                }
                if (ole.getCommand() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
                }
                if (ole.getFileName() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
                }
                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record; leave 'embedded' unset so the
                // directory itself is passed on below if parsing is requested
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                //TODO: figure out if the equivalent of OLE 1.0's
                //getCommand() and getFileName() exist for OLE 2.0 to populate
                //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }
                byte[] contents;
                // Close the POIFS stream once the bytes have been copied out
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    contents = new byte[contentsEntry.getSize()];
                    inp.readFully(contents);
                }
                embedded = TikaInputStream.get(contents);
                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }
                // Record what we can do about it. Use the full "type/subtype"
                // string: MediaType.getType() is only the top-level part.
                metadata.set(Metadata.CONTENT_TYPE, mediaType.toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
        }
        // Should we parse it?
        if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } catch (IOException e) {
        // Record on the parent's metadata, like every other error path in this method
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
    } finally {
        if (embedded != null) {
            embedded.close();
        }
    }
}
Also used : ZipContainerDetector(org.apache.tika.parser.pkg.ZipContainerDetector) Ole10Native(org.apache.poi.poifs.filesystem.Ole10Native) Metadata(org.apache.tika.metadata.Metadata) FileNotFoundException(java.io.FileNotFoundException) TikaInputStream(org.apache.tika.io.TikaInputStream) POIFSDocumentType(org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType) IOException(java.io.IOException) DocumentInputStream(org.apache.poi.poifs.filesystem.DocumentInputStream) Ole10NativeException(org.apache.poi.poifs.filesystem.Ole10NativeException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) SAXException(org.xml.sax.SAXException) MimeTypeException(org.apache.tika.mime.MimeTypeException) MimeType(org.apache.tika.mime.MimeType) Entry(org.apache.poi.poifs.filesystem.Entry) DocumentEntry(org.apache.poi.poifs.filesystem.DocumentEntry) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) Ole10NativeException(org.apache.poi.poifs.filesystem.Ole10NativeException) MimeTypeException(org.apache.tika.mime.MimeTypeException) DocumentEntry(org.apache.poi.poifs.filesystem.DocumentEntry) MediaType(org.apache.tika.mime.MediaType)

Example 22 with DirectoryEntry

use of org.apache.poi.poifs.filesystem.DirectoryEntry in project tika by apache.

the class WordExtractor method parse.

/**
 * Extracts the content of a Word document rooted at {@code root} into {@code xhtml}.
 * <p>
 * Output order: headers, main paragraphs, text boxes (only when
 * {@code officeParserConfig.getIncludeShapeBasedContent()} is true), footnotes,
 * comments, endnotes, footers, any pictures not yet emitted, and finally the
 * embedded documents found under the "ObjectPool" entry.
 * <p>
 * If {@code HWPFDocument} rejects the file as an old Word format, parsing is
 * delegated to {@code parseWord6}. POI's encrypted-document exception is
 * rethrown wrapped in the {@code EncryptedDocumentException} that is in scope here.
 *
 * @param root  root directory of the document
 * @param xhtml handler receiving the extracted content
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    HWPFDocument doc;
    try {
        doc = new HWPFDocument(root);
    } catch (org.apache.poi.EncryptedDocumentException encrypted) {
        throw new EncryptedDocumentException(encrypted);
    } catch (OldWordFileFormatException oldFormat) {
        // Pre-Word-97 file: hand over to the dedicated Word 6 path
        parseWord6(root, xhtml);
        return;
    }
    extractSavedByMetadata(doc);

    org.apache.poi.hwpf.extractor.WordExtractor poiExtractor =
            new org.apache.poi.hwpf.extractor.WordExtractor(doc);
    HeaderStories stories = new HeaderStories(doc);

    // Pictures are expected to be in document order; they may be placed
    // directly or referenced from an anchor
    PicturesTable pictureTable = doc.getPicturesTable();
    PicturesSource pictures = new PicturesSource(doc);

    // Headers first (first / even / odd)
    Range[] headerRanges = new Range[] {
        stories.getFirstHeaderSubrange(),
        stories.getEvenHeaderSubrange(),
        stories.getOddHeaderSubrange()
    };
    handleHeaderFooter(headerRanges, "header", doc, pictures, pictureTable, xhtml);

    // Main body text. handleParagraph returns how many additional
    // paragraphs it consumed, so skip past those as well.
    Range body = doc.getRange();
    ListManager lists = new ListManager(doc);
    int index = 0;
    while (index < body.numParagraphs()) {
        Paragraph current = body.getParagraph(index);
        int consumed = handleParagraph(current, 0, body, doc, FieldsDocumentPart.MAIN, pictures, pictureTable, lists, xhtml);
        index += consumed + 1;
    }

    // Text boxes are shape-based content and therefore optional
    if (officeParserConfig.getIncludeShapeBasedContent()) {
        for (String text : poiExtractor.getMainTextboxText()) {
            xhtml.element("p", text);
        }
    }
    for (String text : poiExtractor.getFootnoteText()) {
        xhtml.element("p", text);
    }
    for (String text : poiExtractor.getCommentsText()) {
        xhtml.element("p", text);
    }
    for (String text : poiExtractor.getEndnoteText()) {
        xhtml.element("p", text);
    }

    // Footers (first / even / odd)
    Range[] footerRanges = new Range[] {
        stories.getFirstFooterSubrange(),
        stories.getEvenFooterSubrange(),
        stories.getOddFooterSubrange()
    };
    handleHeaderFooter(footerRanges, "footer", doc, pictures, pictureTable, xhtml);

    // Emit whatever pictures have not been claimed so far
    Picture unclaimed = pictures.nextUnclaimed();
    while (unclaimed != null) {
        handlePictureCharacterRun(null, unclaimed, pictures, xhtml);
        unclaimed = pictures.nextUnclaimed();
    }

    // Embedded office documents: "_"-prefixed directories under "ObjectPool"
    try {
        DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
        for (Entry child : objectPool) {
            if (!child.getName().startsWith("_") || !(child instanceof DirectoryEntry)) {
                continue;
            }
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        }
    } catch (FileNotFoundException ignored) {
        // Deliberately ignored: a missing entry just means nothing (more) to embed
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) FileNotFoundException(java.io.FileNotFoundException) PicturesTable(org.apache.poi.hwpf.model.PicturesTable) Range(org.apache.poi.hwpf.usermodel.Range) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) Paragraph(org.apache.poi.hwpf.usermodel.Paragraph) HWPFDocument(org.apache.poi.hwpf.HWPFDocument) HeaderStories(org.apache.poi.hwpf.usermodel.HeaderStories) Entry(org.apache.poi.poifs.filesystem.Entry) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) SavedByEntry(org.apache.poi.hwpf.model.SavedByEntry) Picture(org.apache.poi.hwpf.usermodel.Picture) OldWordFileFormatException(org.apache.poi.hwpf.OldWordFileFormatException)

Example 23 with DirectoryEntry

use of org.apache.poi.poifs.filesystem.DirectoryEntry in project tika by apache.

the class ExcelExtractor method parse.

/**
 * Extracts the content of an Excel document rooted at {@code root} into {@code xhtml}.
 * <p>
 * When the {@code WORKBOOK_ENTRY} entry is absent, a {@code BOOK_ENTRY} entry
 * is handled via {@code OldExcelExtractor}/{@code OldExcelParser}; if neither
 * is present no text is extracted. Otherwise the workbook is processed through
 * a {@code TikaHSSFListener}, and afterwards every directory entry whose name
 * starts with "MBD" is handled as an embedded office document.
 *
 * @param root   root directory of the document
 * @param xhtml  handler receiving the extracted content
 * @param locale locale passed to the listener
 * @throws IOException   declared; raised by the underlying processing
 * @throws SAXException  declared; raised by the underlying processing
 * @throws TikaException declared; note that a {@code TikaException} from an
 *                       embedded document is swallowed, not propagated
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException {
    if (!root.hasEntry(WORKBOOK_ENTRY)) {
        if (root.hasEntry(BOOK_ENTRY)) {
            // Excel 5 / Excel 95 file
            // Records are in a different structure so needs a
            //  different parser to process them
            OldExcelExtractor extractor = new OldExcelExtractor(root);
            OldExcelParser.parse(extractor, xhtml);
        }
        // Otherwise: corrupt file / very old file, just skip text extraction
        return;
    }
    TikaHSSFListener listener;
    try {
        // If a password was supplied, use it, otherwise the default
        Biff8EncryptionKey.setCurrentUserPassword(getPassword());
        // Have the file processed in event mode
        listener = new TikaHSSFListener(xhtml, locale, this);
        listener.processFile(root, isListenForAllRecords());
    } finally {
        // The password is held per thread; clear it so it cannot leak into
        // later, unrelated parses running on the same (pooled) thread
        Biff8EncryptionKey.setCurrentUserPassword(null);
    }
    listener.throwStoredException();
    for (Entry entry : root) {
        if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
            try {
                handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
            } catch (TikaException e) {
                // ignore parse errors from embedded documents
            }
        }
    }
}
Also used : OldExcelExtractor(org.apache.poi.hssf.extractor.OldExcelExtractor) Entry(org.apache.poi.poifs.filesystem.Entry) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) TikaException(org.apache.tika.exception.TikaException) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry)

Aggregations

DirectoryEntry (org.apache.poi.poifs.filesystem.DirectoryEntry)23 Entry (org.apache.poi.poifs.filesystem.Entry)12 IOException (java.io.IOException)9 FileNotFoundException (java.io.FileNotFoundException)7 ByteArrayInputStream (java.io.ByteArrayInputStream)5 File (java.io.File)5 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)5 InputStream (java.io.InputStream)4 DocumentSummaryInformation (org.apache.poi.hpsf.DocumentSummaryInformation)4 DirectoryNode (org.apache.poi.poifs.filesystem.DirectoryNode)4 ByteArrayOutputStream (java.io.ByteArrayOutputStream)3 FileInputStream (java.io.FileInputStream)3 POIFSFileSystem (org.apache.poi.poifs.filesystem.POIFSFileSystem)3 Test (org.junit.Test)3 FileOutputStream (java.io.FileOutputStream)2 OutputStream (java.io.OutputStream)2 ArrayList (java.util.ArrayList)2 POITextExtractor (org.apache.poi.POITextExtractor)2 SummaryInformation (org.apache.poi.hpsf.SummaryInformation)2 MAPIMessage (org.apache.poi.hsmf.MAPIMessage)2