Search in sources :

Example 11 with Entry

use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.

the class HWPFDocument method write.

/**
 * Serializes this document into fresh WordDocument, table and data buffers
 * and writes them as streams into the given filesystem.
 * <p>
 * The FileInformationBlock offsets and sizes are cleared first and then
 * re-recorded structure by structure as each one is appended to the table
 * stream, so the order of the statements below determines the layout of
 * the table stream. The FileInformationBlock itself is written last, into
 * space reserved at the start of the main stream.
 * <p>
 * On completion the document's directory is replaced by the root of
 * {@code pfs}, and the {@code _tableStream} and {@code _dataStream} fields
 * are replaced with the newly written data.
 *
 * @param pfs the filesystem that receives the written streams
 * @param copyOtherEntries if {@code true}, the object pool and every entry
 *            of the current directory that is not rewritten by this method
 *            are copied into {@code pfs}; if {@code false} they are not
 *            touched
 * @throws IOException declared for failures of the underlying stream and
 *             filesystem writes
 */
private void write(NPOIFSFileSystem pfs, boolean copyOtherEntries) throws IOException {
    // initialize our streams for writing.
    HWPFFileSystem docSys = new HWPFFileSystem();
    HWPFOutputStream wordDocumentStream = docSys.getStream(STREAM_WORD_DOCUMENT);
    HWPFOutputStream tableStream = docSys.getStream(STREAM_TABLE_1);
    //HWPFOutputStream dataStream = docSys.getStream("Data");
    // running offset into the table stream; re-read from the stream after
    // every structure so the next fc* field points just past the previous one.
    int tableOffset = 0;
    // FileInformationBlock fib = (FileInformationBlock)_fib.clone();
    // clear the offsets and sizes in our FileInformationBlock.
    _fib.clearOffsetsSizes();
    // determine the FileInformationBLock size
    int fibSize = _fib.getSize();
    // round up to a multiple of the block size; note that a size which is
    // already an exact multiple still grows by one full block.
    fibSize += POIFSConstants.SMALLER_BIG_BLOCK_SIZE - (fibSize % POIFSConstants.SMALLER_BIG_BLOCK_SIZE);
    // preserve space for the FileInformationBlock because we will be writing
    // it after we write everything else.
    byte[] placeHolder = new byte[fibSize];
    wordDocumentStream.write(placeHolder);
    int mainOffset = wordDocumentStream.getOffset();
    // write out the StyleSheet.
    _fib.setFcStshf(tableOffset);
    _ss.writeTo(tableStream);
    _fib.setLcbStshf(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();
    // get fcMin and fcMac because we will be writing the actual text with the
    // complex table.
    int fcMin = mainOffset;
    /*
         * clx (encoding of the sprm lists for a complex file and piece table
         * for a any file) Written immediately after the end of the previously
         * recorded structure. This is recorded in all Word documents
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 23 of 210
         */
    // write out the Complex table, includes text.
    _fib.setFcClx(tableOffset);
    _cft.writeTo(wordDocumentStream, tableStream);
    _fib.setLcbClx(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();
    // main-stream offset right after the complex table (and text) was written.
    int fcMac = wordDocumentStream.getOffset();
    /*
         * dop (document properties record) Written immediately after the end of
         * the previously recorded structure. This is recorded in all Word
         * documents
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 23 of 210
         */
    // write out the DocumentProperties.
    _fib.setFcDop(tableOffset);
    _dop.writeTo(tableStream);
    _fib.setLcbDop(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();
    /*
         * plcfBkmkf (table recording beginning CPs of bookmarks) Written
         * immediately after the sttbfBkmk, if the document contains bookmarks.
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 24 of 210
         */
    // NOTE(review): helpers that take _fib (bookmarks, endnotes, fields,
    // footnotes, lists) presumably record their own offsets/sizes in it,
    // since this method sets none for them - confirm in those classes.
    if (_bookmarksTables != null) {
        _bookmarksTables.writePlcfBkmkf(_fib, tableStream);
        tableOffset = tableStream.getOffset();
    }
    /*
         * plcfBkmkl (table recording limit CPs of bookmarks) Written
         * immediately after the plcfBkmkf, if the document contains bookmarks.
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 24 of 210
         */
    if (_bookmarksTables != null) {
        _bookmarksTables.writePlcfBkmkl(_fib, tableStream);
        tableOffset = tableStream.getOffset();
    }
    /*
         * plcfbteChpx (bin table for CHP FKPs) Written immediately after the
         * previously recorded table. This is recorded in all Word documents.
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 24 of 210
         */
    // write out the CHPBinTable.
    _fib.setFcPlcfbteChpx(tableOffset);
    _cbt.writeTo(wordDocumentStream, tableStream, fcMin, _cft.getTextPieceTable());
    _fib.setLcbPlcfbteChpx(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();
    /*
         * plcfbtePapx (bin table for PAP FKPs) Written immediately after the
         * plcfbteChpx. This is recorded in all Word documents.
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 24 of 210
         */
    // write out the PAPBinTable.
    _fib.setFcPlcfbtePapx(tableOffset);
    _pbt.writeTo(wordDocumentStream, tableStream, _cft.getTextPieceTable());
    _fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();
    /*
         * plcfendRef (endnote reference position table) Written immediately
         * after the previously recorded table if the document contains endnotes
         * 
         * plcfendTxt (endnote text position table) Written immediately after
         * the plcfendRef if the document contains endnotes
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 24 of 210
         */
    _endnotesTables.writeRef(_fib, tableStream);
    _endnotesTables.writeTxt(_fib, tableStream);
    tableOffset = tableStream.getOffset();
    // write out the fields tables, if the document has any.
    if (_fieldsTables != null) {
        _fieldsTables.write(_fib, tableStream);
        tableOffset = tableStream.getOffset();
    }
    /*
         * plcffndRef (footnote reference position table) Written immediately
         * after the stsh if the document contains footnotes
         * 
         * plcffndTxt (footnote text position table) Written immediately after
         * the plcffndRef if the document contains footnotes
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 24 of 210
         */
    _footnotesTables.writeRef(_fib, tableStream);
    _footnotesTables.writeTxt(_fib, tableStream);
    tableOffset = tableStream.getOffset();
    /*
         * plcfsed (section table) Written immediately after the previously
         * recorded table. Recorded in all Word documents
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 25 of 210
         */
    // write out the SectionTable.
    _fib.setFcPlcfsed(tableOffset);
    _st.writeTo(wordDocumentStream, tableStream);
    _fib.setLcbPlcfsed(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();
    // write out the list tables
    if (_lt != null) {
        /*
             * plcflst (list formats) Written immediately after the end of the
             * previously recorded, if there are any lists defined in the
             * document. This begins with a short count of LSTF structures
             * followed by those LSTF structures. This is immediately followed
             * by the allocated data hanging off the LSTFs. This data consists
             * of the array of LVLs for each LSTF. (Each LVL consists of an LVLF
             * followed by two grpprls and an XST.)
             * 
             * Microsoft Office Word 97-2007 Binary File Format (.doc)
             * Specification; Page 25 of 210
             */
        _lt.writeListDataTo(_fib, tableStream);
        tableOffset = tableStream.getOffset();
        /*
             * plflfo (more list formats) Written immediately after the end of
             * the plcflst and its accompanying data, if there are any lists
             * defined in the document. This consists first of a PL of LFO
             * records, followed by the allocated data (if any) hanging off the
             * LFOs. The allocated data consists of the array of LFOLVLFs for
             * each LFO (and each LFOLVLF is immediately followed by some LVLs).
             * 
             * Microsoft Office Word 97-2007 Binary File Format (.doc)
             * Specification; Page 26 of 210
             */
        _lt.writeListOverridesTo(_fib, tableStream);
        tableOffset = tableStream.getOffset();
    }
    /*
         * sttbfBkmk (table of bookmark name strings) Written immediately after
         * the previously recorded table, if the document contains bookmarks.
         * 
         * Microsoft Office Word 97-2007 Binary File Format (.doc)
         * Specification; Page 27 of 210
         */
    if (_bookmarksTables != null) {
        _bookmarksTables.writeSttbfBkmk(_fib, tableStream);
        tableOffset = tableStream.getOffset();
    }
    // write out the saved-by table.
    if (_sbt != null) {
        _fib.setFcSttbSavedBy(tableOffset);
        _sbt.writeTo(tableStream);
        _fib.setLcbSttbSavedBy(tableStream.getOffset() - tableOffset);
        tableOffset = tableStream.getOffset();
    }
    // write out the revision mark authors table.
    if (_rmat != null) {
        _fib.setFcSttbfRMark(tableOffset);
        _rmat.writeTo(tableStream);
        _fib.setLcbSttbfRMark(tableStream.getOffset() - tableOffset);
        tableOffset = tableStream.getOffset();
    }
    // write out the FontTable.
    _fib.setFcSttbfffn(tableOffset);
    _ft.writeTo(tableStream);
    _fib.setLcbSttbfffn(tableStream.getOffset() - tableOffset);
    tableOffset = tableStream.getOffset();
    // set some variables in the FileInformationBlock.
    _fib.getFibBase().setFcMin(fcMin);
    _fib.getFibBase().setFcMac(fcMac);
    _fib.setCbMac(wordDocumentStream.getOffset());
    // make sure that the table, doc and data streams use big blocks.
    // (each buffer below is zero-padded up to at least 4096 bytes)
    byte[] mainBuf = wordDocumentStream.toByteArray();
    if (mainBuf.length < 4096) {
        byte[] tempBuf = new byte[4096];
        System.arraycopy(mainBuf, 0, tempBuf, 0, mainBuf.length);
        mainBuf = tempBuf;
    }
    // Table1 stream will be used
    _fib.getFibBase().setFWhichTblStm(true);
    // write out the FileInformationBlock.
    //_fib.serialize(mainBuf, 0);
    _fib.writeTo(mainBuf, tableStream);
    // snapshot the table stream only now, after the FIB write above has
    // also been handed tableStream.
    byte[] tableBuf = tableStream.toByteArray();
    if (tableBuf.length < 4096) {
        byte[] tempBuf = new byte[4096];
        System.arraycopy(tableBuf, 0, tempBuf, 0, tableBuf.length);
        tableBuf = tempBuf;
    }
    // the data stream is not regenerated: the existing bytes are reused,
    // or an empty 4096-byte buffer is written when there are none.
    byte[] dataBuf = _dataStream;
    if (dataBuf == null) {
        dataBuf = new byte[4096];
    }
    if (dataBuf.length < 4096) {
        byte[] tempBuf = new byte[4096];
        System.arraycopy(dataBuf, 0, tempBuf, 0, dataBuf.length);
        dataBuf = tempBuf;
    }
    // Create a new document preserving order of entries / Update existing
    // each flag guards against writing the same stream twice when several
    // directory entries map onto it.
    boolean docWritten = false;
    boolean dataWritten = false;
    boolean objectPoolWritten = false;
    boolean tableWritten = false;
    boolean propertiesWritten = false;
    for (Entry entry : getDirectory()) {
        if (entry.getName().equals(STREAM_WORD_DOCUMENT)) {
            if (!docWritten) {
                write(pfs, mainBuf, STREAM_WORD_DOCUMENT);
                docWritten = true;
            }
        } else if (entry.getName().equals(STREAM_OBJECT_POOL)) {
            if (!objectPoolWritten) {
                if (copyOtherEntries) {
                    _objectPool.writeTo(pfs.getRoot());
                } else {
                // Object pool is already there, no need to change/copy
                }
                objectPoolWritten = true;
            }
        } else if (entry.getName().equals(STREAM_TABLE_0) || entry.getName().equals(STREAM_TABLE_1)) {
            // either table entry is rewritten under the STREAM_TABLE_1 name,
            // matching setFWhichTblStm(true) above.
            if (!tableWritten) {
                write(pfs, tableBuf, STREAM_TABLE_1);
                tableWritten = true;
            }
        } else if (entry.getName().equals(SummaryInformation.DEFAULT_STREAM_NAME) || entry.getName().equals(DocumentSummaryInformation.DEFAULT_STREAM_NAME)) {
            // the first property-set entry encountered triggers a single
            // writeProperties call; the second one is then skipped.
            if (!propertiesWritten) {
                writeProperties(pfs);
                propertiesWritten = true;
            }
        } else if (entry.getName().equals(STREAM_DATA)) {
            if (!dataWritten) {
                write(pfs, dataBuf, STREAM_DATA);
                dataWritten = true;
            }
        } else if (copyOtherEntries) {
            EntryUtils.copyNodeRecursively(entry, pfs.getRoot());
        }
    }
    // write whatever the source directory did not contain an entry for.
    if (!docWritten)
        write(pfs, mainBuf, STREAM_WORD_DOCUMENT);
    if (!tableWritten)
        write(pfs, tableBuf, STREAM_TABLE_1);
    if (!propertiesWritten)
        writeProperties(pfs);
    if (!dataWritten)
        write(pfs, dataBuf, STREAM_DATA);
    if (!objectPoolWritten && copyOtherEntries)
        _objectPool.writeTo(pfs.getRoot());
    /*
         * since we updated all references in FIB and etc, using new arrays to
         * access data
         */
    replaceDirectory(pfs.getRoot());
    // note: the in-memory table stream is the unpadded tableStream content,
    // not the padded tableBuf that was written to pfs.
    this._tableStream = tableStream.toByteArray();
    this._dataStream = dataBuf;
}
Also used : HWPFFileSystem(org.apache.poi.hwpf.model.io.HWPFFileSystem) Entry(org.apache.poi.poifs.filesystem.Entry) DocumentEntry(org.apache.poi.poifs.filesystem.DocumentEntry) HWPFOutputStream(org.apache.poi.hwpf.model.io.HWPFOutputStream)

Example 12 with Entry

use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.

the class HWPFLister method dumpFileSystem.

/**
 * Renders a directory and its entries as a text tree.
 * <p>
 * The output starts with {@code "+ "} and the directory name. Each entry is
 * dumped recursively on a new line, and every line break in that dump is
 * followed by {@code "+---"} so nested levels appear progressively deeper.
 * The result always ends with a line break.
 *
 * @param directory the directory to dump
 * @return the textual tree for {@code directory}
 */
private String dumpFileSystem(DirectoryEntry directory) {
    StringBuilder tree = new StringBuilder("+ ").append(directory.getName());
    Iterator<Entry> children = directory.getEntries();
    while (children.hasNext()) {
        Entry child = children.next();
        // Start the child on its own line, then push every line of its dump
        // one level deeper.
        String childDump = "\n" + dumpFileSystem(child);
        tree.append(childDump.replace("\n", "\n+---"));
    }
    return tree.append("\n").toString();
}
Also used : Entry(org.apache.poi.poifs.filesystem.Entry) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry)

Example 13 with Entry

use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.

the class TestWordExtractor method testBug51686.

/**
 * [RESOLVED FIXED] Bug 51686 - Update to POI 3.8 beta 4 causes
 * ConcurrentModificationException in Tika's OfficeParser
 * <p>
 * Builds a WordExtractor on the filesystem while iterating over the
 * entries of its root directory, and checks that text can still be
 * extracted. The input stream and the filesystem are released even when
 * opening, extraction or the assertion fails.
 */
@Test
public void testBug51686() throws IOException {
    POIFSFileSystem fs;
    InputStream is = docTests.openResourceAsStream("Bug51686.doc");
    try {
        fs = new POIFSFileSystem(is);
    } finally {
        // close the raw stream whether or not the filesystem could be built
        is.close();
    }
    try {
        String text = null;
        // the extractor is deliberately created inside the iteration: that is
        // the scenario reported in the bug.
        for (Entry entry : fs.getRoot()) {
            if ("WordDocument".equals(entry.getName())) {
                WordExtractor ex = new WordExtractor(fs);
                try {
                    text = ex.getText();
                } finally {
                    ex.close();
                }
            }
        }
        assertNotNull(text);
    } finally {
        // previously skipped when extraction or the assertion failed
        fs.close();
    }
}
Also used : Entry(org.apache.poi.poifs.filesystem.Entry) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) OPOIFSFileSystem(org.apache.poi.poifs.filesystem.OPOIFSFileSystem) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Test(org.junit.Test)

Example 14 with Entry

use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.

the class TestEncryptor method listDir.

@SuppressWarnings("unused")
private void listDir(DirectoryNode dn, String ext, String path) throws IOException {
    path += "\\" + dn.getName().replace('', '_');
    System.out.println(ext + ": " + path + " (" + dn.getStorageClsid() + ")");
    Iterator<Entry> iter = dn.getEntries();
    while (iter.hasNext()) {
        Entry ent = iter.next();
        if (ent instanceof DirectoryNode) {
            listDir((DirectoryNode) ent, ext, path);
        } else {
            listEntry((DocumentNode) ent, ext, path);
        }
    }
}
Also used : Entry(org.apache.poi.poifs.filesystem.Entry) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode)

Example 15 with Entry

use of org.apache.poi.poifs.filesystem.Entry in project tika by apache.

the class RTFObjDataParser method handleEmbeddedPOIFS.

/**
 * Extracts the payload of an embedded object that is stored as a POIFS
 * (OLE2) container.
 * <p>
 * The payload is taken, in order of preference, from:
 * <ul>
 * <li>the "Package" entry, if the root directory has one;</li>
 * <li>the unwrapped OLE10Native record, for OLE10_NATIVE containers;</li>
 * <li>the "CONTENTS" (or, failing that, "Contents") entry, for COMP_OBJ
 * containers;</li>
 * <li>otherwise the complete input stream, in which case a generated
 * resource name and the detected content type are stored in
 * {@code metadata}.</li>
 * </ul>
 *
 * @param is stream positioned at the start of the POIFS data
 * @param metadata receives resource name and content type in the fallback case
 * @param unknownFilenameCount counter used to generate fallback file names
 * @return the extracted bytes; can be {@code null}, e.g. when there is no
 *         root directory or the OLE10Native record is invalid
 * @throws IOException if the stream is not actually POIFS, or on read errors
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {
    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
        DirectoryNode root = fs.getRoot();
        if (root == null) {
            return ret;
        }
        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            // close the wrapper (and with it the document stream) once copied;
            // it was previously left open.
            try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
                IOUtils.copy(stream, out);
            }
            ret = out.toByteArray();
        } else {
            //try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ignored) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                // The entry name is looked up with exact case, so probe for the
                // upper-case variant first and fall back to "Contents"; a missing
                // "Contents" still raises FileNotFoundException as before.
                String contentsName = root.hasEntry("CONTENTS") ? "CONTENTS" : "Contents";
                DocumentEntry contentsEntry = (DocumentEntry) root.getEntry(contentsName);
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                // NOTE(review): relies on the caller passing a stream that
                // supports reset() back to the start of the data - confirm.
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}
Also used : NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Entry(org.apache.poi.poifs.filesystem.Entry) DocumentEntry(org.apache.poi.poifs.filesystem.DocumentEntry) Ole10NativeException(org.apache.poi.poifs.filesystem.Ole10NativeException) Ole10Native(org.apache.poi.poifs.filesystem.Ole10Native) DocumentEntry(org.apache.poi.poifs.filesystem.DocumentEntry) FileNotFoundException(java.io.FileNotFoundException) TikaInputStream(org.apache.tika.io.TikaInputStream) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) POIFSDocumentType(org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DocumentInputStream(org.apache.poi.poifs.filesystem.DocumentInputStream)

Aggregations

Entry (org.apache.poi.poifs.filesystem.Entry): 24, DirectoryEntry (org.apache.poi.poifs.filesystem.DirectoryEntry): 12, IOException (java.io.IOException): 9, DirectoryNode (org.apache.poi.poifs.filesystem.DirectoryNode): 9, FileNotFoundException (java.io.FileNotFoundException): 6, InputStream (java.io.InputStream): 6, DocumentEntry (org.apache.poi.poifs.filesystem.DocumentEntry): 6, DocumentInputStream (org.apache.poi.poifs.filesystem.DocumentInputStream): 6, DocumentNode (org.apache.poi.poifs.filesystem.DocumentNode): 4, POIFSFileSystem (org.apache.poi.poifs.filesystem.POIFSFileSystem): 4, ArrayList (java.util.ArrayList): 3, AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks): 3, HWPFDocument (org.apache.poi.hwpf.HWPFDocument): 3, OldWordFileFormatException (org.apache.poi.hwpf.OldWordFileFormatException): 3, BufferedInputStream (java.io.BufferedInputStream): 2, ByteArrayInputStream (java.io.ByteArrayInputStream): 2, FileInputStream (java.io.FileInputStream): 2, POITextExtractor (org.apache.poi.POITextExtractor): 2, HSLFSlideShow (org.apache.poi.hslf.usermodel.HSLFSlideShow): 2, MAPIMessage (org.apache.poi.hsmf.MAPIMessage): 2