use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.
the class HWPFDocument method write.
/**
 * Serializes this document into the given file system.
 * <p>
 * The WordDocument and table streams are first built in memory. Each
 * structure is appended to the table stream in a fixed order while its
 * offset (fc) and length (lcb) are recorded in the FileInformationBlock,
 * which is written last into space reserved at the start of the
 * WordDocument stream. The resulting streams are then written into
 * {@code pfs}, following the order of entries in the current directory.
 *
 * @param pfs the file system to write the streams into
 * @param copyOtherEntries if true, the object pool and every entry that is
 *        not one of the streams handled here are also copied into
 *        {@code pfs}; if false, those entries are left untouched
 * @throws IOException if writing any of the streams fails
 */
private void write(NPOIFSFileSystem pfs, boolean copyOtherEntries) throws IOException {
// initialize our streams for writing.
HWPFFileSystem docSys = new HWPFFileSystem();
HWPFOutputStream wordDocumentStream = docSys.getStream(STREAM_WORD_DOCUMENT);
HWPFOutputStream tableStream = docSys.getStream(STREAM_TABLE_1);
//HWPFOutputStream dataStream = docSys.getStream("Data");
// running offset into the table stream; refreshed after every structure written.
int tableOffset = 0;
// FileInformationBlock fib = (FileInformationBlock)_fib.clone();
// clear the offsets and sizes in our FileInformationBlock.
_fib.clearOffsetsSizes();
// determine the FileInformationBlock size
int fibSize = _fib.getSize();
// pad up to a multiple of the block size; note that a size which is already
// an exact multiple still gets one additional full block added.
fibSize += POIFSConstants.SMALLER_BIG_BLOCK_SIZE - (fibSize % POIFSConstants.SMALLER_BIG_BLOCK_SIZE);
// preserve space for the FileInformationBlock because we will be writing
// it after we write everything else.
byte[] placeHolder = new byte[fibSize];
wordDocumentStream.write(placeHolder);
int mainOffset = wordDocumentStream.getOffset();
// write out the StyleSheet.
_fib.setFcStshf(tableOffset);
_ss.writeTo(tableStream);
_fib.setLcbStshf(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
// get fcMin and fcMac because we will be writing the actual text with the
// complex table.
int fcMin = mainOffset;
/*
* clx (encoding of the sprm lists for a complex file and piece table
* for any file) Written immediately after the end of the previously
* recorded structure. This is recorded in all Word documents
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 23 of 210
*/
// write out the Complex table, includes text.
_fib.setFcClx(tableOffset);
_cft.writeTo(wordDocumentStream, tableStream);
_fib.setLcbClx(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
// the WordDocument stream offset right after the complex table was written.
int fcMac = wordDocumentStream.getOffset();
/*
* dop (document properties record) Written immediately after the end of
* the previously recorded structure. This is recorded in all Word
* documents
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 23 of 210
*/
// write out the DocumentProperties.
_fib.setFcDop(tableOffset);
_dop.writeTo(tableStream);
_fib.setLcbDop(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
/*
* plcfBkmkf (table recording beginning CPs of bookmarks) Written
* immediately after the sttbfBkmk, if the document contains bookmarks.
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 24 of 210
*/
if (_bookmarksTables != null) {
_bookmarksTables.writePlcfBkmkf(_fib, tableStream);
tableOffset = tableStream.getOffset();
}
/*
* plcfBkmkl (table recording limit CPs of bookmarks) Written
* immediately after the plcfBkmkf, if the document contains bookmarks.
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 24 of 210
*/
if (_bookmarksTables != null) {
_bookmarksTables.writePlcfBkmkl(_fib, tableStream);
tableOffset = tableStream.getOffset();
}
/*
* plcfbteChpx (bin table for CHP FKPs) Written immediately after the
* previously recorded table. This is recorded in all Word documents.
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 24 of 210
*/
// write out the CHPBinTable.
_fib.setFcPlcfbteChpx(tableOffset);
_cbt.writeTo(wordDocumentStream, tableStream, fcMin, _cft.getTextPieceTable());
_fib.setLcbPlcfbteChpx(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
/*
* plcfbtePapx (bin table for PAP FKPs) Written immediately after the
* plcfbteChpx. This is recorded in all Word documents.
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 24 of 210
*/
// write out the PAPBinTable.
_fib.setFcPlcfbtePapx(tableOffset);
_pbt.writeTo(wordDocumentStream, tableStream, _cft.getTextPieceTable());
_fib.setLcbPlcfbtePapx(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
/*
* plcfendRef (endnote reference position table) Written immediately
* after the previously recorded table if the document contains endnotes
*
* plcfendTxt (endnote text position table) Written immediately after
* the plcfendRef if the document contains endnotes
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 24 of 210
*/
// the notes tables receive _fib themselves, so no fc/lcb setters are called here.
_endnotesTables.writeRef(_fib, tableStream);
_endnotesTables.writeTxt(_fib, tableStream);
tableOffset = tableStream.getOffset();
// write out the fields tables, if present.
if (_fieldsTables != null) {
_fieldsTables.write(_fib, tableStream);
tableOffset = tableStream.getOffset();
}
/*
* plcffndRef (footnote reference position table) Written immediately
* after the stsh if the document contains footnotes
*
* plcffndTxt (footnote text position table) Written immediately after
* the plcffndRef if the document contains footnotes
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 24 of 210
*/
_footnotesTables.writeRef(_fib, tableStream);
_footnotesTables.writeTxt(_fib, tableStream);
tableOffset = tableStream.getOffset();
/*
* plcfsed (section table) Written immediately after the previously
* recorded table. Recorded in all Word documents
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 25 of 210
*/
// write out the SectionTable.
_fib.setFcPlcfsed(tableOffset);
_st.writeTo(wordDocumentStream, tableStream);
_fib.setLcbPlcfsed(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
// write out the list tables
if (_lt != null) {
/*
* plcflst (list formats) Written immediately after the end of the
* previously recorded, if there are any lists defined in the
* document. This begins with a short count of LSTF structures
* followed by those LSTF structures. This is immediately followed
* by the allocated data hanging off the LSTFs. This data consists
* of the array of LVLs for each LSTF. (Each LVL consists of an LVLF
* followed by two grpprls and an XST.)
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 25 of 210
*/
_lt.writeListDataTo(_fib, tableStream);
tableOffset = tableStream.getOffset();
/*
* plflfo (more list formats) Written immediately after the end of
* the plcflst and its accompanying data, if there are any lists
* defined in the document. This consists first of a PL of LFO
* records, followed by the allocated data (if any) hanging off the
* LFOs. The allocated data consists of the array of LFOLVLFs for
* each LFO (and each LFOLVLF is immediately followed by some LVLs).
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 26 of 210
*/
_lt.writeListOverridesTo(_fib, tableStream);
tableOffset = tableStream.getOffset();
}
/*
* sttbfBkmk (table of bookmark name strings) Written immediately after
* the previously recorded table, if the document contains bookmarks.
*
* Microsoft Office Word 97-2007 Binary File Format (.doc)
* Specification; Page 27 of 210
*/
if (_bookmarksTables != null) {
_bookmarksTables.writeSttbfBkmk(_fib, tableStream);
tableOffset = tableStream.getOffset();
}
// write out the saved-by table.
if (_sbt != null) {
_fib.setFcSttbSavedBy(tableOffset);
_sbt.writeTo(tableStream);
_fib.setLcbSttbSavedBy(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
}
// write out the revision mark authors table.
if (_rmat != null) {
_fib.setFcSttbfRMark(tableOffset);
_rmat.writeTo(tableStream);
_fib.setLcbSttbfRMark(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
}
// write out the FontTable.
_fib.setFcSttbfffn(tableOffset);
_ft.writeTo(tableStream);
_fib.setLcbSttbfffn(tableStream.getOffset() - tableOffset);
tableOffset = tableStream.getOffset();
// set some variables in the FileInformationBlock.
_fib.getFibBase().setFcMin(fcMin);
_fib.getFibBase().setFcMac(fcMac);
_fib.setCbMac(wordDocumentStream.getOffset());
// make sure that the table, doc and data streams use big blocks:
// any buffer shorter than 4096 bytes is copied into a zero-padded 4096-byte array.
byte[] mainBuf = wordDocumentStream.toByteArray();
if (mainBuf.length < 4096) {
byte[] tempBuf = new byte[4096];
System.arraycopy(mainBuf, 0, tempBuf, 0, mainBuf.length);
mainBuf = tempBuf;
}
// Table1 stream will be used
_fib.getFibBase().setFWhichTblStm(true);
// write out the FileInformationBlock.
//_fib.serialize(mainBuf, 0);
// this must happen before tableStream is snapshotted below, because
// writeTo is also handed the table stream.
_fib.writeTo(mainBuf, tableStream);
byte[] tableBuf = tableStream.toByteArray();
if (tableBuf.length < 4096) {
byte[] tempBuf = new byte[4096];
System.arraycopy(tableBuf, 0, tempBuf, 0, tableBuf.length);
tableBuf = tempBuf;
}
// the data stream is not regenerated; the existing bytes are reused
// (or an empty 4096-byte buffer when there are none).
byte[] dataBuf = _dataStream;
if (dataBuf == null) {
dataBuf = new byte[4096];
}
if (dataBuf.length < 4096) {
byte[] tempBuf = new byte[4096];
System.arraycopy(dataBuf, 0, tempBuf, 0, dataBuf.length);
dataBuf = tempBuf;
}
// Create a new document preserving order of entries / Update existing
// each flag ensures the corresponding stream is written at most once.
boolean docWritten = false;
boolean dataWritten = false;
boolean objectPoolWritten = false;
boolean tableWritten = false;
boolean propertiesWritten = false;
for (Entry entry : getDirectory()) {
if (entry.getName().equals(STREAM_WORD_DOCUMENT)) {
if (!docWritten) {
write(pfs, mainBuf, STREAM_WORD_DOCUMENT);
docWritten = true;
}
} else if (entry.getName().equals(STREAM_OBJECT_POOL)) {
if (!objectPoolWritten) {
if (copyOtherEntries) {
_objectPool.writeTo(pfs.getRoot());
} else {
// Object pool is already there, no need to change/copy
}
objectPoolWritten = true;
}
} else if (entry.getName().equals(STREAM_TABLE_0) || entry.getName().equals(STREAM_TABLE_1)) {
// whichever table stream the source had, the output is always written as Table1.
if (!tableWritten) {
write(pfs, tableBuf, STREAM_TABLE_1);
tableWritten = true;
}
} else if (entry.getName().equals(SummaryInformation.DEFAULT_STREAM_NAME) || entry.getName().equals(DocumentSummaryInformation.DEFAULT_STREAM_NAME)) {
// both property streams are handled by a single writeProperties call.
if (!propertiesWritten) {
writeProperties(pfs);
propertiesWritten = true;
}
} else if (entry.getName().equals(STREAM_DATA)) {
if (!dataWritten) {
write(pfs, dataBuf, STREAM_DATA);
dataWritten = true;
}
} else if (copyOtherEntries) {
EntryUtils.copyNodeRecursively(entry, pfs.getRoot());
}
}
// write any stream that was not encountered in the existing directory.
if (!docWritten)
write(pfs, mainBuf, STREAM_WORD_DOCUMENT);
if (!tableWritten)
write(pfs, tableBuf, STREAM_TABLE_1);
if (!propertiesWritten)
writeProperties(pfs);
if (!dataWritten)
write(pfs, dataBuf, STREAM_DATA);
if (!objectPoolWritten && copyOtherEntries)
_objectPool.writeTo(pfs.getRoot());
/*
* since we updated all references in FIB and etc, using new arrays to
* access data
*/
replaceDirectory(pfs.getRoot());
// note: this keeps the table stream bytes as written (not the padded tableBuf),
// whereas the data stream field receives the possibly padded dataBuf.
this._tableStream = tableStream.toByteArray();
this._dataStream = dataBuf;
}
use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.
the class HWPFLister method dumpFileSystem.
/**
 * Renders a directory and its entries as an indented text tree.
 * <p>
 * The directory itself is printed as {@code "+ name"}. Each child is dumped
 * on its own line(s), and every line of the child's dump is prefixed with
 * {@code "+---"} so nesting depth is visible. The result always ends with a
 * newline.
 *
 * @param directory the directory to dump
 * @return the textual tree for {@code directory}
 */
private String dumpFileSystem(DirectoryEntry directory) {
    StringBuilder tree = new StringBuilder("+ ").append(directory.getName());

    Iterator<Entry> children = directory.getEntries();
    while (children.hasNext()) {
        // Resolved against the Entry-typed overload (presumably declared
        // alongside this method -- it is not visible here).
        String childDump = dumpFileSystem(children.next());

        // Start the child on a new line, then indent every line it spans.
        // A literal replace is sufficient: no regex metacharacters involved.
        String indented = ("\n" + childDump).replace("\n", "\n+---");
        tree.append(indented);
    }

    return tree.append("\n").toString();
}
use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.
the class TestWordExtractor method testBug51686.
/**
 * [RESOLVED FIXED] Bug 51686 - Update to POI 3.8 beta 4 causes
 * ConcurrentModificationException in Tika's OfficeParser
 * <p>
 * Creates a {@link WordExtractor} while iterating over the root directory
 * of the file system, and checks that text can still be extracted.
 *
 * @throws IOException if the sample document cannot be read
 */
@Test
public void testBug51686() throws IOException {
    POIFSFileSystem fs;
    InputStream is = docTests.openResourceAsStream("Bug51686.doc");
    try {
        fs = new POIFSFileSystem(is);
    } finally {
        // Release the input stream even when the file system cannot be opened.
        is.close();
    }

    try {
        String text = null;
        // The extractor is deliberately created inside the iteration: that
        // is the scenario the bug report describes.
        for (Entry entry : fs.getRoot()) {
            if ("WordDocument".equals(entry.getName())) {
                WordExtractor ex = new WordExtractor(fs);
                try {
                    text = ex.getText();
                } finally {
                    ex.close();
                }
            }
        }
        assertNotNull(text);
    } finally {
        // Close the file system even when extraction or the assertion fails.
        fs.close();
    }
}
use of org.apache.poi.poifs.filesystem.Entry in project poi by apache.
the class TestEncryptor method listDir.
@SuppressWarnings("unused")
private void listDir(DirectoryNode dn, String ext, String path) throws IOException {
path += "\\" + dn.getName().replace('', '_');
System.out.println(ext + ": " + path + " (" + dn.getStorageClsid() + ")");
Iterator<Entry> iter = dn.getEntries();
while (iter.hasNext()) {
Entry ent = iter.next();
if (ent instanceof DirectoryNode) {
listDir((DirectoryNode) ent, ext, path);
} else {
listEntry((DocumentNode) ent, ext, path);
}
}
}
use of org.apache.poi.poifs.filesystem.Entry in project tika by apache.
the class RTFObjDataParser method handleEmbeddedPOIFS.
/**
 * Extracts the bytes of an object embedded as a POIFS (OLE2) container.
 * <p>
 * Handling depends on what the container holds:
 * <ul>
 * <li>a "Package" entry: the entry's bytes are returned;</li>
 * <li>an OLE10Native record: the unwrapped data buffer is returned, or
 *     {@code null} if the record is not valid;</li>
 * <li>a CompObj container: the bytes of its "CONTENTS" (or "Contents")
 *     entry are returned;</li>
 * <li>anything else: {@code is} is reset and copied in full, and a
 *     generated resource name and the detected content type are stored
 *     in {@code metadata}.</li>
 * </ul>
 *
 * @param is stream positioned at the start of the POIFS data; it is reset
 *        in the fallback case, so it has to support {@code reset()}
 * @param metadata receives resource name and content type in the fallback case
 * @param unknownFilenameCount counter used to build names for unnamed objects
 * @return the extracted bytes; may be {@code null}
 * @throws IOException if the data is not actually POIFS, or reading fails
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {
    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
        DirectoryNode root = fs.getRoot();
        if (root == null) {
            return ret;
        }
        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            // Close the wrapper (and the document stream beneath it) once
            // the bytes have been copied; previously it was never closed.
            try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                IOUtils.copy(stream, out);
                ret = out.toByteArray();
            }
        } else {
            //try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case spelling of the entry name.
                    contentsEntry = (DocumentEntry) root.getEntry("Contents");
                }
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {
                // Unknown container type: hand back the raw POIFS bytes.
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}
Aggregations