Use of org.apache.poi.poifs.filesystem.DocumentEntry in the Apache POI project.
Class HSLFSlideShowImpl, method readPictures.
/**
 * Find and read in pictures contained in this presentation.
 * This is lazily called as and when we want to touch pictures.
 *
 * <p>The whole "Pictures" stream is loaded into memory and walked record by
 * record. Each record has an 8 byte header (signature, type, data length)
 * followed by the picture data. Every recognised picture is appended to
 * {@code _pictures}; if the stream is absent the list is left empty.
 *
 * @throws IOException if the "Pictures" stream cannot be read
 * @throws CorruptPowerPointFileException if a record declares a negative data length
 */
private void readPictures() throws IOException {
    _pictures = new ArrayList<HSLFPictureData>();

    // if the presentation doesn't contain pictures, the list simply stays empty
    if (!getDirectory().hasEntry("Pictures")) {
        return;
    }

    DocumentEntry entry = (DocumentEntry) getDirectory().getEntry("Pictures");
    DocumentInputStream is = getDirectory().createDocumentInputStream(entry);
    byte[] pictstream;
    try {
        pictstream = IOUtils.toByteArray(is, entry.getSize());
    } finally {
        // release the stream even when the read fails part-way
        is.close();
    }

    HSLFSlideShowEncrypted decryptData = new HSLFSlideShowEncrypted(getDocumentEncryptionAtom());
    try {
        int pos = 0;
        // An empty picture record (length 0) will take up 8 bytes
        while (pos <= (pictstream.length - 8)) {
            int offset = pos;

            decryptData.decryptPicture(pictstream, offset);

            // Image signature
            int signature = LittleEndian.getUShort(pictstream, pos);
            pos += LittleEndianConsts.SHORT_SIZE;
            // Image type + 0xF018
            int type = LittleEndian.getUShort(pictstream, pos);
            pos += LittleEndianConsts.SHORT_SIZE;
            // Image size (excluding the 8 byte header)
            int imgsize = LittleEndian.getInt(pictstream, pos);
            pos += LittleEndianConsts.INT_SIZE;

            // should terminate if the type isn't 0xf007 or 0xf018->0xf117
            if (!((type == 0xf007) || (type >= 0xf018 && type <= 0xf117))) {
                break;
            }

            // The image size must be 0 or greater. 0 is fine: the header has
            // already been consumed, so the loop still advances and cannot get stuck.
            if (imgsize < 0) {
                throw new CorruptPowerPointFileException("The file contains a picture, at position " + _pictures.size() + ", which has a negatively sized data length, so we can't trust any of the picture data");
            }

            // If the type (with the 0xF018 offset removed) is not a known picture type, skip the record
            PictureType pt = PictureType.forNativeID(type - 0xF018);
            if (pt == null) {
                logger.log(POILogger.ERROR, "Problem reading picture: Invalid image type 0, on picture with length " + imgsize + ".\nYou document will probably become corrupted if you save it!");
                logger.log(POILogger.ERROR, "" + pos);
            } else {
                // The stream can end part-way through a picture (BUG-60305).
                // Compare against the remaining byte count instead of computing
                // pos + imgsize, which overflows int for a very large bogus length
                // and would let the record through to the allocation below.
                if (imgsize > pictstream.length - pos) {
                    logger.log(POILogger.WARN, "\"Pictures\" stream may have ended early. In some circumstances, this is not a problem; " + "in others, this could indicate a corrupt file");
                    break;
                }
                // Build the PictureData object from the data
                try {
                    HSLFPictureData pict = HSLFPictureData.create(pt);
                    pict.setSignature(signature);

                    // Copy the data, ready to pass to PictureData
                    byte[] imgdata = new byte[imgsize];
                    System.arraycopy(pictstream, pos, imgdata, 0, imgdata.length);
                    pict.setRawData(imgdata);

                    pict.setOffset(offset);
                    pict.setIndex(_pictures.size());
                    _pictures.add(pict);
                } catch (IllegalArgumentException e) {
                    logger.log(POILogger.ERROR, "Problem reading picture: " + e + "\nYou document will probably become corrupted if you save it!");
                }
            }

            // A skipped record whose length runs past the end of the stream would
            // end the loop on the next check anyway; stopping here keeps pos from
            // wrapping negative when the declared length is huge.
            if (imgsize > pictstream.length - pos) {
                break;
            }
            pos += imgsize;
        }
    } finally {
        decryptData.close();
    }
}
Use of org.apache.poi.poifs.filesystem.DocumentEntry in the Apache POI project.
Class HWPFDocFixture, method setUp.
/**
 * Loads the test document named by {@code _testFile} and populates
 * {@code _mainStream}, {@code _fib} and {@code _tableStream} from it.
 *
 * @throws IOException if the sample file or one of its streams cannot be read
 */
public void setUp() throws IOException {
    POIFSFileSystem filesystem = new POIFSFileSystem(POIDataSamples.getDocumentInstance().openResourceAsStream(_testFile));
    try {
        _mainStream = readWholeEntry(filesystem, "WordDocument");

        // use the fib to determine the name of the table stream.
        _fib = new FileInformationBlock(_mainStream);
        String name = "0Table";
        if (_fib.getFibBase().isFWhichTblStm()) {
            name = "1Table";
        }

        // read in the table stream.
        _tableStream = readWholeEntry(filesystem, name);
        _fib.fillVariableFields(_mainStream, _tableStream);
    } finally {
        // everything needed has been copied into byte arrays by now
        filesystem.close();
    }
}

/**
 * Reads the complete contents of a root-level document entry into a new array.
 *
 * @param filesystem the file system to read from
 * @param name the name of the entry below the root directory
 * @return a buffer of exactly the entry's size, completely filled
 * @throws IOException if the entry is missing or cannot be read
 */
private static byte[] readWholeEntry(POIFSFileSystem filesystem, String name) throws IOException {
    DocumentEntry props = (DocumentEntry) filesystem.getRoot().getEntry(name);
    byte[] data = new byte[props.getSize()];
    // fully qualified so that no additional import is required
    org.apache.poi.poifs.filesystem.DocumentInputStream in = filesystem.createDocumentInputStream(name);
    try {
        // readFully, unlike a bare read(byte[]), does not return after a partial read
        in.readFully(data);
    } finally {
        in.close();
    }
    return data;
}
Use of org.apache.poi.poifs.filesystem.DocumentEntry in the Apache Tika project.
Class AbstractPOIFSExtractor, method handleEmbeddedOfficeDoc.
/**
 * Handle an office document that's embedded at the POIFS level.
 *
 * <p>If the directory holds a "Package" entry it is treated as OOXML: its type
 * is detected and the stream is handed to {@code handleEmbeddedResource}.
 * Otherwise the directory is treated as regular OLE2: its kind is detected,
 * OLE10Native and CompObj payloads are unwrapped where possible, and the
 * result is passed to the embedded document parser if that parser accepts
 * the collected metadata.
 *
 * @param dir          the POIFS directory holding the embedded document
 * @param resourceName name to report for the embedded resource; when
 *                     {@code null} the directory's own name is used
 * @param xhtml        the handler that receives the output for the embedded document
 * @throws IOException   if an I/O error occurs that is not recorded as an
 *                       embedded stream exception by this method
 * @throws SAXException  declared for the downstream handler calls
 * @throws TikaException declared for the downstream parser calls
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    if (dir.hasEntry("Package")) {
        // It's OOXML (has a ZipFile):
        Entry ooxml = dir.getEntry("Package");
        try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
            ZipContainerDetector detector = new ZipContainerDetector();
            MediaType type = null;
            try {
                //if there's a stream error while detecting...
                type = detector.detect(stream, new Metadata());
            } catch (Exception e) {
                // ...record it against the parent document and give up on this entry
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
            handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
            return;
        }
    }

    // It's regular OLE2:
    // What kind of document is it?
    Metadata metadata = new Metadata();
    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
    if (dir.getStorageClsid() != null) {
        metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
    }
    POIFSDocumentType type = POIFSDocumentType.detectType(dir);
    TikaInputStream embedded = null;
    String rName = (resourceName == null) ? dir.getName() : resourceName;
    try {
        if (type == POIFSDocumentType.OLE10_NATIVE) {
            try {
                // Try to un-wrap the OLE10Native record:
                Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
                if (ole.getLabel() != null) {
                    metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                }
                if (ole.getCommand() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
                }
                if (ole.getFileName() != null) {
                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
                }
                byte[] data = ole.getDataBuffer();
                embedded = TikaInputStream.get(data);
            } catch (Ole10NativeException ex) {
                // Not a valid OLE10Native record, skip it
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else if (type == POIFSDocumentType.COMP_OBJ) {
            try {
                //TODO: figure out if the equivalent of OLE 1.0's
                //getCommand() and getFileName() exist for OLE 2.0 to populate
                //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                // Grab the contents and process
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // fall back to the mixed-case spelling of the entry name
                    contentsEntry = (DocumentEntry) dir.getEntry("Contents");
                }
                byte[] contents = new byte[contentsEntry.getSize()];
                // The bytes are copied out in full, so the source stream can be
                // closed as soon as the copy is done.
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    inp.readFully(contents);
                }
                embedded = TikaInputStream.get(contents);

                // Try to work out what it is
                MediaType mediaType = getDetector().detect(embedded, new Metadata());
                String extension = type.getExtension();
                try {
                    MimeType mimeType = getMimeTypes().forName(mediaType.toString());
                    extension = mimeType.getExtension();
                } catch (MimeTypeException mte) {
                    // No details on this type are known
                }

                // Record what we can do about it
                // NOTE(review): getType() on a MediaType presumably yields only the
                // top-level type rather than type/subtype - confirm this is intended.
                metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
            } catch (Exception e) {
                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                return;
            }
        } else {
            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
        }

        // Should we parse it?
        if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
            if (embedded == null) {
                // Make a TikaInputStream that just
                // passes the root directory of the
                // embedded document, and is otherwise
                // empty (byte[0]):
                embedded = TikaInputStream.get(new byte[0]);
                embedded.setOpenContainer(dir);
            }
            embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
        }
    } catch (IOException e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
    } finally {
        // also runs on the early returns above
        if (embedded != null) {
            embedded.close();
        }
    }
}
Use of org.apache.poi.poifs.filesystem.DocumentEntry in the Apache Tika project.
Class SummaryExtractor, method parseSummaryEntryIfExists.
/**
 * Parses the named property set entry below {@code root}, if it exists.
 *
 * <p>A missing entry or a stream that is not a property set is skipped
 * silently. Any other unexpected exception is logged as a warning and
 * otherwise ignored.
 *
 * @param root      the directory to look the entry up in
 * @param entryName the name of the property set entry
 * @throws IOException   declared by the signature; I/O failures raised inside
 *                       the body are caught and handled there
 * @throws TikaException if the property set has an unexpected type or the
 *                       document stream does not support mark
 */
private void parseSummaryEntryIfExists(DirectoryNode root, String entryName) throws IOException, TikaException {
    try {
        DocumentEntry entry = (DocumentEntry) root.getEntry(entryName);
        // Keep the stream open while the property set is in use and close it
        // afterwards; it was previously left unclosed.
        try (DocumentInputStream stream = new DocumentInputStream(entry)) {
            PropertySet properties = new PropertySet(stream);
            if (properties.isSummaryInformation()) {
                parse(new SummaryInformation(properties));
            }
            if (properties.isDocumentSummaryInformation()) {
                parse(new DocumentSummaryInformation(properties));
            }
        }
    } catch (FileNotFoundException e) {
        // entry does not exist, just skip it
    } catch (NoPropertySetStreamException e) {
        // no property stream, just skip it
    } catch (UnexpectedPropertySetTypeException e) {
        throw new TikaException("Unexpected HPSF document", e);
    } catch (MarkUnsupportedException e) {
        throw new TikaException("Invalid DocumentInputStream", e);
    } catch (Exception e) {
        // deliberate best effort: summary metadata is optional
        LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e);
    }
}
Aggregations