use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.
the class TestEncryptor method listDir.
@SuppressWarnings("unused")
private void listDir(DirectoryNode dn, String ext, String path) throws IOException {
path += "\\" + dn.getName().replace('', '_');
System.out.println(ext + ": " + path + " (" + dn.getStorageClsid() + ")");
Iterator<Entry> iter = dn.getEntries();
while (iter.hasNext()) {
Entry ent = iter.next();
if (ent instanceof DirectoryNode) {
listDir((DirectoryNode) ent, ext, path);
} else {
listEntry((DocumentNode) ent, ext, path);
}
}
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project tika by apache.
the class QPWTextExtractor method extract.
/**
 * Reads the QuattroPro OLE document from {@code input} and emits its content
 * inside a single {@code p} element on {@code xhtml}.
 *
 * @param input    stream holding the OLE2 container
 * @param xhtml    handler receiving the extracted content
 * @param metadata metadata object handed to the extraction context
 * @throws IOException   if the container or the document stream cannot be read
 * @throws SAXException  if the content handler fails
 * @throws TikaException if the container has no root or lacks the expected
 *                       OLE entry (raised as {@code UnsupportedFormatException})
 */
@SuppressWarnings("resource")
public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException {
    POIFSFileSystem fileSystem = new POIFSFileSystem(input);
    DirectoryNode root = fileSystem.getRoot();

    boolean hasDocument = root != null && root.hasEntry(OLE_DOCUMENT_NAME);
    if (!hasDocument) {
        // Report what was actually present to make the failure diagnosable.
        Object found = (root == null) ? "null" : root.getEntryNames();
        throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME + "\". Found: " + found);
    }

    //TODO shall we validate and throw warning/error if the file does not
    //start with a BOF and ends with a EOF?
    xhtml.startElement("p");
    try (WPInputStream in = new WPInputStream(fileSystem.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
        Context ctx = new Context(in, xhtml, metadata);
        while (hasNext(in)) {
            // Each record starts with two shorts: its type, then its body length.
            ctx.type = in.readWPShort();
            ctx.bodyLength = in.readWPShort();

            Extractor extractor = EXTRACTORS.get(ctx.type);
            if (extractor == null) {
                // Unknown record type. Swap in Extractor.DEBUG here to find
                // out what is being ignored.
                extractor = Extractor.IGNORE;
            }
            extractor.extract(ctx);
        }
    }
    xhtml.endElement("p");
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project tika by apache.
the class RTFObjDataParser method handleEmbeddedPOIFS.
/**
 * Extracts the payload bytes of an embedded object stored as a POIFS (OLE2)
 * container.
 * <p>
 * Will throw IOException if the stream is not actually POIFS. Can return a
 * null byte[] (no root directory, or an OLE10Native record that could not be
 * parsed).
 *
 * @param is                   stream positioned at the start of the container;
 *                             it is {@code reset()} and re-read when the
 *                             detected type is neither OLE10Native nor CompObj
 * @param metadata             receives a generated resource name and the
 *                             content type in that fall-through case
 * @param unknownFilenameCount counter used to build the generated resource name
 * @return the embedded bytes, or {@code null}
 * @throws IOException if the container cannot be opened or read
 */
private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException {
    byte[] ret = null;
    try (NPOIFSFileSystem fs = new NPOIFSFileSystem(is)) {
        DirectoryNode root = fs.getRoot();
        if (root == null) {
            return ret;
        }
        if (root.hasEntry("Package")) {
            Entry ooxml = root.getEntry("Package");
            // Close the entry stream once copied; previously it was left open.
            try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                IOUtils.copy(stream, out);
                ret = out.toByteArray();
            }
        } else {
            //try poifs
            POIFSDocumentType type = POIFSDocumentType.detectType(root);
            if (type == POIFSDocumentType.OLE10_NATIVE) {
                try {
                    // Try to un-wrap the OLE10Native record:
                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(root);
                    ret = ole.getDataBuffer();
                } catch (Ole10NativeException ex) {
                    // Not a valid OLE10Native record, skip it (ret stays null)
                }
            } else if (type == POIFSDocumentType.COMP_OBJ) {
                DocumentEntry contentsEntry;
                try {
                    contentsEntry = (DocumentEntry) root.getEntry("CONTENTS");
                } catch (FileNotFoundException ioe) {
                    // Fall back to the mixed-case entry name; if that is also
                    // missing the FileNotFoundException propagates to the caller.
                    contentsEntry = (DocumentEntry) root.getEntry("Contents");
                }
                try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
                    ret = new byte[contentsEntry.getSize()];
                    inp.readFully(ret);
                }
            } else {
                // Any other type: hand back the whole container unchanged and
                // label it with a generated name and the detected media type.
                ByteArrayOutputStream out = new ByteArrayOutputStream();
                is.reset();
                IOUtils.copy(is, out);
                ret = out.toByteArray();
                metadata.set(Metadata.RESOURCE_NAME_KEY, "file_" + unknownFilenameCount.getAndIncrement() + "." + type.getExtension());
                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
            }
        }
    }
    return ret;
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project tika by apache.
the class POIFSContainerDetector method detect.
/**
 * Detects the media type of an OLE2 container.
 * <p>
 * Returns {@code MediaType.OCTET_STREAM} when {@code input} is null or, if no
 * already-opened container is attached to the stream, when its first eight
 * bytes are not the OLE header. Otherwise the result comes from the name-based
 * {@code detect} overload.
 *
 * @param input    the stream to inspect; it is marked and reset around the
 *                 header check
 * @param metadata not read by this method
 * @return the detected media type
 * @throws IOException if reading or resetting the stream fails
 */
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
    // Check if we have access to the document
    if (input == null) {
        return MediaType.OCTET_STREAM;
    }

    // If this is a TikaInputStream wrapping an already parsed
    // NPOIFileSystem/DirectoryNode, just get the names from the root.
    TikaInputStream tis = TikaInputStream.cast(input);
    Set<String> names = null;
    if (tis != null) {
        Object preOpened = tis.getOpenContainer();
        if (preOpened instanceof NPOIFSFileSystem) {
            names = getTopLevelNames(((NPOIFSFileSystem) preOpened).getRoot());
        } else if (preOpened instanceof DirectoryNode) {
            names = getTopLevelNames((DirectoryNode) preOpened);
        }
    }

    if (names == null) {
        // Check if the document starts with the OLE header. Bytes are read one
        // at a time and reading stops at the first mismatch.
        int[] oleHeader = {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1};
        input.mark(8);
        try {
            for (int expected : oleHeader) {
                if (input.read() != expected) {
                    return MediaType.OCTET_STREAM;
                }
            }
        } finally {
            input.reset();
        }

        // We can only detect the exact type when given a TikaInputStream:
        // look for known top level entry names to detect the document type.
        if (tis != null) {
            names = getTopLevelNames(tis);
        }
    }

    // Detect based on the names (as available). The open container is fetched
    // again here rather than reusing the earlier value.
    Object container = (tis == null) ? null : tis.getOpenContainer();
    if (container instanceof NPOIFSFileSystem) {
        return detect(names, ((NPOIFSFileSystem) container).getRoot());
    }
    return detect(names, null);
}
use of org.apache.poi.poifs.filesystem.DirectoryNode in project tika by apache.
the class AbstractOOXMLExtractor method handleEmbeddedOLE.
/**
 * Handles an embedded OLE object in the document.
 * <p>
 * Parts smaller than three 512-byte blocks are skipped. A part that cannot be
 * opened as a POIFS file system is recorded on {@code parentMetadata} and
 * skipped. Otherwise the part is dispatched as an OLE 2.0 embedded document
 * (CONTENTS/Ole/CompObj entries present), an OLE 1.0 native record, or a
 * generic embedded file.
 *
 * @param part           the package part holding the OLE object
 * @param handler        content handler receiving the embedded output
 * @param rel            relationship id, stored on the embedded metadata
 * @param parentMetadata metadata of the containing document; receives
 *                       recorded embedded-stream exceptions
 * @throws IOException  if closing the file system or the embedded stream fails
 * @throws SAXException if the content handler fails
 */
private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, String rel, Metadata parentMetadata) throws IOException, SAXException {
    // A POIFSFileSystem needs to be at least 3 blocks big to be valid
    if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
        // Too small, skip
        return;
    }
    // Open the POIFS (OLE2) structure and process. Only one stream is opened
    // on the part; an extra, unused stream used to be opened and leaked here.
    POIFSFileSystem fs;
    try {
        fs = new POIFSFileSystem(part.getInputStream());
    } catch (Exception e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
        return;
    }
    TikaInputStream stream = null;
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, rel);
        DirectoryNode root = fs.getRoot();
        POIFSDocumentType type = POIFSDocumentType.detectType(root);
        if (root.hasEntry("CONTENTS") && root.hasEntry("Ole") && root.hasEntry("CompObj")) {
            // TIKA-704: OLE 2.0 embedded non-Office document?
            //TODO: figure out if the equivalent of OLE 1.0's
            //getCommand() and getFileName() exist for OLE 2.0 to populate
            //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
            stream = TikaInputStream.get(fs.createDocumentInputStream("CONTENTS"));
            if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else if (POIFSDocumentType.OLE10_NATIVE == type) {
            // TIKA-704: OLE 1.0 embedded document
            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject(fs);
            if (ole.getLabel() != null) {
                metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
            }
            if (ole.getCommand() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
            }
            if (ole.getFileName() != null) {
                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
            }
            byte[] data = ole.getDataBuffer();
            if (data != null) {
                stream = TikaInputStream.get(data);
            }
            if (stream != null && embeddedExtractor.shouldParseEmbedded(metadata)) {
                embeddedExtractor.parseEmbedded(stream, new EmbeddedContentHandler(handler), metadata, false);
            }
        } else {
            handleEmbeddedFile(part, handler, rel);
        }
    } catch (FileNotFoundException e) {
        // There was no CONTENTS entry, so skip this part
    } catch (Ole10NativeException e) {
        // Could not process an OLE 1.0 entry, so skip this part
    } catch (IOException e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
    } finally {
        // Nested so that a failure closing the file system cannot prevent the
        // embedded stream from being closed as well.
        try {
            fs.close();
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }
}
Aggregations