use of org.apache.poi.poifs.filesystem.DirectoryEntry in project tika by apache.
the class AbstractPOIFSExtractor method handleEmbeddedOfficeDoc.
* Handle an office document that's embedded at the POIFS level
/**
 * Handles an office document that is embedded at the POIFS level.
 *
 * <p>If {@code dir} contains a "Package" entry, that entry is wrapped in a
 * {@link TikaInputStream}, its media type is detected with a
 * {@code ZipContainerDetector}, and it is handed to
 * {@code handleEmbeddedResource}. Otherwise the directory is treated as a
 * regular OLE2 document: metadata is populated from the directory name and
 * storage class id, the {@code POIFSDocumentType} is detected, and the
 * content is passed to {@code embeddedDocumentUtil.parseEmbedded} when
 * {@code shouldParseEmbedded} allows it.
 *
 * <p>NOTE(review): this listing shows no closing braces and may be missing
 * other statements (for example early returns); the notes below flag the
 * places where the visible code alone looks incomplete. Confirm against the
 * full source before relying on them.
 *
 * @param dir          directory entry of the embedded document
 * @param resourceName base name used for the resource-name metadata; when
 *                     {@code null}, {@code dir.getName()} is used instead
 * @param xhtml        handler passed on to the embedded-resource/parse calls
 * @throws IOException   declared; IOExceptions raised in the OLE2 branch are
 *                       recorded on {@code metadata} by the catch below
 * @throws SAXException  declared by the signature
 * @throws TikaException declared by the signature
 */
protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
if (dir.hasEntry("Package")) {
// It's OOXML (has a ZipFile):
Entry ooxml = dir.getEntry("Package");
try (TikaInputStream stream = TikaInputStream.get(new DocumentInputStream((DocumentEntry) ooxml))) {
ZipContainerDetector detector = new ZipContainerDetector();
MediaType type = null;
try {
//if there's a stream error while detecting...
type = detector.detect(stream, new Metadata());
} catch (Exception e) {
// Detection failures are recorded on the parent document's metadata.
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
// NOTE(review): if detect() threw, type is still null at this point and
// type.toString() would fail -- confirm control leaves the method before
// this call in the full source.
handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
// It's regular OLE2:
// What kind of document is it?
Metadata metadata = new Metadata();
metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
if (dir.getStorageClsid() != null) {
metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded = null;
// Fall back to the directory's own name when no resource name was supplied.
String rName = (resourceName == null) ? dir.getName() : resourceName;
try {
if (type == POIFSDocumentType.OLE10_NATIVE) {
try {
// Try to un-wrap the OLE10Native record:
Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
if (ole.getLabel() != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
if (ole.getCommand() != null) {
metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
if (ole.getFileName() != null) {
metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
// The unwrapped payload becomes the embedded stream.
byte[] data = ole.getDataBuffer();
embedded = TikaInputStream.get(data);
} catch (Ole10NativeException ex) {
// Not a valid OLE10Native record, skip it
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
//TODO: figure out if the equivalent of OLE 1.0's
//getCommand() and getFileName() exist for OLE 2.0 to populate
// Grab the contents and process
DocumentEntry contentsEntry;
try {
contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
} catch (FileNotFoundException ioe) {
// Retry with the mixed-case entry name.
contentsEntry = (DocumentEntry) dir.getEntry("Contents");
// NOTE(review): inp is opened but no read into contents is visible here, so
// the array as shown would stay zero-filled -- confirm the full source reads
// the stream into the buffer before wrapping it.
DocumentInputStream inp = new DocumentInputStream(contentsEntry);
byte[] contents = new byte[contentsEntry.getSize()];
embedded = TikaInputStream.get(contents);
// Try to work out what it is
MediaType mediaType = getDetector().detect(embedded, new Metadata());
// Default to the POIFS type's extension; replaced below when the mime
// registry knows the detected type.
String extension = type.getExtension();
try {
MimeType mimeType = getMimeTypes().forName(mediaType.toString());
extension = mimeType.getExtension();
} catch (MimeTypeException mte) {
// No details on this type are known
// Record what we can do about it
metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
} catch (Exception e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
} else {
// Any other detected type: use the type's own content type and extension.
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
// Should we parse it?
if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
if (embedded == null) {
// Make a TikaInputStream that just
// passes the root directory of the
// embedded document, and is otherwise
// empty (byte[0]):
// NOTE(review): no call attaching dir to this stream is visible in this
// listing -- confirm against the full source.
embedded = TikaInputStream.get(new byte[0]);
embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
} catch (IOException e) {
// Unlike the inner catches, this records on the embedded document's own
// metadata rather than parentMetadata.
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
} finally {
// NOTE(review): the body of this null check is not visible; presumably it
// closes the embedded stream -- confirm.
if (embedded != null) {
use of org.apache.poi.poifs.filesystem.DirectoryEntry in project tika by apache.
the class WordExtractor method parse.
/**
 * Extracts the content of a Word document rooted at {@code root} and writes
 * it to {@code xhtml}.
 *
 * <p>Visible order of work: headers, main-range paragraphs, textbox /
 * footnote / comment / endnote text, footers, any pictures not yet claimed,
 * and finally embedded office documents found under the "ObjectPool" entry.
 *
 * <p>NOTE(review): this listing shows no closing braces, so the exact extent
 * of each {@code if}/{@code for}/{@code try} body cannot be determined from
 * here; confirm scoping against the full source.
 *
 * @param root  root directory node of the document
 * @param xhtml handler receiving the extracted content
 * @throws IOException   declared by the signature
 * @throws SAXException  declared by the signature
 * @throws TikaException declared by the signature; a POI
 *                       {@code EncryptedDocumentException} is rethrown as the
 *                       unqualified {@code EncryptedDocumentException}
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
HWPFDocument document;
try {
document = new HWPFDocument(root);
} catch (org.apache.poi.EncryptedDocumentException e) {
throw new EncryptedDocumentException(e);
} catch (OldWordFileFormatException e) {
// Old-format files are delegated to parseWord6.
// NOTE(review): document is never assigned on this path, so the method
// presumably returns right after this call -- confirm.
parseWord6(root, xhtml);
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor = new org.apache.poi.hwpf.extractor.WordExtractor(document);
HeaderStories headerFooter = new HeaderStories(document);
// Grab the list of pictures. As far as we can tell,
// the pictures should be in order, and may be directly
// placed or referenced from an anchor
PicturesTable pictureTable = document.getPicturesTable();
PicturesSource pictures = new PicturesSource(document);
// Do any headers, if present
Range[] headers = new Range[] { headerFooter.getFirstHeaderSubrange(), headerFooter.getEvenHeaderSubrange(), headerFooter.getOddHeaderSubrange() };
handleHeaderFooter(headers, "header", document, pictures, pictureTable, xhtml);
// Do the main paragraph text
Range r = document.getRange();
ListManager listManager = new ListManager(document);
for (int i = 0; i < r.numParagraphs(); i++) {
Paragraph p = r.getParagraph(i);
// handleParagraph's return value is added to the loop index on top of the
// regular i++; presumably it is the number of extra paragraphs the call
// consumed -- confirm against handleParagraph.
i += handleParagraph(p, 0, r, document, FieldsDocumentPart.MAIN, pictures, pictureTable, listManager, xhtml);
if (officeParserConfig.getIncludeShapeBasedContent()) {
// Do everything else
for (String paragraph : wordExtractor.getMainTextboxText()) {
xhtml.element("p", paragraph);
for (String paragraph : wordExtractor.getFootnoteText()) {
xhtml.element("p", paragraph);
for (String paragraph : wordExtractor.getCommentsText()) {
xhtml.element("p", paragraph);
for (String paragraph : wordExtractor.getEndnoteText()) {
xhtml.element("p", paragraph);
// Do any footers, if present
Range[] footers = new Range[] { headerFooter.getFirstFooterSubrange(), headerFooter.getEvenFooterSubrange(), headerFooter.getOddFooterSubrange() };
handleHeaderFooter(footers, "footer", document, pictures, pictureTable, xhtml);
// Handle any pictures that we haven't output yet
// The for-loop has no update clause; p is advanced in the body via
// nextUnclaimed() until it returns null.
for (Picture p = pictures.nextUnclaimed(); p != null; ) {
handlePictureCharacterRun(null, p, pictures, xhtml);
p = pictures.nextUnclaimed();
// Handle any embedded office documents
try {
DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
for (Entry entry : op) {
// Only directory entries whose names start with "_" are handed on.
if (entry.getName().startsWith("_") && entry instanceof DirectoryEntry) {
handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
// NOTE(review): the catch body is not visible; presumably a missing
// "ObjectPool" entry is simply ignored -- confirm.
} catch (FileNotFoundException e) {
use of org.apache.poi.poifs.filesystem.DirectoryEntry in project tika by apache.
the class ExcelExtractor method parse.
/**
 * Extracts the content of an Excel document rooted at {@code root} and
 * writes it to {@code xhtml}.
 *
 * <p>When {@code root} has no {@code WORKBOOK_ENTRY}: if it has a
 * {@code BOOK_ENTRY} the file is handed to {@code OldExcelExtractor} /
 * {@code OldExcelParser}; otherwise text extraction is skipped (per the
 * comment in that branch). The visible code then processes the file with a
 * {@code TikaHSSFListener} and passes every directory entry whose name
 * starts with "MBD" to {@code handleEmbeddedOfficeDoc}.
 *
 * <p>NOTE(review): this listing shows no closing braces or return
 * statements, so whether the no-workbook branches exit before the listener
 * runs cannot be seen here -- confirm against the full source.
 *
 * @param root   root directory node of the document
 * @param xhtml  handler receiving the extracted content
 * @param locale locale passed to the {@code TikaHSSFListener}
 * @throws IOException   declared by the signature
 * @throws SAXException  declared by the signature
 * @throws TikaException declared by the signature; those thrown while
 *                       handling embedded documents are caught below
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException {
if (!root.hasEntry(WORKBOOK_ENTRY)) {
if (root.hasEntry(BOOK_ENTRY)) {
// Excel 5 / Excel 95 file
// Records are in a different structure so needs a
// different parser to process them
OldExcelExtractor extractor = new OldExcelExtractor(root);
OldExcelParser.parse(extractor, xhtml);
} else {
// Corrupt file / very old file, just skip text extraction
// If a password was supplied, use it, otherwise the default
// NOTE(review): no password-handling statement is visible after the comment
// above -- confirm against the full source.
// Have the file processed in event mode
TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
listener.processFile(root, isListenForAllRecords());
for (Entry entry : root) {
// Only directory entries whose names start with "MBD" are handed on.
if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
try {
handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
} catch (TikaException e) {
// ignore parse errors from embedded documents