Use of org.apache.poi.hssf.extractor.OldExcelExtractor in the Apache Tika project.
Class OldExcelParser, method parse.
/**
 * Reads an old-format Excel document from the supplied stream and emits its
 * text through the given content handler.
 * <p>
 * No metadata is extracted here: these old formats did not have any stored
 * with them. The metadata object is only handed on to the XHTML handler.
 *
 * @param stream   the document to read
 * @param handler  receiver of the generated content events
 * @param metadata passed through to the XHTML content handler
 * @param context  parse context (not consulted by this method)
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // TODO Get the version and type, to set as the Content Type

    // Arguments are evaluated left to right: the POI extractor is opened on
    // the stream first, then the XHTML wrapper is created, and finally the
    // shared text-output routine is run with both.
    parse(
            new OldExcelExtractor(stream),
            new XHTMLContentHandler(handler, metadata));
}
Use of org.apache.poi.hssf.extractor.OldExcelExtractor in the Apache Tika project.
Class ExcelExtractor, method parse.
/**
 * Extracts the content of an Excel document held in the given directory node
 * and writes it to the XHTML handler.
 * <p>
 * When the node has no {@code WORKBOOK_ENTRY}, the document is either handed
 * to the old-format parser (if a {@code BOOK_ENTRY} is present) or skipped
 * without producing any text. Otherwise the file is processed in event mode,
 * after which directory entries whose names start with "MBD" are passed to
 * the embedded-document handler.
 *
 * @param root   directory node holding the document's entries
 * @param xhtml  handler that receives the extracted content
 * @param locale locale handed to the record listener
 */
protected void parse(DirectoryNode root, XHTMLContentHandler xhtml, Locale locale) throws IOException, SAXException, TikaException {
    if (!root.hasEntry(WORKBOOK_ENTRY)) {
        if (root.hasEntry(BOOK_ENTRY)) {
            // Excel 5 / Excel 95 file: its records use a different structure,
            // so a different parser has to process them
            OldExcelParser.parse(new OldExcelExtractor(root), xhtml);
        }
        // Without a BOOK_ENTRY this is a corrupt or very old file, and text
        // extraction is simply skipped. Either way there is nothing more to do.
        return;
    }

    // Use the supplied password if there is one, otherwise the default
    Biff8EncryptionKey.setCurrentUserPassword(getPassword());

    // Process the file in event mode, then surface any exception the
    // listener stored while doing so
    TikaHSSFListener hssfListener = new TikaHSSFListener(xhtml, locale, this);
    hssfListener.processFile(root, isListenForAllRecords());
    hssfListener.throwStoredException();

    for (Entry child : root) {
        // Only directory entries named "MBD..." are treated as embedded documents
        if (!child.getName().startsWith("MBD") || !(child instanceof DirectoryEntry)) {
            continue;
        }
        try {
            handleEmbeddedOfficeDoc((DirectoryEntry) child, xhtml);
        } catch (TikaException ignored) {
            // ignore parse errors from embedded documents
        }
    }
}
Aggregations