Use of org.apache.tika.exception.UnsupportedFormatException in project tika by apache.
The class QPWTextExtractor, method extract.
/**
 * Extracts the content of a QuattroPro document held in an OLE2 container.
 *
 * <p>The stream is opened as a POIFS file system and the {@code OLE_DOCUMENT_NAME}
 * entry is read record by record. Each record starts with two shorts: its type and
 * its body length. The record is then handed to the extractor registered for that
 * type in {@code EXTRACTORS}, or to {@code Extractor.IGNORE} when none is registered.
 * All output is emitted inside a single {@code <p>} element.
 *
 * <p>The POIFS file system and the document stream are both closed before this
 * method returns, whether it completes normally or throws.
 *
 * @param input    stream containing the OLE2 document
 * @param xhtml    handler that receives the extracted content
 * @param metadata metadata object passed to the record extractors through the context
 * @throws IOException   if the container or the document entry cannot be read
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException if extraction fails; an {@code UnsupportedFormatException} is
 *                       thrown when the container has no root node or lacks the
 *                       {@code OLE_DOCUMENT_NAME} entry
 */
public void extract(InputStream input, XHTMLContentHandler xhtml, Metadata metadata) throws IOException, SAXException, TikaException {
    // try-with-resources guarantees the file system is released on every exit path,
    // including the unsupported-format rejection below.
    try (POIFSFileSystem pfs = new POIFSFileSystem(input)) {
        DirectoryNode rootNode = pfs.getRoot();
        if (rootNode == null || !rootNode.hasEntry(OLE_DOCUMENT_NAME)) {
            throw new UnsupportedFormatException("Unsupported QuattroPro file format. " + "Looking for OLE entry \"" + OLE_DOCUMENT_NAME + "\". Found: " + (rootNode == null ? "null" : rootNode.getEntryNames()));
        }
        // TODO: should we validate and raise a warning/error if the file does not
        // start with a BOF and end with an EOF?
        xhtml.startElement("p");
        try (WPInputStream in = new WPInputStream(pfs.createDocumentInputStream(OLE_DOCUMENT_NAME))) {
            Context ctx = new Context(in, xhtml, metadata);
            while (hasNext(in)) {
                // Record header: type first, then body length.
                ctx.type = in.readWPShort();
                ctx.bodyLength = in.readWPShort();
                Extractor extractor = EXTRACTORS.get(ctx.type);
                if (extractor != null) {
                    extractor.extract(ctx);
                } else {
                    // No extractor registered for this record type. Swap in
                    // Extractor.DEBUG here to find out what is being ignored.
                    Extractor.IGNORE.extract(ctx);
                }
            }
        }
        // Reached only on success: a failure above propagates without closing <p>.
        xhtml.endElement("p");
    }
}
Aggregations