use of org.apache.tika.exception.TikaException in project tika by apache.
the class XSSFExcelExtractorDecorator method getMainDocumentParts.
/**
 * Collects the package parts that make up the main content of the workbook.
 * <p>
 * In Excel files, sheets have things embedded in them, and sheet drawings
 * hold the images. For every sheet part this returns the sheet itself
 * followed by the targets of its internal drawing relationships and then
 * the targets of its internal VML drawing relationships. The parts related
 * to the package as the office document are appended last.
 *
 * @return the collected parts, in the order described above; the result of
 *         each part lookup is added as-is, without a null check
 * @throws TikaException if a relationship target cannot be turned into a
 *                       valid part name
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> collected = new ArrayList<PackagePart>();
    // Regular drawings are visited before VML drawings for each sheet.
    XSSFRelation[] drawingKinds = {XSSFRelation.DRAWINGS, XSSFRelation.VML_DRAWINGS};

    for (PackagePart sheet : sheetParts) {
        // The sheet itself always comes first
        collected.add(sheet);

        // Followed by whatever drawings hang off it
        try {
            for (XSSFRelation drawingKind : drawingKinds) {
                for (PackageRelationship relationship : sheet.getRelationshipsByType(drawingKind.getRelation())) {
                    if (relationship.getTargetMode() != TargetMode.INTERNAL) {
                        // Only targets inside the package are resolved
                        continue;
                    }
                    PackagePartName targetName = PackagingURIHelper.createPartName(relationship.getTargetURI());
                    collected.add(relationship.getPackage().getPart(targetName));
                }
            }
        } catch (InvalidFormatException e) {
            throw new TikaException("Broken OOXML file", e);
        }
    }

    // Finally the office document parts; the original note attributes
    // their handling to AbstractOOXMLExtractor.
    for (PackagePart officeDocument : extractor.getPackage().getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
        collected.add(officeDocument);
    }
    return collected;
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class OCR2XHTML method process.
/**
 * Converts the given PDF document (and related metadata) to a stream
 * of XHTML SAX events sent to the given content handler.
 * <p>
 * The conversion is driven through {@code writeText} with a Writer that
 * discards everything written to it.
 *
 * @param document PDF document
 * @param handler  SAX content handler
 * @param context  parse context passed on to the OCR2XHTML instance
 * @param metadata PDF metadata
 * @param config   parser configuration passed on to the OCR2XHTML instance
 * @throws SAXException  if an IOException raised during conversion was
 *                       caused by a SAXException; that cause is rethrown
 * @throws TikaException if conversion failed with any other IOException, or
 *                       if the converter recorded exceptions while running
 *                       (the first recorded one becomes the cause)
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    OCR2XHTML converter;
    try {
        converter = new OCR2XHTML(document, handler, context, metadata, config);
        // Sink that ignores all output handed to it.
        Writer discardingWriter = new Writer() {
            @Override
            public void write(char[] cbuf, int off, int len) {
            }

            @Override
            public void flush() {
            }

            @Override
            public void close() {
            }
        };
        converter.writeText(document, discardingWriter);
    } catch (IOException e) {
        // Surface a wrapped SAXException directly
        Throwable cause = e.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", e);
    }

    if (converter.exceptions.size() == 0) {
        return;
    }
    // Report only the first recorded exception
    throw new TikaException("Unable to extract all PDF content", converter.exceptions.get(0));
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PDF2XHTML method process.
/**
 * Converts the given PDF document (and related metadata) to a stream
 * of XHTML SAX events sent to the given content handler.
 * <p>
 * Text extraction is driven through {@code writeText} with a dummy Writer,
 * because the key methods are overridden to output to the given content
 * handler.
 *
 * @param document PDF document
 * @param handler  SAX content handler
 * @param context  parse context passed on to the PDF2XHTML instance
 * @param metadata PDF metadata
 * @param config   parser configuration; passed to the PDF2XHTML instance and
 *                 then applied to it via {@code config.configure}
 * @throws SAXException  if an IOException raised during conversion was
 *                       caused by a SAXException; that cause is rethrown
 * @throws TikaException if conversion failed with any other IOException, or
 *                       if the converter recorded exceptions while running
 *                       (the first recorded one becomes the cause)
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    PDF2XHTML converter;
    try {
        converter = new PDF2XHTML(document, handler, context, metadata, config);
        config.configure(converter);
        // Dummy sink: every write, flush and close is a no-op.
        Writer discardingWriter = new Writer() {
            @Override
            public void write(char[] cbuf, int off, int len) {
            }

            @Override
            public void flush() {
            }

            @Override
            public void close() {
            }
        };
        converter.writeText(document, discardingWriter);
    } catch (IOException e) {
        // Surface a wrapped SAXException directly
        Throwable cause = e.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", e);
    }

    if (converter.exceptions.size() == 0) {
        return;
    }
    // Report only the first recorded exception
    throw new TikaException("Unable to extract PDF content", converter.exceptions.get(0));
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class PDFParser method loadDOM.
/**
 * Parses the XMP metadata exported by the given PDMetadata into a DOM
 * document. Can return null!
 *
 * @param pdMetadata source of the XMP stream; may be null
 * @param metadata   metadata object on which failures are recorded
 * @param context    parse context supplying the DocumentBuilder
 * @return the parsed document, or null when {@code pdMetadata} is null, the
 *         XMP stream cannot be exported, or building/parsing fails with an
 *         IOException, SAXException or TikaException
 */
private Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
    if (pdMetadata == null) {
        return null;
    }

    // Step 1: obtain the stream; a failure here is recorded as an
    // embedded-stream exception and there is nothing to close yet.
    InputStream xmpStream;
    try {
        xmpStream = pdMetadata.exportXMPMetadata();
    } catch (IOException e) {
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return null;
    }

    // Step 2: parse it, always releasing the stream afterwards.
    try {
        DocumentBuilder builder = context.getDocumentBuilder();
        builder.setErrorHandler((ErrorHandler) null);
        return builder.parse(xmpStream);
    } catch (IOException | SAXException | TikaException e) {
        EmbeddedDocumentUtil.recordException(e, metadata);
        return null;
    } finally {
        IOUtils.closeQuietly(xmpStream);
    }
}
use of org.apache.tika.exception.TikaException in project tika by apache.
the class RarParser method parse.
/**
 * Walks the entries of a RAR archive and hands each non-directory entry to
 * the embedded document extractor.
 * <p>
 * The archive is opened from the File obtained through a TikaInputStream
 * that is registered with a TemporaryResources instance. Iteration stops
 * when no header is left or when the current thread has been interrupted.
 *
 * @param stream   the RAR content
 * @param handler  receives the XHTML events and is passed on for embedded entries
 * @param metadata document metadata handed to the XHTML content handler
 * @param context  parse context used to look up the embedded document extractor
 * @throws IOException   if reading the archive, an entry, or closing fails
 * @throws SAXException  if the content handler fails to process an event
 * @throws TikaException if the archive is encrypted (EncryptedDocumentException)
 *                       or the RAR library reports a RarException
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        Archive rar = new Archive(tis.getFile());
        try {
            if (rar.isEncrypted()) {
                throw new EncryptedDocumentException();
            }
            //Without this BodyContentHandler does not work
            xhtml.element("div", " ");
            FileHeader header = rar.nextFileHeader();
            while (header != null && !Thread.currentThread().isInterrupted()) {
                if (!header.isDirectory()) {
                    try (InputStream subFile = rar.getInputStream(header)) {
                        // Use the wide-character name unless it is empty
                        String entryName = header.getFileNameW();
                        if ("".equals(entryName)) {
                            entryName = header.getFileNameString();
                        }
                        Metadata entrydata = PackageParser.handleEntryMetadata(entryName, header.getCTime(), header.getMTime(), header.getFullUnpackSize(), xhtml);
                        if (extractor.shouldParseEmbedded(entrydata)) {
                            extractor.parseEmbedded(subFile, handler, entrydata, true);
                        }
                    }
                }
                header = rar.nextFileHeader();
            }
        } finally {
            // Close the archive while tmp is still open. A finally clause on
            // the try-with-resources statement itself only runs after tmp has
            // been closed, i.e. after the resources the archive's file was
            // registered with (presumably including that file) are released.
            rar.close();
        }
    } catch (RarException e) {
        throw new TikaException("RarParser Exception", e);
    }
    xhtml.endDocument();
}
Aggregations