use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.
the class AbstractPDF2XHTML method endDocument.
/**
 * Finishes processing of a PDF document.
 * <p>
 * In order: optionally extracts bookmark text, extracts embedded documents,
 * optionally extracts AcroForm content, reports the document catalog's
 * additional actions (DP, DS, WC, WP, WS), and finally ends the XHTML document.
 * An {@link IOException} raised while extracting embedded documents or the
 * AcroForm is handed to {@code handleCatchableIOE}.
 *
 * @param pdf the document that is being finished
 * @throws IOException if the document cannot be ended; a {@code TikaException}
 *                     or {@code SAXException} is rethrown wrapped in an
 *                     {@code IOExceptionWithCause}
 */
@Override
protected void endDocument(PDDocument pdf) throws IOException {
    try {
        // Bookmark text is only emitted when the configuration asks for it.
        if (config.getExtractBookmarksText()) {
            extractBookmarkText();
        }

        try {
            extractEmbeddedDocuments(pdf);
        } catch (IOException embeddedFailure) {
            handleCatchableIOE(embeddedFailure);
        }

        // AcroForm data is extracted at the end of the document.
        if (config.getExtractAcroFormContent()) {
            try {
                extractAcroForm(pdf);
            } catch (IOException acroFormFailure) {
                handleCatchableIOE(acroFormFailure);
            }
        }

        PDDocumentCatalogAdditionalActions catalogActions = pdf.getDocumentCatalog().getActions();
        handleDestinationOrAction(catalogActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
        handleDestinationOrAction(catalogActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
        handleDestinationOrAction(catalogActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);

        xhtml.endDocument();
    } catch (TikaException | SAXException e) {
        throw new IOExceptionWithCause("Unable to end a document", e);
    }
}
use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.
the class AbstractPDF2XHTML method doOCROnCurrentPage.
/**
 * Runs OCR on the page at the current {@code pageIndex}.
 * <p>
 * Returns immediately when the configured OCR strategy equals {@code NO_OCR}.
 * Otherwise the page is rendered to an image at scale 2.0, written to a
 * temporary file, and that file is parsed inline by a
 * {@code TesseractOCRParser} into the XHTML output. The temporary resources
 * are disposed in every case.
 *
 * @throws IOException   if a {@code SAXException} occurs while rendering or
 *                       parsing (rethrown as an {@code IOExceptionWithCause});
 *                       an {@code IOException} raised in that section is
 *                       passed to {@code handleCatchableIOE} instead
 * @throws TikaException if Tesseract is not available
 * @throws SAXException  declared by the signature
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }

    // Use the Tesseract configuration from the parse context, or the default one.
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    if (!ocrParser.hasTesseract(ocrConfig)) {
        throw new TikaException("Tesseract is not available. " + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
        final float renderScale = 2.0f;
        BufferedImage pageImage = pageRenderer.renderImage(pageIndex, renderScale, config.getOcrImageType());

        Path imageFile = tempResources.createTempFile();
        try (OutputStream imageOut = Files.newOutputStream(imageFile)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(pageImage, config.getOcrImageFormatName(), imageOut,
                    config.getOcrDPI(), config.getOcrImageQuality());
        }

        try (InputStream imageIn = TikaInputStream.get(imageFile)) {
            ocrParser.parseInline(imageIn, xhtml, ocrConfig);
        }
    } catch (IOException e) {
        handleCatchableIOE(e);
    } catch (SAXException e) {
        throw new IOExceptionWithCause("error writing OCR content from PDF", e);
    } finally {
        tempResources.dispose();
    }
}
use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.
the class AbstractPDF2XHTML method endPage.
/**
 * Finishes processing of a single page.
 * <p>
 * For every annotation on the page: file-attachment annotations have their
 * embedded files extracted, widget annotations are passed to
 * {@code handleWidget}, and, when annotation text extraction is enabled, the
 * annotation's URI link and markup title/subject/contents are written to the
 * XHTML output. Afterwards OCR is run if the strategy is
 * {@code OCR_AND_TEXT_EXTRACTION}, the page's close/open actions are reported,
 * and a {@code div} element is ended.
 * <p>
 * An {@link IOException} that reaches the outer handler is added to
 * {@code exceptions} rather than propagated. {@code pageIndex} is incremented
 * in every case.
 *
 * @param page the page that is being finished
 * @throws IOException if a {@code SAXException} or {@code TikaException}
 *                     reaches the outer handler (rethrown as an
 *                     {@code IOExceptionWithCause})
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        for (PDAnnotation pageAnnotation : page.getAnnotations()) {
            if (pageAnnotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment attachment = (PDAnnotationFileAttachment) pageAnnotation;
                PDComplexFileSpecification attachedSpec = (PDComplexFileSpecification) attachment.getFile();
                try {
                    // Mark the embedded file as originating from an annotation.
                    AttributesImpl sourceAttrs = new AttributesImpl();
                    sourceAttrs.addAttribute("", "source", "source", "CDATA", "annotation");
                    extractMultiOSPDEmbeddedFiles(attachment.getAttachmentName(), attachedSpec, sourceAttrs);
                } catch (SAXException e) {
                    throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                } catch (TikaException e) {
                    throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                } catch (IOException e) {
                    handleCatchableIOE(e);
                }
            } else if (pageAnnotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) pageAnnotation);
            }

            // TODO: remove once PDFBOX-1143 is fixed:
            if (!config.getExtractAnnotationText()) {
                continue;
            }

            // Emit the annotation's URI, if it has a non-blank one, as a link.
            PDActionURI uriAction = getActionURI(pageAnnotation);
            String href = (uriAction == null) ? null : uriAction.getURI();
            if (href != null && !href.trim().isEmpty()) {
                xhtml.startElement("div", "class", "annotation");
                xhtml.startElement("a", "href", href);
                xhtml.characters(href);
                xhtml.endElement("a");
                xhtml.endElement("div");
            }

            if (!(pageAnnotation instanceof PDAnnotationMarkup)) {
                continue;
            }
            PDAnnotationMarkup markup = (PDAnnotationMarkup) pageAnnotation;
            String popupTitle = markup.getTitlePopup();
            String markupSubject = markup.getSubject();
            String markupContents = markup.getContents();
            // TODO: maybe also annotationMarkup.getRichContents()?
            if (popupTitle == null && markupSubject == null && markupContents == null) {
                continue;
            }

            // Wrap whichever of title/subject/contents are present in one annotation div.
            xhtml.startElement("div", "class", "annotation");
            if (popupTitle != null) {
                xhtml.startElement("div", "class", "annotationTitle");
                xhtml.characters(popupTitle);
                xhtml.endElement("div");
            }
            if (markupSubject != null) {
                xhtml.startElement("div", "class", "annotationSubject");
                xhtml.characters(markupSubject);
                xhtml.endElement("div");
            }
            if (markupContents != null) {
                xhtml.startElement("div", "class", "annotationContents");
                xhtml.characters(markupContents);
                xhtml.endElement("div");
            }
            xhtml.endElement("div");
        }

        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            doOCROnCurrentPage();
        }

        PDPageAdditionalActions additionalPageActions = page.getActions();
        if (additionalPageActions != null) {
            handleDestinationOrAction(additionalPageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(additionalPageActions.getO(), ActionTrigger.PAGE_OPEN);
        }

        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
        // Recorded instead of rethrown.
        exceptions.add(e);
    } finally {
        pageIndex++;
    }
}
use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.
the class SQLite3DBParser method getConnection.
/**
 * Opens a JDBC connection to the SQLite database described by the given stream.
 * <p>
 * The connection string is obtained from {@code getConnectionString}, the JDBC
 * driver class named by {@code getJDBCClassName()} is loaded, and the
 * connection is created from a {@code SQLiteConfig} with the read-only flag set.
 *
 * @param stream   the input stream passed on to {@code getConnectionString}
 * @param metadata the metadata passed on to {@code getConnectionString}
 * @param context  the parse context passed on to {@code getConnectionString}
 * @return the connection created for the connection string
 * @throws IOException if the JDBC driver class cannot be found or the
 *                     connection cannot be created; the underlying exception
 *                     is available as the cause
 */
@Override
protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException {
    String connectionString = getConnectionString(stream, metadata, context);
    try {
        Class.forName(getJDBCClassName());
    } catch (ClassNotFoundException e) {
        throw new IOExceptionWithCause(e);
    }
    try {
        SQLiteConfig config = new SQLiteConfig();
        //good habit, but effectively meaningless here
        config.setReadOnly(true);
        return config.createConnection(connectionString);
    } catch (SQLException e) {
        // Same message as before, but chain the SQLException so its stack
        // trace and SQL state are not lost.
        throw new IOException(e.getMessage(), e);
    }
}
use of org.apache.commons.io.IOExceptionWithCause in project opennms by OpenNMS.
the class VmwareRequisitionUrlConnection method getInputStream.
/**
 * {@inheritDoc}
 * <p>
 * Creates a ByteArrayInputStream implementation of InputStream of the XML
 * marshaled version of the Requisition class. Calling close on this stream
 * is safe.
 *
 * @return a stream over the bytes of the marshaled requisition
 * @throws IOException if anything goes wrong while building the requisition;
 *                     the original {@code Throwable} is attached as the cause
 */
@Override
public InputStream getInputStream() throws IOException {
    try {
        final Requisition existingRequisition = getExistingRequisition(importRequest.getForeignSource());
        importRequest.setExistingRequisition(existingRequisition);
        final VmwareImporter importer = new VmwareImporter(importRequest);
        // NOTE(review): getBytes() uses the platform default charset; confirm it
        // matches the encoding declared by jaxBMarshal's XML output.
        return new ByteArrayInputStream(jaxBMarshal(importer.getRequisition()).getBytes());
    } catch (Throwable e) {
        // Pass a value for the '{}' placeholder and the throwable as the final
        // argument; with the throwable alone the placeholder was never filled.
        logger.warn("Problem getting input stream: '{}'", e.toString(), e);
        throw new IOExceptionWithCause("Problem getting input stream: " + e, e);
    }
}
Aggregations