Search in sources:

Example 1 with IOExceptionWithCause

use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.

the class AbstractPDF2XHTML method endDocument.

/**
 * Runs the end-of-document extraction steps and then closes the XHTML document.
 * <p>
 * In order: optional bookmark text, embedded documents, optional AcroForm
 * content, the document-level additional actions, and finally
 * {@code xhtml.endDocument()}.
 *
 * @param pdf the document whose processing is being finished
 * @throws IOException if a step fails; a {@link TikaException} or
 *         {@link SAXException} is rethrown wrapped in an
 *         {@link IOExceptionWithCause}
 */
@Override
protected void endDocument(PDDocument pdf) throws IOException {
    try {
        // Bookmark text is only extracted when the config asks for it.
        if (config.getExtractBookmarksText()) {
            extractBookmarkText();
        }

        // An IOException from embedded-document extraction is routed through
        // handleCatchableIOE instead of being thrown from here directly.
        try {
            extractEmbeddedDocuments(pdf);
        } catch (IOException embeddedFailure) {
            handleCatchableIOE(embeddedFailure);
        }

        // AcroForm data is emitted at the end of the document, same IOException policy.
        if (config.getExtractAcroFormContent()) {
            try {
                extractAcroForm(pdf);
            } catch (IOException acroFormFailure) {
                handleCatchableIOE(acroFormFailure);
            }
        }

        // Document-level actions, one call per trigger.
        PDDocumentCatalogAdditionalActions catalogActions = pdf.getDocumentCatalog().getActions();
        handleDestinationOrAction(catalogActions.getDP(), ActionTrigger.AFTER_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getDS(), ActionTrigger.AFTER_DOCUMENT_SAVE);
        handleDestinationOrAction(catalogActions.getWC(), ActionTrigger.BEFORE_DOCUMENT_CLOSE);
        handleDestinationOrAction(catalogActions.getWP(), ActionTrigger.BEFORE_DOCUMENT_PRINT);
        handleDestinationOrAction(catalogActions.getWS(), ActionTrigger.BEFORE_DOCUMENT_SAVE);

        xhtml.endDocument();
    } catch (TikaException | SAXException failure) {
        throw new IOExceptionWithCause("Unable to end a document", failure);
    }
}
Also used : IOExceptionWithCause(org.apache.commons.io.IOExceptionWithCause) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) PDDocumentCatalogAdditionalActions(org.apache.pdfbox.pdmodel.interactive.action.PDDocumentCatalogAdditionalActions) SAXException(org.xml.sax.SAXException)

Example 2 with IOExceptionWithCause

use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.

the class AbstractPDF2XHTML method doOCROnCurrentPage.

/**
 * Renders the page at {@code pageIndex} to an image, writes it to a temporary
 * file and feeds that file to Tesseract, whose output goes inline into
 * {@code xhtml}.
 * <p>
 * Does nothing when the configured OCR strategy is {@code NO_OCR}.
 *
 * @throws IOException   if rendering or image I/O fails and
 *                       {@code handleCatchableIOE} propagates it, or wrapping a
 *                       {@link SAXException} raised while writing OCR content
 * @throws TikaException if Tesseract is not available for the active config
 * @throws SAXException  declared by the signature; a SAXException raised
 *                       inside the rendering/parsing section is rethrown as
 *                       an IOException instead
 */
void doOCROnCurrentPage() throws IOException, TikaException, SAXException {
    if (config.getOcrStrategy().equals(NO_OCR)) {
        return;
    }

    // A config supplied through the parse context wins over the default one.
    TesseractOCRConfig ocrConfig = context.get(TesseractOCRConfig.class, DEFAULT_TESSERACT_CONFIG);
    TesseractOCRParser ocrParser = new TesseractOCRParser();
    if (!ocrParser.hasTesseract(ocrConfig)) {
        throw new TikaException("Tesseract is not available. "
                + "Please set the OCR_STRATEGY to NO_OCR or configure Tesseract correctly");
    }

    PDFRenderer pageRenderer = new PDFRenderer(pdDocument);
    TemporaryResources tempResources = new TemporaryResources();
    try {
        BufferedImage pageImage = pageRenderer.renderImage(pageIndex, 2.0f, config.getOcrImageType());
        Path imageFile = tempResources.createTempFile();

        // The image is written and its stream closed before the file is reopened for OCR.
        try (OutputStream imageOut = Files.newOutputStream(imageFile)) {
            //TODO: get output format from TesseractConfig
            ImageIOUtil.writeImage(pageImage, config.getOcrImageFormatName(), imageOut,
                    config.getOcrDPI(), config.getOcrImageQuality());
        }
        try (InputStream imageIn = TikaInputStream.get(imageFile)) {
            ocrParser.parseInline(imageIn, xhtml, ocrConfig);
        }
    } catch (IOException ioFailure) {
        handleCatchableIOE(ioFailure);
    } catch (SAXException saxFailure) {
        throw new IOExceptionWithCause("error writing OCR content from PDF", saxFailure);
    } finally {
        // Temporary resources are disposed on both success and failure.
        tempResources.dispose();
    }
}
Also used : TesseractOCRConfig(org.apache.tika.parser.ocr.TesseractOCRConfig) Path(java.nio.file.Path) IOExceptionWithCause(org.apache.commons.io.IOExceptionWithCause) TikaException(org.apache.tika.exception.TikaException) BufferedInputStream(java.io.BufferedInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) IOException(java.io.IOException) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) PDFRenderer(org.apache.pdfbox.rendering.PDFRenderer) BufferedImage(java.awt.image.BufferedImage) SAXException(org.xml.sax.SAXException)

Example 3 with IOExceptionWithCause

use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.

the class AbstractPDF2XHTML method endPage.

/**
 * Finishes a page: processes its annotations, optionally runs OCR, handles the
 * page-level open/close actions and closes the page's {@code div}.
 * <p>
 * For each annotation:
 * <ul>
 *   <li>a file attachment has its embedded files extracted;</li>
 *   <li>a widget is passed to {@code handleWidget};</li>
 *   <li>when annotation-text extraction is enabled, any URI action is written
 *       as a link and markup title/subject/contents are written as nested
 *       {@code div}s.</li>
 * </ul>
 * {@code pageIndex} is incremented whether or not the page ended cleanly.
 *
 * @param page the page being finished
 * @throws IOException wrapping any {@link SAXException} or {@link TikaException};
 *         an {@code IOException} raised directly inside this method is instead
 *         recorded in {@code exceptions} and not rethrown
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        for (PDAnnotation annotation : page.getAnnotations()) {
            if (annotation instanceof PDAnnotationFileAttachment) {
                PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                // getFile() is declared with a wider type than
                // PDComplexFileSpecification (the original code needed a cast),
                // so a blind downcast can fail with ClassCastException. That is
                // unchecked and would escape both catch blocks below. Only
                // complex specifications are extracted; null is passed through
                // exactly as before.
                Object attachedFile = fann.getFile();
                boolean extractable = attachedFile == null
                        || attachedFile instanceof PDComplexFileSpecification;
                if (extractable) {
                    PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) attachedFile;
                    try {
                        AttributesImpl attributes = new AttributesImpl();
                        attributes.addAttribute("", "source", "source", "CDATA", "annotation");
                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, attributes);
                    } catch (SAXException e) {
                        throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                    } catch (TikaException e) {
                        throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
                    } catch (IOException e) {
                        handleCatchableIOE(e);
                    }
                }
            } else if (annotation instanceof PDAnnotationWidget) {
                handleWidget((PDAnnotationWidget) annotation);
            }
            // TODO: remove once PDFBOX-1143 is fixed:
            if (config.getExtractAnnotationText()) {
                PDActionURI uri = getActionURI(annotation);
                if (uri != null) {
                    String link = uri.getURI();
                    // Skip null and whitespace-only links.
                    if (link != null && link.trim().length() > 0) {
                        xhtml.startElement("div", "class", "annotation");
                        xhtml.startElement("a", "href", link);
                        xhtml.characters(link);
                        xhtml.endElement("a");
                        xhtml.endElement("div");
                    }
                }
                if (annotation instanceof PDAnnotationMarkup) {
                    PDAnnotationMarkup annotationMarkup = (PDAnnotationMarkup) annotation;
                    String title = annotationMarkup.getTitlePopup();
                    String subject = annotationMarkup.getSubject();
                    String contents = annotationMarkup.getContents();
                    // TODO: maybe also annotationMarkup.getRichContents()?
                    // The wrapper div is only written when at least one part is present.
                    if (title != null || subject != null || contents != null) {
                        xhtml.startElement("div", "class", "annotation");
                        if (title != null) {
                            xhtml.startElement("div", "class", "annotationTitle");
                            xhtml.characters(title);
                            xhtml.endElement("div");
                        }
                        if (subject != null) {
                            xhtml.startElement("div", "class", "annotationSubject");
                            xhtml.characters(subject);
                            xhtml.endElement("div");
                        }
                        if (contents != null) {
                            xhtml.startElement("div", "class", "annotationContents");
                            xhtml.characters(contents);
                            xhtml.endElement("div");
                        }
                        xhtml.endElement("div");
                    }
                }
            }
        }
        if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
            doOCROnCurrentPage();
        }
        PDPageAdditionalActions pageActions = page.getActions();
        if (pageActions != null) {
            handleDestinationOrAction(pageActions.getC(), ActionTrigger.PAGE_CLOSE);
            handleDestinationOrAction(pageActions.getO(), ActionTrigger.PAGE_OPEN);
        }
        xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
        throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
        // Recorded rather than rethrown, so this page's failure does not propagate from here.
        exceptions.add(e);
    } finally {
        pageIndex++;
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) PDAnnotationFileAttachment(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationFileAttachment) PDAnnotation(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation) PDAnnotationMarkup(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationMarkup) IOException(java.io.IOException) PDComplexFileSpecification(org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification) SAXException(org.xml.sax.SAXException) PDPageAdditionalActions(org.apache.pdfbox.pdmodel.interactive.action.PDPageAdditionalActions) IOExceptionWithCause(org.apache.commons.io.IOExceptionWithCause) AttributesImpl(org.xml.sax.helpers.AttributesImpl) PDAnnotationWidget(org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationWidget) PDActionURI(org.apache.pdfbox.pdmodel.interactive.action.PDActionURI)

Example 4 with IOExceptionWithCause

use of org.apache.commons.io.IOExceptionWithCause in project tika by apache.

the class SQLite3DBParser method getConnection.

/**
 * Opens a read-only SQLite connection for the given input.
 * <p>
 * The connection string comes from {@code getConnectionString}; the JDBC driver
 * class named by {@code getJDBCClassName()} is loaded before connecting.
 *
 * @param stream   input passed to {@code getConnectionString}
 * @param metadata metadata passed to {@code getConnectionString}
 * @param context  parse context passed to {@code getConnectionString}
 * @return the opened connection
 * @throws IOException if the driver class cannot be found or the connection
 *         cannot be created; the underlying exception is attached as the cause
 */
@Override
protected Connection getConnection(InputStream stream, Metadata metadata, ParseContext context) throws IOException {
    String connectionString = getConnectionString(stream, metadata, context);
    try {
        Class.forName(getJDBCClassName());
    } catch (ClassNotFoundException e) {
        throw new IOExceptionWithCause(e);
    }
    try {
        SQLiteConfig config = new SQLiteConfig();
        //good habit, but effectively meaningless here
        config.setReadOnly(true);
        return config.createConnection(connectionString);
    } catch (SQLException e) {
        // Same message as before, but the SQLException is now chained as the
        // cause so its stack trace and SQL state are not lost.
        throw new IOException(e.getMessage(), e);
    }
}
Also used : IOExceptionWithCause(org.apache.commons.io.IOExceptionWithCause) SQLException(java.sql.SQLException) Connection(java.sql.Connection) SQLiteConfig(org.sqlite.SQLiteConfig) IOException(java.io.IOException)

Example 5 with IOExceptionWithCause

use of org.apache.commons.io.IOExceptionWithCause in project opennms by OpenNMS.

the class VmwareRequisitionUrlConnection method getInputStream.

/**
 * {@inheritDoc}
 * <p>
 * Creates a ByteArrayInputStream implementation of InputStream of the XML
 * marshaled version of the Requisition class. Calling close on this stream
 * is safe.
 *
 * @return an in-memory stream over the marshaled requisition
 * @throws IOException wrapping any failure raised while looking up the
 *         existing requisition, importing, or marshaling
 */
@Override
public InputStream getInputStream() throws IOException {
    try {
        final Requisition existingRequisition = getExistingRequisition(importRequest.getForeignSource());
        importRequest.setExistingRequisition(existingRequisition);
        final VmwareImporter importer = new VmwareImporter(importRequest);
        // NOTE(review): getBytes() without a charset uses the platform default;
        // confirm this matches the encoding declared by the marshaled XML.
        return new ByteArrayInputStream(jaxBMarshal(importer.getRequisition()).getBytes());
    } catch (Throwable e) {
        // With the throwable as the only argument, the former '{}' placeholder
        // was never filled and was logged literally. The (message, throwable)
        // form logs the exception and its stack trace.
        logger.warn("Problem getting input stream", e);
        throw new IOExceptionWithCause("Problem getting input stream: " + e, e);
    }
}
Also used : IOExceptionWithCause(org.apache.commons.io.IOExceptionWithCause) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) Requisition(org.opennms.netmgt.provision.persist.requisition.Requisition)

Aggregations

IOExceptionWithCause (org.apache.commons.io.IOExceptionWithCause)16 IOException (java.io.IOException)10 SQLException (java.sql.SQLException)5 TikaException (org.apache.tika.exception.TikaException)5 SAXException (org.xml.sax.SAXException)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 DataInputStream (java.io.DataInputStream)3 InputStream (java.io.InputStream)3 Connection (java.sql.Connection)2 RepositoryException (javax.jcr.RepositoryException)2 InternalValue (org.apache.jackrabbit.core.value.InternalValue)2 BufferedImage (java.awt.image.BufferedImage)1 BufferedInputStream (java.io.BufferedInputStream)1 DataInput (java.io.DataInput)1 OutputStream (java.io.OutputStream)1 Path (java.nio.file.Path)1 ResultSetMetaData (java.sql.ResultSetMetaData)1 Statement (java.sql.Statement)1 PDComplexFileSpecification (org.apache.pdfbox.pdmodel.common.filespecification.PDComplexFileSpecification)1 PDActionURI (org.apache.pdfbox.pdmodel.interactive.action.PDActionURI)1