Search in sources :

Example 41 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class XSSFExcelExtractorDecorator method getMainDocumentParts.

/**
 * Collects the package parts that make up the main document.
 * <p>
 * In Excel files, sheets have things embedded in them, and sheet drawings
 * hold the images. Each sheet part is therefore followed in the result by
 * the parts targeted by its internal drawing and VML drawing relationships.
 * Finally, the parts the package reports for RELATION_OFFICE_DOCUMENT are
 * appended.
 *
 * @return the collected parts, in the order described above
 * @throws TikaException if resolving a drawing relationship fails with an
 *                       InvalidFormatException
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> collected = new ArrayList<PackagePart>();
    for (PackagePart sheet : sheetParts) {
        // The sheet itself always comes first
        collected.add(sheet);
        try {
            // Regular drawings first, then VML drawings
            for (XSSFRelation kind : new XSSFRelation[] { XSSFRelation.DRAWINGS, XSSFRelation.VML_DRAWINGS }) {
                for (PackageRelationship link : sheet.getRelationshipsByType(kind.getRelation())) {
                    if (link.getTargetMode() != TargetMode.INTERNAL) {
                        // Only targets that live inside the package are resolved
                        continue;
                    }
                    PackagePartName targetName = PackagingURIHelper.createPartName(link.getTargetURI());
                    collected.add(link.getPackage().getPart(targetName));
                }
            }
        } catch (InvalidFormatException e) {
            throw new TikaException("Broken OOXML file", e);
        }
    }
    // Office-document parts go last; the original note says they are
    // there for use by AbstractOOXMLExtractor
    for (PackagePart officeDocument : extractor.getPackage().getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
        collected.add(officeDocument);
    }
    return collected;
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) PackagePartName(org.apache.poi.openxml4j.opc.PackagePartName) TikaException(org.apache.tika.exception.TikaException) ArrayList(java.util.ArrayList) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 42 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class OCR2XHTML method process.

/**
 * Converts the given PDF document (and related metadata) to a stream
 * of XHTML SAX events sent to the given content handler.
 * <p>
 * If the converter's {@code exceptions} list is non-empty once the run has
 * finished, its first entry is rethrown wrapped in a TikaException.
 *
 * @param document PDF document
 * @param handler  SAX content handler
 * @param context  parse context, passed on to the OCR2XHTML constructor
 * @param metadata PDF metadata
 * @param config   PDF parser configuration, passed on to the OCR2XHTML constructor
 * @throws SAXException  if the content handler fails to process SAX events
 * @throws TikaException if there was an exception outside of per page processing
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    final OCR2XHTML converter;
    try {
        converter = new OCR2XHTML(document, handler, context, metadata, config);
        // writeText needs a Writer; this one discards everything it is given
        // (presumably the real output goes through the content handler)
        Writer discardingWriter = new Writer() {

            @Override
            public void write(char[] cbuf, int off, int len) {
            }

            @Override
            public void flush() {
            }

            @Override
            public void close() {
            }
        };
        converter.writeText(document, discardingWriter);
    } catch (IOException e) {
        // A SAXException wrapped in the IOException is surfaced as itself
        Throwable cause = e.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", e);
    }
    if (!converter.exceptions.isEmpty()) {
        // Only the first recorded exception is reported
        throw new TikaException("Unable to extract all PDF content", converter.exceptions.get(0));
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) Writer(java.io.Writer) SAXException(org.xml.sax.SAXException)

Example 43 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class PDF2XHTML method process.

/**
 * Converts the given PDF document (and related metadata) to a stream
 * of XHTML SAX events sent to the given content handler.
 * <p>
 * If the converter's {@code exceptions} list is non-empty once the run has
 * finished, its first entry is rethrown wrapped in a TikaException.
 *
 * @param document PDF document
 * @param handler  SAX content handler
 * @param context  parse context, passed on to the PDF2XHTML constructor
 * @param metadata PDF metadata
 * @param config   PDF parser configuration; passed on to the PDF2XHTML
 *                 constructor and then applied via {@code config.configure(...)}
 * @throws SAXException  if the content handler fails to process SAX events
 * @throws TikaException if there was an exception outside of per page processing
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, PDFParserConfig config) throws SAXException, TikaException {
    final PDF2XHTML converter;
    try {
        converter = new PDF2XHTML(document, handler, context, metadata, config);
        config.configure(converter);
        // The text is extracted through a dummy Writer: the key methods are
        // overridden to output to the given content handler instead.
        Writer dummyWriter = new Writer() {

            @Override
            public void write(char[] cbuf, int off, int len) {
            }

            @Override
            public void flush() {
            }

            @Override
            public void close() {
            }
        };
        converter.writeText(document, dummyWriter);
    } catch (IOException e) {
        // A SAXException wrapped in the IOException is surfaced as itself
        Throwable cause = e.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", e);
    }
    if (!converter.exceptions.isEmpty()) {
        // Only the first recorded exception is reported
        throw new TikaException("Unable to extract PDF content", converter.exceptions.get(0));
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) Writer(java.io.Writer) SAXException(org.xml.sax.SAXException)

Example 44 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class PDFParser method loadDOM.

/**
 * Parses the XMP stream exported by the given PDF metadata into a DOM.
 * <p>
 * Failures are recorded on {@code metadata} instead of being thrown:
 * a failed export goes through
 * {@code EmbeddedDocumentUtil.recordEmbeddedStreamException}, a failure while
 * obtaining the builder or parsing through
 * {@code EmbeddedDocumentUtil.recordException}.
 *
 * @param pdMetadata source of the XMP stream; may be null
 * @param metadata   metadata object on which failures are recorded
 * @param context    parse context that supplies the DocumentBuilder
 * @return the parsed document, or null if {@code pdMetadata} is null or
 *         any of the failures above occurred
 */
private Document loadDOM(PDMetadata pdMetadata, Metadata metadata, ParseContext context) {
    if (pdMetadata == null) {
        return null;
    }
    Document dom = null;
    InputStream xmpStream = null;
    try {
        try {
            xmpStream = pdMetadata.exportXMPMetadata();
        } catch (IOException e) {
            // Export failures are recorded separately from parse failures
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
            return null;
        }
        DocumentBuilder builder = context.getDocumentBuilder();
        // Explicitly clear the error handler on the builder before parsing
        builder.setErrorHandler((ErrorHandler) null);
        dom = builder.parse(xmpStream);
    } catch (IOException | SAXException | TikaException e) {
        EmbeddedDocumentUtil.recordException(e, metadata);
    } finally {
        // Runs on every path, including the early return above
        IOUtils.closeQuietly(xmpStream);
    }
    return dom;
}
Also used : TikaException(org.apache.tika.exception.TikaException) DocumentBuilder(javax.xml.parsers.DocumentBuilder) ByteArrayInputStream(java.io.ByteArrayInputStream) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Example 45 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class RarParser method parse.

/**
 * Walks the entries of a RAR archive and hands every non-directory entry
 * to the embedded document extractor obtained from the parse context.
 * <p>
 * The archive is opened from the file backing the TikaInputStream. It is
 * closed in an inner {@code finally}, i.e. before the enclosing
 * try-with-resources closes {@code tmp}, so that teardown happens in reverse
 * order of acquisition (presumably {@code tmp} owns the file the archive
 * reads from; verify against TemporaryResources).
 *
 * @param stream   the RAR data
 * @param handler  receiver of the XHTML SAX events
 * @param metadata metadata used to create the XHTML content handler
 * @param context  parse context used to look up the embedded document extractor
 * @throws IOException   if reading the stream, an entry or closing the archive fails
 * @throws SAXException  if the content handler fails to process SAX events
 * @throws TikaException if the archive is encrypted (EncryptedDocumentException)
 *                       or the RAR library reports a RarException
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
    try (TemporaryResources tmp = new TemporaryResources()) {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        Archive rar = new Archive(tis.getFile());
        try {
            if (rar.isEncrypted()) {
                throw new EncryptedDocumentException();
            }
            //Without this BodyContentHandler does not work
            xhtml.element("div", " ");
            FileHeader header = rar.nextFileHeader();
            // Stop early if the current thread has been interrupted
            while (header != null && !Thread.currentThread().isInterrupted()) {
                if (!header.isDirectory()) {
                    try (InputStream subFile = rar.getInputStream(header)) {
                        // Use getFileNameW() unless it is empty, then fall back to getFileNameString()
                        String wideName = header.getFileNameW();
                        String entryName = "".equals(wideName) ? header.getFileNameString() : wideName;
                        Metadata entrydata = PackageParser.handleEntryMetadata(entryName, header.getCTime(), header.getMTime(), header.getFullUnpackSize(), xhtml);
                        if (extractor.shouldParseEmbedded(entrydata)) {
                            extractor.parseEmbedded(subFile, handler, entrydata, true);
                        }
                    }
                }
                header = rar.nextFileHeader();
            }
        } finally {
            // Close the archive here: resources of the enclosing
            // try-with-resources are closed before its own catch/finally
            // clauses run, so an outer finally would run after tmp is closed.
            rar.close();
        }
    } catch (RarException e) {
        throw new TikaException("RarParser Exception", e);
    }
    xhtml.endDocument();
}
Also used : Archive(com.github.junrar.Archive) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) TemporaryResources(org.apache.tika.io.TemporaryResources) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) RarException(com.github.junrar.exception.RarException) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) FileHeader(com.github.junrar.rarfile.FileHeader)

Aggregations

TikaException (org.apache.tika.exception.TikaException)142 IOException (java.io.IOException)54 SAXException (org.xml.sax.SAXException)42 InputStream (java.io.InputStream)37 TikaInputStream (org.apache.tika.io.TikaInputStream)33 Metadata (org.apache.tika.metadata.Metadata)33 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)29 Test (org.junit.Test)19 ParseContext (org.apache.tika.parser.ParseContext)18 ContentHandler (org.xml.sax.ContentHandler)17 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)16 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)15 TemporaryResources (org.apache.tika.io.TemporaryResources)15 MediaType (org.apache.tika.mime.MediaType)13 Parser (org.apache.tika.parser.Parser)13 ByteArrayInputStream (java.io.ByteArrayInputStream)12 ArrayList (java.util.ArrayList)11 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)11 File (java.io.File)8 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)8