Search in sources :

Example 51 with InvalidFormatException

use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.

the class XSSFBExcelExtractorDecorator method getMainDocumentParts.

/**
 * In Excel files, sheets have things embedded in them,
 * and sheet drawings which have the images.
 * <p>
 * Returns every sheet part, each followed by the internal targets of its
 * drawing and VML drawing relationships, and finally the parts reached
 * through the office-document relationship of the package.
 *
 * @return the parts to scan; drawing targets that are missing from the
 *         package are omitted rather than returned as {@code null}
 * @throws TikaException if a relationship or part name in the package is invalid
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> parts = new ArrayList<PackagePart>();
    for (PackagePart part : sheetParts) {
        // Add the sheet
        parts.add(part);
        // If it has drawings, return those too (regular drawings first, then VML drawings)
        try {
            for (String relation : new String[] { XSSFRelation.DRAWINGS.getRelation(), XSSFRelation.VML_DRAWINGS.getRelation() }) {
                for (PackageRelationship rel : part.getRelationshipsByType(relation)) {
                    if (rel.getTargetMode() != TargetMode.INTERNAL) {
                        continue;
                    }
                    PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
                    PackagePart target = rel.getPackage().getPart(relName);
                    // A relationship may point at a part that is absent from the package,
                    // in which case getPart returns null; do not hand null entries to callers.
                    if (target != null) {
                        parts.add(target);
                    }
                }
            }
        } catch (InvalidFormatException e) {
            throw new TikaException("Broken OOXML file", e);
        }
    }
    // Also return the office-document part(s) of the package; per the original
    // note these are the ones processed "by AbstractOOXMLExtractor".
    for (PackagePart part : extractor.getPackage().getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
        parts.add(part);
    }
    return parts;
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) PackagePartName(org.apache.poi.openxml4j.opc.PackagePartName) TikaException(org.apache.tika.exception.TikaException) ArrayList(java.util.ArrayList) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 52 with InvalidFormatException

use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.

the class XSSFExcelExtractorDecorator method addDrawingHyperLinks.

/**
 * Records the hyperlinks attached to the drawings of a sheet.
 * <p>
 * For every internal drawing relationship of {@code sheetPart}, the drawing
 * part is looked up and each of its hyperlink relationships is stored in
 * {@code drawingHyperlinks} as relationship id -> target URI string.
 *
 * @param sheetPart the sheet whose drawings are inspected
 */
protected void addDrawingHyperLinks(PackagePart sheetPart) {
    try {
        for (PackageRelationship drawingRel : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
            if (drawingRel.getTargetMode() != TargetMode.INTERNAL) {
                continue;
            }
            PackagePartName drawingName = PackagingURIHelper.createPartName(drawingRel.getTargetURI());
            PackagePart drawingPart = drawingRel.getPackage().getPart(drawingName);
            // TIKA-2134: parts can go missing, and Excel quietly ignores missing images,
            // so a null drawing part is simply skipped.
            if (drawingPart != null) {
                for (PackageRelationship linkRel : drawingPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                    drawingHyperlinks.put(linkRel.getId(), linkRel.getTargetURI().toString());
                }
            }
        }
    } catch (InvalidFormatException ignored) {
        // Deliberately swallowed: a failure while extracting hyperlinks
        // on drawings should not cause a parse failure.
    }
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) PackagePartName(org.apache.poi.openxml4j.opc.PackagePartName) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 53 with InvalidFormatException

use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.

the class OOXMLExtractorFactory method parse.

/**
 * Parses an OOXML document: opens (or reuses) its OPC package, detects the
 * concrete media type, picks a POI text extractor, wraps it in the matching
 * Tika decorator, and then emits metadata followed by XHTML content.
 *
 * @param stream      the document stream; if it is a TikaInputStream, an already
 *                    open OPCPackage or its backing file is used when available
 * @param baseHandler receives the XHTML output
 * @param metadata    receives the content type and extracted metadata
 * @param context     supplies the Locale and the OfficeParserConfig
 * @throws IOException   propagated from the underlying I/O
 * @throws SAXException  propagated from the content handler
 * @throws TikaException if the package or extractor cannot be created
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        // Locate an already open package, open one from the backing file,
        // or fall back to reading the (close-shielded) stream.
        OPCPackage opcPackage;
        TikaInputStream tikaStream = TikaInputStream.cast(stream);
        if (tikaStream != null && tikaStream.getOpenContainer() instanceof OPCPackage) {
            opcPackage = (OPCPackage) tikaStream.getOpenContainer();
        } else if (tikaStream != null && tikaStream.hasFile()) {
            opcPackage = OPCPackage.open(tikaStream.getFile().getPath(), PackageAccess.READ);
            tikaStream.setOpenContainer(opcPackage);
        } else {
            opcPackage = OPCPackage.open(new CloseShieldInputStream(stream));
        }

        // Work out the type; anything undetected or unsupported goes to the EmptyParser.
        MediaType detected = ZipContainerDetector.detectOfficeOpenXML(opcPackage);
        boolean supported = detected != null && !OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(detected);
        if (!supported) {
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, detected.toString());

        // The config has already been set by OOXMLParser's call to configure(),
        // so it is relied upon to be non-null here.
        OfficeParserConfig officeConfig = context.get(OfficeParserConfig.class);

        // Try the SAX-based docx extractor, then the SAX-based pptx extractor,
        // and finally let POI's factory choose.
        POIXMLTextExtractor textExtractor = null;
        if (officeConfig.getUseSAXDocxExtractor()) {
            textExtractor = trySXWPF(opcPackage);
        }
        if (textExtractor == null && officeConfig.getUseSAXPptxExtractor()) {
            textExtractor = trySXSLF(opcPackage);
        }
        if (textExtractor == null) {
            textExtractor = ExtractorFactory.createExtractor(opcPackage);
        }

        // Wrap the POI extractor in the matching decorator. Event-based extractors
        // are matched by their own type; the rest by the document they expose.
        POIXMLDocument document = textExtractor.getDocument();
        OOXMLExtractor decorated;
        if (textExtractor instanceof XSSFBEventBasedExcelExtractor) {
            decorated = new XSSFBExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XSSFEventBasedExcelExtractor) {
            decorated = new XSSFExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XWPFEventBasedWordExtractor) {
            decorated = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) textExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (textExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            decorated = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) textExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + textExtractor);
        } else if (document instanceof XMLSlideShow) {
            decorated = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) textExtractor);
        } else if (document instanceof XWPFDocument) {
            decorated = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) textExtractor);
        } else {
            decorated = new POIXMLTextExtractorDecorator(context, textExtractor);
        }

        // Metadata goes first so that it is accessible during parsing
        // if desired by the client (see TIKA-1109).
        decorated.getMetadataExtractor().extract(metadata);
        // Then the text, along with any in-document metadata.
        decorated.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        String detail = e.getMessage();
        boolean noSupportedDocuments = detail != null && detail.startsWith("No supported documents found");
        throw new TikaException(noSupportedDocuments ? "TIKA-418: RuntimeException while getting content for thmx and xps file types" : "Error creating OOXML extractor", e);
    } catch (OpenXML4JException | XmlException e) {
        // InvalidFormatException is an OpenXML4JException, so it is covered here too.
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Also used : Locale(java.util.Locale) TikaInputStream(org.apache.tika.io.TikaInputStream) XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) MediaType(org.apache.tika.mime.MediaType) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) XSLFEventBasedPowerPointExtractor(org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor) TikaException(org.apache.tika.exception.TikaException) XSSFBEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIXMLDocument(org.apache.poi.POIXMLDocument) POIXMLTextExtractor(org.apache.poi.POIXMLTextExtractor) XmlException(org.apache.xmlbeans.XmlException) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 54 with InvalidFormatException

use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.

the class SXSLFPowerPointExtractorDecorator method buildXHTML.

/**
 * Writes the presentation content to {@code xhtml}: comment authors are
 * loaded first, then each slide related to the main document is handled,
 * and finally the slide-master and handout-master parts.
 * <p>
 * Failures resolving the slide relationships or an individual slide are
 * recorded in the metadata as warnings rather than aborting.
 *
 * @param xhtml the handler receiving the generated XHTML
 * @throws SAXException propagated from the content handler
 * @throws IOException  propagated from the underlying I/O
 * @see XSLFPowerPointExtractor#getText()
 */
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
    loadCommentAuthors();

    PackageRelationshipCollection slideRels = null;
    try {
        slideRels = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }

    // An empty collection simply yields zero iterations.
    if (slideRels != null) {
        for (int idx = 0; idx < slideRels.size(); idx++) {
            try {
                PackagePart slide = mainDocument.getRelatedPart(slideRels.getRelationship(idx));
                handleSlidePart(slide, xhtml);
            } catch (InvalidFormatException | ZipException e) {
                // Note the problem and carry on with the next slide.
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
        }
    }

    // Slide masters are passed through a PlaceHolderSkipper.
    String masterRelation = XSLFRelation.SLIDE_MASTER.getRelation();
    PlaceHolderSkipper masterHandler = new PlaceHolderSkipper(
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>()));
    handleBasicRelatedParts(masterRelation, "slide-master", mainDocument, masterHandler);

    // Handout masters use the plain text handler.
    OOXMLWordAndPowerPointTextHandler handoutHandler =
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>());
    handleBasicRelatedParts(HANDOUT_MASTER, "slide-handout-master", mainDocument, handoutHandler);
}
Also used : PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) HashMap(java.util.HashMap) ZipException(java.util.zip.ZipException) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 55 with InvalidFormatException

use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.

the class SXSLFPowerPointExtractorDecorator method getMainDocumentParts.

/**
 * In PowerPoint files, slides have things embedded in them,
 * and slide drawings which have the images.
 * <p>
 * Collects, in order: the parts contributed by each resolvable slide
 * (via {@code addSlideParts}), the main document part, and the parts
 * related through the slide-master and handout-master relationships.
 * Relationships or parts that cannot be resolved are recorded in the
 * metadata as warnings and skipped.
 *
 * @return the parts to scan for embedded content
 */
@Override
protected List<PackagePart> getMainDocumentParts() {
    List<PackagePart> parts = new ArrayList<>();
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?
    PackageRelationshipCollection slidePRC = null;
    try {
        slidePRC = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    if (slidePRC != null) {
        for (int i = 0; i < slidePRC.size(); i++) {
            PackagePart slidePart = null;
            try {
                slidePart = mainDocument.getRelatedPart(slidePRC.getRelationship(i));
            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
            // If the slide could not be resolved it is still null here; skip it,
            // as the master loop below does, instead of passing null to addSlideParts.
            if (slidePart != null) {
                addSlideParts(slidePart, parts);
            }
        }
    }
    parts.add(mainDocument);
    for (String rel : new String[] { XSLFRelation.SLIDE_MASTER.getRelation(), HANDOUT_MASTER }) {
        PackageRelationshipCollection prc = null;
        try {
            prc = mainDocument.getRelationshipsByType(rel);
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
        if (prc != null) {
            for (int i = 0; i < prc.size(); i++) {
                PackagePart pp = null;
                try {
                    pp = mainDocument.getRelatedPart(prc.getRelationship(i));
                } catch (InvalidFormatException e) {
                    metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
                }
                if (pp != null) {
                    parts.add(pp);
                }
            }
        }
    }
    return parts;
}
Also used : PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) ArrayList(java.util.ArrayList) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Aggregations

InvalidFormatException (org.apache.poi.openxml4j.exceptions.InvalidFormatException)72 IOException (java.io.IOException)24 PackagePart (org.apache.poi.openxml4j.opc.PackagePart)22 PackageRelationship (org.apache.poi.openxml4j.opc.PackageRelationship)18 OPCPackage (org.apache.poi.openxml4j.opc.OPCPackage)17 PackagePartName (org.apache.poi.openxml4j.opc.PackagePartName)16 PackageRelationshipCollection (org.apache.poi.openxml4j.opc.PackageRelationshipCollection)15 InputStream (java.io.InputStream)12 InvalidOperationException (org.apache.poi.openxml4j.exceptions.InvalidOperationException)11 Test (org.junit.Test)10 URI (java.net.URI)9 POIXMLException (org.apache.poi.POIXMLException)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 ArrayList (java.util.ArrayList)7 TikaException (org.apache.tika.exception.TikaException)7 XmlException (org.apache.xmlbeans.XmlException)7 ByteArrayOutputStream (java.io.ByteArrayOutputStream)6 FileNotFoundException (java.io.FileNotFoundException)6 HashMap (java.util.HashMap)6 Workbook (org.apache.poi.ss.usermodel.Workbook)6