Search in sources :

Example 21 with PackageRelationshipCollection

use of org.apache.poi.openxml4j.opc.PackageRelationshipCollection in project tika by apache.

the class SXSLFPowerPointExtractorDecorator method getMainDocumentParts.

/**
 * Collects the package parts that make up the main presentation content.
 * <p>
 * In PowerPoint files, slides have things embedded in them, and slide
 * drawings which have the images. For every slide relationship of the main
 * document the slide part is resolved and handed to {@code addSlideParts};
 * then the main document part itself is added, followed by the parts related
 * through the slide-master and handout-master relationships.
 * <p>
 * Lookup failures ({@link InvalidFormatException}) never abort the
 * collection: the stack trace is recorded in the metadata under
 * {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING} and the affected
 * relationship is skipped.
 *
 * @return the parts gathered in the order described above; never {@code null}
 */
@Override
protected List<PackagePart> getMainDocumentParts() {
    List<PackagePart> parts = new ArrayList<>();
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?
    PackageRelationshipCollection slidePRC = null;
    try {
        slidePRC = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    if (slidePRC != null) {
        for (int i = 0; i < slidePRC.size(); i++) {
            PackagePart slidePart = null;
            try {
                slidePart = mainDocument.getRelatedPart(slidePRC.getRelationship(i));
            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
            // The lookup may have failed (already recorded above) or yielded nothing;
            // never hand a null part to addSlideParts. Same guard as the master loop below.
            if (slidePart != null) {
                addSlideParts(slidePart, parts);
            }
        }
    }
    parts.add(mainDocument);
    for (String rel : new String[] { XSLFRelation.SLIDE_MASTER.getRelation(), HANDOUT_MASTER }) {
        PackageRelationshipCollection prc = null;
        try {
            prc = mainDocument.getRelationshipsByType(rel);
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
        if (prc != null) {
            for (int i = 0; i < prc.size(); i++) {
                PackagePart pp = null;
                try {
                    pp = mainDocument.getRelatedPart(prc.getRelationship(i));
                } catch (InvalidFormatException e) {
                    metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
                }
                if (pp != null) {
                    parts.add(pp);
                }
            }
        }
    }
    return parts;
}
Also used : PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) ArrayList(java.util.ArrayList) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 22 with PackageRelationshipCollection

use of org.apache.poi.openxml4j.opc.PackageRelationshipCollection in project tika by apache.

the class XSLFPowerPointExtractorDecorator method getMainDocumentParts.

/**
 * Collects the package parts that make up the main presentation content.
 * <p>
 * In PowerPoint files, slides have things embedded in them, and slide
 * drawings which have the images. Each slide referenced by the slide-id list
 * is resolved and handed to {@code addSlideParts}; then the document's own
 * package part is added (to include macros), followed by the parts related
 * through the slide-master and handout-master relationships.
 *
 * @return the parts gathered in the order described above; never {@code null}
 * @throws TikaException if the slide show cannot be opened from the package,
 *                       or a slide part cannot be loaded
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> parts = new ArrayList<>();
    final XSLFSlideShow document;
    try {
        document = new XSLFSlideShow(extractor.getPackage());
    } catch (Exception e) {
        // Shouldn't happen; keep the original exception as the cause so the
        // real failure is not lost.
        throw new TikaException(e.getMessage(), e);
    }
    CTSlideIdList ctSlideIdList = document.getSlideReferences();
    if (ctSlideIdList != null) {
        for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
            CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
            // Add the slide
            PackagePart slidePart;
            try {
                slidePart = document.getSlidePart(ctSlide);
            } catch (IOException | XmlException e) {
                throw new TikaException("Broken OOXML file", e);
            }
            addSlideParts(slidePart, parts);
        }
    }
    //add full document to include macros
    parts.add(document.getPackagePart());
    for (String rel : new String[] { XSLFRelation.SLIDE_MASTER.getRelation(), HANDOUT_MASTER }) {
        try {
            PackageRelationshipCollection prc = document.getPackagePart().getRelationshipsByType(rel);
            for (int i = 0; i < prc.size(); i++) {
                PackagePart pp = document.getPackagePart().getRelatedPart(prc.getRelationship(i));
                if (pp != null) {
                    parts.add(pp);
                }
            }
        } catch (InvalidFormatException ignored) {
            // Deliberately best-effort: a malformed master relationship skips the
            // remaining parts of this relationship type but must not fail the
            // whole extraction.
            // TODO: log or record this as a warning once a sink is available here.
        }
    }
    return parts;
}
Also used : TikaException(org.apache.tika.exception.TikaException) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) ArrayList(java.util.ArrayList) IOException(java.io.IOException) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) CTSlideIdListEntry(org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) TikaException(org.apache.tika.exception.TikaException) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) IOException(java.io.IOException) XmlException(org.apache.xmlbeans.XmlException) SAXException(org.xml.sax.SAXException) CTSlideIdList(org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList) XmlException(org.apache.xmlbeans.XmlException)

Example 23 with PackageRelationshipCollection

use of org.apache.poi.openxml4j.opc.PackageRelationshipCollection in project tika by apache.

the class SXWPFWordExtractorDecorator method handleDocumentPart.

/**
 * Emits one document part together with its related header, footnote,
 * comment, footer and endnote parts.
 * <p>
 * Order of output: headers first, then the document part itself, then the
 * remaining related parts. {@link InvalidFormatException} and
 * {@link ZipException} raised while processing a section are recorded in the
 * metadata under {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING} and
 * processing moves on to the next section.
 *
 * @param documentPart the part to process; numbering and styles are loaded from it
 * @param xhtml        handler passed through to {@code handlePart}
 * @throws IOException  propagated from the helpers called here
 * @throws SAXException propagated from the helpers called here
 */
private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml) throws IOException, SAXException {
    // List manager (built on the part's numbering) and styles are shared by every part handled below.
    XWPFListManager listManager = new XWPFListManager(loadNumbering(documentPart));
    XWPFStylesShim styles = null;
    try {
        styles = loadStyles(documentPart);
    } catch (Exception styleFailure) {
        // Continue without styles; the failure is only recorded.
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(styleFailure));
    }

    // 1) headers
    try {
        PackageRelationshipCollection headerRels = documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
        for (int idx = 0; headerRels != null && idx < headerRels.size(); idx++) {
            PackagePart headerPart = documentPart.getRelatedPart(headerRels.getRelationship(idx));
            handlePart(headerPart, styles, listManager, xhtml);
        }
    } catch (InvalidFormatException | ZipException headerFailure) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(headerFailure));
    }

    // 2) the document part itself
    try {
        handlePart(documentPart, styles, listManager, xhtml);
    } catch (ZipException bodyFailure) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(bodyFailure));
    }

    // 3) for now, the remaining components are simply dumped at the end
    XWPFRelation[] trailingRelations = { XWPFRelation.FOOTNOTE, XWPFRelation.COMMENT, XWPFRelation.FOOTER, XWPFRelation.ENDNOTE };
    for (XWPFRelation trailing : trailingRelations) {
        try {
            PackageRelationshipCollection trailingRels = documentPart.getRelationshipsByType(trailing.getRelation());
            for (int idx = 0; trailingRels != null && idx < trailingRels.size(); idx++) {
                PackagePart relatedPart = documentPart.getRelatedPart(trailingRels.getRelationship(idx));
                handlePart(relatedPart, styles, listManager, xhtml);
            }
        } catch (InvalidFormatException | ZipException trailingFailure) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(trailingFailure));
        }
    }
}
Also used : XWPFRelation(org.apache.poi.xwpf.usermodel.XWPFRelation) XWPFNumbering(org.apache.poi.xwpf.usermodel.XWPFNumbering) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) XWPFStylesShim(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim) ZipException(java.util.zip.ZipException) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) TikaException(org.apache.tika.exception.TikaException) ZipException(java.util.zip.ZipException) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) IOException(java.io.IOException) XmlException(org.apache.xmlbeans.XmlException) SAXException(org.xml.sax.SAXException)

Example 24 with PackageRelationshipCollection

use of org.apache.poi.openxml4j.opc.PackageRelationshipCollection in project tika by apache.

the class XWPFWordExtractorDecorator method addRelatedParts.

/**
 * Appends to {@code relatedParts} every part that {@code documentPart}
 * references through one of the {@code MAIN_PART_RELATIONS} relationship types.
 * <p>
 * Collection is best-effort: an {@link InvalidFormatException} raised while
 * resolving one relationship type skips the rest of that type and continues
 * with the next one.
 *
 * @param documentPart part whose relationships are followed
 * @param relatedParts list that receives the resolved, non-null parts
 */
private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
    for (String relation : MAIN_PART_RELATIONS) {
        try {
            PackageRelationshipCollection prc = documentPart.getRelationshipsByType(relation);
            if (prc != null) {
                for (int i = 0; i < prc.size(); i++) {
                    PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
                    // Never put a null into the result list; consumers iterate it
                    // expecting real parts.
                    if (packagePart != null) {
                        relatedParts.add(packagePart);
                    }
                }
            }
        } catch (InvalidFormatException ignored) {
            // Deliberately ignored: a malformed relationship must not prevent the
            // other relationship types from being collected.
        }
    }
}
Also used : PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 25 with PackageRelationshipCollection

use of org.apache.poi.openxml4j.opc.PackageRelationshipCollection in project tika by apache.

the class XWPFEventBasedWordExtractor method loadHyperlinkRelationships.

/**
 * Builds a lookup from relationship id to target URL for the hyperlink
 * relationships of {@code bodyPart}.
 * <p>
 * Relationships that are null, or that lack an id or a target URI, are left
 * out. An {@link InvalidFormatException} is logged as a warning and whatever
 * was gathered up to that point is returned.
 *
 * @param bodyPart part whose hyperlink relationships are read
 * @return map of relationship id to target URI string; possibly empty, never {@code null}
 */
private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
    Map<String, String> idToUrl = new HashMap<>();
    try {
        PackageRelationshipCollection linkRels = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
        for (int idx = 0; idx < linkRels.size(); idx++) {
            PackageRelationship linkRel = linkRels.getRelationship(idx);
            if (linkRel != null) {
                String relId = linkRel.getId();
                String target = null;
                if (linkRel.getTargetURI() != null) {
                    target = linkRel.getTargetURI().toString();
                }
                // Only complete id/url pairs are usable for later lookups.
                if (relId == null || target == null) {
                    continue;
                }
                idToUrl.put(relId, target);
            }
        }
    } catch (InvalidFormatException e) {
        LOG.warn("Invalid format", e);
    }
    return idToUrl;
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) HashMap(java.util.HashMap) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Aggregations

PackageRelationshipCollection (org.apache.poi.openxml4j.opc.PackageRelationshipCollection)29 PackagePart (org.apache.poi.openxml4j.opc.PackagePart)23 InvalidFormatException (org.apache.poi.openxml4j.exceptions.InvalidFormatException)15 PackageRelationship (org.apache.poi.openxml4j.opc.PackageRelationship)14 IOException (java.io.IOException)8 XmlException (org.apache.xmlbeans.XmlException)5 ArrayList (java.util.ArrayList)4 HashMap (java.util.HashMap)4 OpenXML4JException (org.apache.poi.openxml4j.exceptions.OpenXML4JException)4 OPCPackage (org.apache.poi.openxml4j.opc.OPCPackage)4 PackagePartName (org.apache.poi.openxml4j.opc.PackagePartName)4 XWPFRelation (org.apache.poi.xwpf.usermodel.XWPFRelation)4 TikaException (org.apache.tika.exception.TikaException)4 SAXException (org.xml.sax.SAXException)4 ByteArrayOutputStream (java.io.ByteArrayOutputStream)2 File (java.io.File)2 OutputStream (java.io.OutputStream)2 URI (java.net.URI)2 ZipException (java.util.zip.ZipException)2 XMLSignatureException (javax.xml.crypto.dsig.XMLSignatureException)2