Search in sources:

Example 56 with PackagePart

use of org.apache.poi.openxml4j.opc.PackagePart in project tika by apache.

the class XWPFEventBasedWordExtractor method loadNumbering.

/**
 * Builds an {@link XWPFNumbering} from the first numbering relationship of the given part.
 *
 * @param packagePart the part whose relationships of type
 *                    {@code XWPFRelation.NUMBERING.getRelation()} are inspected
 * @return the numbering wrapper, or {@code null} when there is no numbering relationship,
 *         when the relationship or its target part cannot be resolved, or when loading
 *         fails (the failure is logged as a warning and not propagated)
 */
private XWPFNumbering loadNumbering(PackagePart packagePart) {
    try {
        PackageRelationshipCollection rels = packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
        if (rels.size() <= 0) {
            return null;
        }
        // only the first numbering relationship is considered
        PackageRelationship firstRel = rels.getRelationship(0);
        if (firstRel == null) {
            return null;
        }
        PackagePart target = container.getPart(firstRel);
        return target == null ? null : new XWPFNumbering(target);
    } catch (IOException | OpenXML4JException e) {
        LOG.warn("Couldn't load numbering", e);
        return null;
    }
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) XWPFNumbering(org.apache.poi.xwpf.usermodel.XWPFNumbering) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) IOException(java.io.IOException) PackagePart(org.apache.poi.openxml4j.opc.PackagePart)

Example 57 with PackagePart

use of org.apache.poi.openxml4j.opc.PackagePart in project tika by apache.

the class XWPFEventBasedWordExtractor method handleDocumentPart.

/**
 * Processes the main document part together with its related parts, in this order:
 * headers, the document part itself, then footnotes, comments, footers and endnotes.
 * Every part is passed to {@code handlePart} with the same list manager and buffer.
 *
 * @param documentPart the main document part
 * @param sb           buffer handed to {@code handlePart} for every processed part
 * @throws IOException  as declared; propagated from the calls made here
 * @throws SAXException as declared; propagated from the calls made here
 */
private void handleDocumentPart(PackagePart documentPart, StringBuilder sb) throws IOException, SAXException {
    // the list manager is built from the numbering found on the main document part
    XWPFListManager listManager = new XWPFListManager(loadNumbering(documentPart));

    // headers come first
    try {
        PackageRelationshipCollection headerRels = documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
        if (headerRels != null) {
            int idx = 0;
            while (idx < headerRels.size()) {
                PackagePart headerPart = documentPart.getRelatedPart(headerRels.getRelationship(idx));
                handlePart(headerPart, listManager, sb);
                idx++;
            }
        }
    } catch (InvalidFormatException e) {
        LOG.warn("Invalid format", e);
    }

    // then the body of the document
    handlePart(documentPart, listManager, sb);

    // remaining components are simply appended at the end, one relation type at a time
    XWPFRelation[] trailingRelations = { XWPFRelation.FOOTNOTE, XWPFRelation.COMMENT, XWPFRelation.FOOTER, XWPFRelation.ENDNOTE };
    for (XWPFRelation relation : trailingRelations) {
        try {
            PackageRelationshipCollection rels = documentPart.getRelationshipsByType(relation.getRelation());
            if (rels == null) {
                continue;
            }
            for (int idx = 0; idx < rels.size(); idx++) {
                PackagePart related = documentPart.getRelatedPart(rels.getRelationship(idx));
                handlePart(related, listManager, sb);
            }
        } catch (InvalidFormatException e) {
            // a failure only abandons the current relation type; the next one is still tried
            LOG.warn("Invalid format", e);
        }
    }
}
Also used : XWPFRelation(org.apache.poi.xwpf.usermodel.XWPFRelation) XWPFNumbering(org.apache.poi.xwpf.usermodel.XWPFNumbering) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) XWPFListManager(org.apache.tika.parser.microsoft.ooxml.XWPFListManager) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 58 with PackagePart

use of org.apache.poi.openxml4j.opc.PackagePart in project tika by apache.

the class OOXMLExtractorFactory method trySXSLF.

/**
 * Tries to create an event-based PowerPoint extractor for the given package.
 * <p>
 * The core part is located through the package's officeDocument relationship; the
 * {@code schemas.openxmlformats.org} relationship type is tried first and the
 * {@code purl.oclc.org} one is used as a fallback. The extractor is created only when
 * the core part's content type equals that of one of
 * {@code XSLFPowerPointExtractor.SUPPORTED_TYPES} or of {@code XSLFRelation.THEME_MANAGER}.
 *
 * @param pkg the package to inspect
 * @return a new {@link XSLFEventBasedPowerPointExtractor}, or {@code null} when the package
 *         has no officeDocument relationship, when the relationship or its target part
 *         cannot be resolved, or when the content type does not match
 * @throws XmlException       as declared
 * @throws OpenXML4JException as declared; propagated from the package calls made here
 * @throws IOException        as declared
 */
private static POIXMLTextExtractor trySXSLF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
    PackageRelationshipCollection packageRelationshipCollection = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
    if (packageRelationshipCollection.size() == 0) {
        packageRelationshipCollection = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
    }
    if (packageRelationshipCollection.size() == 0) {
        return null;
    }
    PackageRelationship coreRelationship = packageRelationshipCollection.getRelationship(0);
    if (coreRelationship == null) {
        return null;
    }
    PackagePart corePart = pkg.getPart(coreRelationship);
    if (corePart == null) {
        // the relationship points at a part that is not in the package; report
        // "not handled here" instead of failing with a NullPointerException below
        return null;
    }
    String targetContentType = corePart.getContentType();
    boolean supported = XSLFRelation.THEME_MANAGER.getContentType().equals(targetContentType);
    if (!supported) {
        for (XSLFRelation xslfRelation : org.apache.poi.xslf.extractor.XSLFPowerPointExtractor.SUPPORTED_TYPES) {
            if (xslfRelation.getContentType().equals(targetContentType)) {
                supported = true;
                break;
            }
        }
    }
    return supported ? new XSLFEventBasedPowerPointExtractor(pkg) : null;
}
Also used : PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) XSLFRelation(org.apache.poi.xslf.usermodel.XSLFRelation) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) XSLFEventBasedPowerPointExtractor(org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor)

Example 59 with PackagePart

use of org.apache.poi.openxml4j.opc.PackagePart in project tika by apache.

the class SXSLFPowerPointExtractorDecorator method handleBasicRelatedParts.

/**
 * Emits the parts related to {@code parentPart} (comments, master, notes, etc.)
 * inside a single {@code div} element.
 * <p>
 * Nothing is emitted when no matching relationships are found. Problems with an
 * individual part are recorded in the metadata under
 * {@code TikaCoreProperties.TIKA_META_EXCEPTION_WARNING} and the remaining parts are
 * still processed.
 *
 * @param contentType     value passed to {@code getRelationshipsByType} to select the related parts
 * @param xhtmlClassLabel value of the {@code class} attribute on the wrapping {@code div}
 * @param parentPart      part whose related parts are processed
 * @param contentHandler  handler that receives the {@code div} and the parsed content
 * @throws SAXException as declared; propagated from the handler and parser calls made here
 */
private void handleBasicRelatedParts(String contentType, String xhtmlClassLabel, PackagePart parentPart, ContentHandler contentHandler) throws SAXException {
    PackageRelationshipCollection rels;
    try {
        rels = parentPart.getRelationshipsByType(contentType);
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        return;
    }
    if (rels == null || rels.size() <= 0) {
        return;
    }

    AttributesImpl divAttributes = new AttributesImpl();
    divAttributes.addAttribute("", "class", "class", "CDATA", xhtmlClassLabel);
    contentHandler.startElement("", "div", "div", divAttributes);

    for (int idx = 0; idx < rels.size(); idx++) {
        PackageRelationship rel = rels.getRelationship(idx);
        PackagePart related;
        try {
            related = parentPart.getRelatedPart(rel);
        } catch (InvalidFormatException e) {
            // record the problem and move on to the next related part
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            continue;
        }
        try (InputStream stream = related.getInputStream()) {
            context.getSAXParser().parse(stream, new OfflineContentHandler(new EmbeddedContentHandler(contentHandler)));
        } catch (IOException | TikaException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
    }

    contentHandler.endElement("", "div", "div");
}
Also used : PackageRelationship(org.apache.poi.openxml4j.opc.PackageRelationship) AttributesImpl(org.xml.sax.helpers.AttributesImpl) OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) InputStream(java.io.InputStream) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) IOException(java.io.IOException) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Example 60 with PackagePart

use of org.apache.poi.openxml4j.opc.PackagePart in project tika by apache.

the class SXWPFWordExtractorDecorator method addRelatedParts.

/**
 * Appends to {@code relatedParts} every part reachable from {@code documentPart}
 * through one of the relationship types in {@code MAIN_PART_RELATIONS}.
 *
 * @param documentPart part whose relationships are followed
 * @param relatedParts list that receives the resolved parts, in the order they are found
 */
private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
    for (String relation : MAIN_PART_RELATIONS) {
        try {
            PackageRelationshipCollection rels = documentPart.getRelationshipsByType(relation);
            if (rels == null) {
                continue;
            }
            for (int idx = 0; idx < rels.size(); idx++) {
                relatedParts.add(documentPart.getRelatedPart(rels.getRelationship(idx)));
            }
        } catch (InvalidFormatException ignored) {
            // Silently skipped: the rest of this relation type is abandoned and the loop
            // moves on to the next one. NOTE(review): nothing is logged or recorded here.
        }
    }
}
Also used : PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException)

Aggregations

PackagePart (org.apache.poi.openxml4j.opc.PackagePart)118 OutputStream (java.io.OutputStream)38 PackageRelationship (org.apache.poi.openxml4j.opc.PackageRelationship)27 OPCPackage (org.apache.poi.openxml4j.opc.OPCPackage)25 InvalidFormatException (org.apache.poi.openxml4j.exceptions.InvalidFormatException)24 PackageRelationshipCollection (org.apache.poi.openxml4j.opc.PackageRelationshipCollection)23 PackagePartName (org.apache.poi.openxml4j.opc.PackagePartName)19 QName (javax.xml.namespace.QName)18 IOException (java.io.IOException)17 XmlOptions (org.apache.xmlbeans.XmlOptions)17 InputStream (java.io.InputStream)11 Test (org.junit.Test)11 ByteArrayOutputStream (java.io.ByteArrayOutputStream)9 POIXMLException (org.apache.poi.POIXMLException)8 XmlException (org.apache.xmlbeans.XmlException)8 OpenXML4JException (org.apache.poi.openxml4j.exceptions.OpenXML4JException)7 ArrayList (java.util.ArrayList)6 TikaException (org.apache.tika.exception.TikaException)6 URI (java.net.URI)5 SAXException (org.xml.sax.SAXException)5