use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.
the class XSLFPowerPointExtractorDecorator method getMainDocumentParts.
/**
 * Collects the package parts that make up the presentation.
 * <p>
 * In PowerPoint files, slides have things embedded in them, and slide
 * drawings which have the images. The returned list contains, in order:
 * whatever {@code addSlideParts} contributes for each referenced slide,
 * the presentation's own package part (so that macros are included), and
 * the parts related through the slide-master and handout-master
 * relationships.
 *
 * @return the collected parts
 * @throws TikaException if the slide show cannot be opened or a slide part
 *                       cannot be resolved; the underlying exception is
 *                       attached as the cause
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> parts = new ArrayList<>();
    XSLFSlideShow document;
    try {
        document = new XSLFSlideShow(extractor.getPackage());
    } catch (Exception e) {
        // Shouldn't happen, but if it does keep the original exception as
        // the cause so the stack trace is not lost.
        throw new TikaException(e.getMessage(), e);
    }

    CTSlideIdList ctSlideIdList = document.getSlideReferences();
    if (ctSlideIdList != null) {
        for (int i = 0; i < ctSlideIdList.sizeOfSldIdArray(); i++) {
            CTSlideIdListEntry ctSlide = ctSlideIdList.getSldIdArray(i);
            // Add the slide
            PackagePart slidePart;
            try {
                slidePart = document.getSlidePart(ctSlide);
            } catch (IOException | XmlException e) {
                throw new TikaException("Broken OOXML file", e);
            }
            addSlideParts(slidePart, parts);
        }
    }

    // Add the full document to include macros
    PackagePart mainPart = document.getPackagePart();
    parts.add(mainPart);

    for (String rel : new String[] { XSLFRelation.SLIDE_MASTER.getRelation(), HANDOUT_MASTER }) {
        try {
            PackageRelationshipCollection prc = mainPart.getRelationshipsByType(rel);
            if (prc == null) {
                // Defensive: nothing to add for this relationship type.
                continue;
            }
            for (int i = 0; i < prc.size(); i++) {
                PackagePart pp = mainPart.getRelatedPart(prc.getRelationship(i));
                if (pp != null) {
                    parts.add(pp);
                }
            }
        } catch (InvalidFormatException ignored) {
            // Deliberately best-effort: a malformed master relationship
            // must not prevent the parts gathered so far from being returned.
        }
    }
    return parts;
}
use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.
the class SXWPFWordExtractorDecorator method handleDocumentPart.
/**
 * Emits a main document part together with the parts related to it.
 * <p>
 * Numbering (wrapped in a list manager) and styles are loaded from the
 * main document part first. Content is then handled in this order:
 * headers, the main document itself, and finally footnotes, comments,
 * footers and endnotes. The caught failures are recorded on
 * {@code metadata} under {@code TIKA_META_EXCEPTION_WARNING} instead of
 * aborting the remaining steps.
 *
 * @param documentPart the main document part
 * @param xhtml        handler passed through to {@code handlePart}
 * @throws IOException  if handling fails with an I/O error other than a
 *                      {@link ZipException}
 * @throws SAXException declared for failures propagated from
 *                      {@code handlePart}
 */
private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml) throws IOException, SAXException {
    // The list manager and styles both come from the main document part.
    XWPFListManager listManager = new XWPFListManager(loadNumbering(documentPart));
    XWPFStylesShim styles = null;
    try {
        styles = loadStyles(documentPart);
    } catch (Exception e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }

    // Headers come first.
    handleRelatedParts(documentPart, XWPFRelation.HEADER, styles, listManager, xhtml);

    // Then the main document.
    try {
        handlePart(documentPart, styles, listManager, xhtml);
    } catch (ZipException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }

    // For now, the remaining components are simply dumped at the end.
    XWPFRelation[] trailing = {
        XWPFRelation.FOOTNOTE, XWPFRelation.COMMENT, XWPFRelation.FOOTER, XWPFRelation.ENDNOTE
    };
    for (XWPFRelation relation : trailing) {
        handleRelatedParts(documentPart, relation, styles, listManager, xhtml);
    }
}

/**
 * Handles every part related to {@code documentPart} through the given
 * relation. An {@link InvalidFormatException} or {@link ZipException}
 * stops processing of this relation only and is recorded on
 * {@code metadata}.
 */
private void handleRelatedParts(PackagePart documentPart, XWPFRelation relation, XWPFStylesShim styles,
        XWPFListManager listManager, XHTMLContentHandler xhtml) throws IOException, SAXException {
    try {
        PackageRelationshipCollection related = documentPart.getRelationshipsByType(relation.getRelation());
        if (related == null) {
            return;
        }
        for (int idx = 0; idx < related.size(); idx++) {
            PackagePart relatedPart = documentPart.getRelatedPart(related.getRelationship(idx));
            handlePart(relatedPart, styles, listManager, xhtml);
        }
    } catch (InvalidFormatException | ZipException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
}
use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.
the class XWPFWordExtractorDecorator method addRelatedParts.
/**
 * Appends to {@code relatedParts} every part reachable from
 * {@code documentPart} through one of the {@code MAIN_PART_RELATIONS}
 * relationship types.
 * <p>
 * Relationships whose target part resolves to {@code null} are skipped so
 * that the list never contains {@code null} entries.
 *
 * @param documentPart the part whose relationships are followed
 * @param relatedParts the list that receives the related parts
 */
private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
    for (String relation : MAIN_PART_RELATIONS) {
        try {
            PackageRelationshipCollection prc = documentPart.getRelationshipsByType(relation);
            if (prc == null) {
                continue;
            }
            for (int i = 0; i < prc.size(); i++) {
                PackagePart packagePart = documentPart.getRelatedPart(prc.getRelationship(i));
                if (packagePart != null) {
                    relatedParts.add(packagePart);
                }
            }
        } catch (InvalidFormatException ignored) {
            // Deliberately best-effort: a malformed relationship of one
            // type must not stop the remaining types from being collected.
        }
    }
}
use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.
the class XWPFEventBasedWordExtractor method loadHyperlinkRelationships.
/**
 * Builds a map from relationship id to target URL for the hyperlink
 * relationships of {@code bodyPart}.
 * <p>
 * Relationships that are {@code null}, or that lack an id or a target
 * URI, are left out. If the relationships cannot be read because of an
 * {@link InvalidFormatException}, a warning is logged and whatever was
 * collected up to that point is returned.
 *
 * @param bodyPart the part whose hyperlink relationships are read
 * @return a map of relationship id to target URL; never {@code null}
 */
private Map<String, String> loadHyperlinkRelationships(PackagePart bodyPart) {
    Map<String, String> linksById = new HashMap<>();
    try {
        PackageRelationshipCollection links = bodyPart.getRelationshipsByType(XWPFRelation.HYPERLINK.getRelation());
        for (int idx = 0; idx < links.size(); idx++) {
            PackageRelationship link = links.getRelationship(idx);
            if (link == null) {
                continue;
            }
            String id = link.getId();
            String url = null;
            if (link.getTargetURI() != null) {
                url = link.getTargetURI().toString();
            }
            if (id == null || url == null) {
                continue;
            }
            linksById.put(id, url);
        }
    } catch (InvalidFormatException e) {
        LOG.warn("Invalid format", e);
    }
    return linksById;
}
use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in project tika by apache.
the class ZipContainerDetector method detectOPCBased.
/**
 * Tries to identify an OPC-based container by opening it with POI.
 * <p>
 * The opened package is registered on the stream as its open container.
 * Detection is attempted in order: Office Open XML, then XPS, then
 * AutoCAD; the first non-null result is returned.
 *
 * @param stream the stream to inspect
 * @return the detected media type, or {@code null} when the type is not
 *         recognised or an {@link IOException}, {@link RuntimeException}
 *         or {@code InvalidFormatException} occurs
 */
private static MediaType detectOPCBased(TikaInputStream stream) {
    try {
        // Use POI to open and investigate the package for us. No prior
        // check for "_rels/.rels" or "[Content_Types].xml" entries is made.
        OPCPackage opcPackage = OPCPackage.open(stream.getFile().getPath(), PackageAccess.READ);
        stream.setOpenContainer(opcPackage);

        // Is it an OOXML format?
        MediaType detected = detectOfficeOpenXML(opcPackage);
        if (detected == null) {
            // Is it XPS format?
            detected = detectXPSOPC(opcPackage);
        }
        if (detected == null) {
            // Is it an AutoCAD format?
            detected = detectAutoCADOPC(opcPackage);
        }
        // Still null here means we don't know what it is, sorry.
        return detected;
    } catch (IOException | RuntimeException | InvalidFormatException e) {
        return null;
    }
}
Aggregations