Use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in the Apache Tika project.
Class XSSFBExcelExtractorDecorator, method getMainDocumentParts.
/**
 * Collects the package parts that should be scanned for this workbook.
 * <p>
 * In Excel files, sheets have things embedded in them, and sheet drawings
 * hold the images. For every sheet part the result contains the sheet itself,
 * followed by the internal targets of its drawing relationships and then the
 * internal targets of its VML drawing relationships. The parts related to the
 * package via {@code RELATION_OFFICE_DOCUMENT} are appended last.
 * <p>
 * Relationship targets that cannot be found in the package are skipped, so
 * the returned list never receives a {@code null} from a missing drawing.
 *
 * @return the sheet, drawing and office-document parts, in the order above
 * @throws TikaException if a relationship lookup or part name is invalid
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> parts = new ArrayList<>();
    // Both drawing flavours are resolved the same way; regular drawings first.
    String[] drawingRelations = {
            XSSFRelation.DRAWINGS.getRelation(),
            XSSFRelation.VML_DRAWINGS.getRelation()
    };
    for (PackagePart sheetPart : sheetParts) {
        // Add the sheet
        parts.add(sheetPart);
        // If it has drawings, return those too
        try {
            for (String relation : drawingRelations) {
                for (PackageRelationship rel : sheetPart.getRelationshipsByType(relation)) {
                    if (rel.getTargetMode() != TargetMode.INTERNAL) {
                        continue;
                    }
                    PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
                    PackagePart drawingPart = rel.getPackage().getPart(relName);
                    // parts can go missing from the package -- TIKA-2134;
                    // never hand a null part back to the caller
                    if (drawingPart != null) {
                        parts.add(drawingPart);
                    }
                }
            }
        } catch (InvalidFormatException e) {
            throw new TikaException("Broken OOXML file", e);
        }
    }
    //by AbstractOOXMLExtractor
    parts.addAll(extractor.getPackage().getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT));
    return parts;
}
Use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in the Apache Tika project.
Class XSSFExcelExtractorDecorator, method addDrawingHyperLinks.
/**
 * Records the hyperlinks attached to the drawings of a sheet.
 * <p>
 * Every internal drawing relationship of {@code sheetPart} is resolved to its
 * part, and each hyperlink relationship on that part is stored in
 * {@code drawingHyperlinks} as relationship id to target URI string.
 *
 * @param sheetPart the sheet whose drawing relationships are inspected
 */
protected void addDrawingHyperLinks(PackagePart sheetPart) {
    final String drawingRelation = XSSFRelation.DRAWINGS.getRelation();
    try {
        for (PackageRelationship drawingRel : sheetPart.getRelationshipsByType(drawingRelation)) {
            if (rel_isExternal(drawingRel.getTargetMode())) {
                continue;
            }
            PackagePartName drawingName = PackagingURIHelper.createPartName(drawingRel.getTargetURI());
            PackagePart drawingPart = drawingRel.getPackage().getPart(drawingName);
            //parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
            if (drawingPart == null) {
                continue;
            }
            final String hyperlinkRelation = XSSFRelation.SHEET_HYPERLINKS.getRelation();
            for (PackageRelationship linkRel : drawingPart.getRelationshipsByType(hyperlinkRelation)) {
                String target = linkRel.getTargetURI().toString();
                drawingHyperlinks.put(linkRel.getId(), target);
            }
        }
    } catch (InvalidFormatException ignored) {
        //swallow
        //an exception trying to extract
        //hyperlinks on drawings should not cause a parse failure
    }
}

/** Tells whether a relationship target mode is anything other than internal. */
private static boolean rel_isExternal(TargetMode mode) {
    return mode != TargetMode.INTERNAL;
}
Use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in the Apache Tika project.
Class OOXMLExtractorFactory, method parse.
/**
 * Parses an OOXML document: opens (or reuses) its OPC package, detects the
 * concrete media type, picks a POI text extractor, wraps it in the matching
 * Tika decorator and emits metadata followed by XHTML.
 * <p>
 * When the detected type is {@code null} or listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is delegated to
 * {@code EmptyParser} and nothing else is done.
 *
 * @param stream      the document stream
 * @param baseHandler receiver of the XHTML output
 * @param metadata    metadata to populate; the content type is set here
 * @param context     parse context; read for a {@code Locale} and for an
 *                    {@code OfficeParserConfig}, which is used without a null check
 * @throws IOException   propagated from the underlying stream or package calls
 * @throws SAXException  propagated from the content handler
 * @throws TikaException if no usable extractor can be created, or wrapping an
 *                       {@code IllegalArgumentException}, {@code OpenXML4JException}
 *                       or {@code XmlException} raised while parsing
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        // Locate or open the OPCPackage for the file
        final TikaInputStream tikaStream = TikaInputStream.cast(stream);
        final OPCPackage opcPackage;
        if (tikaStream != null && tikaStream.getOpenContainer() instanceof OPCPackage) {
            // A package is already attached to the stream: reuse it
            opcPackage = (OPCPackage) tikaStream.getOpenContainer();
        } else if (tikaStream != null && tikaStream.hasFile()) {
            // Open from the backing file and attach the package to the stream
            opcPackage = OPCPackage.open(tikaStream.getFile().getPath(), PackageAccess.READ);
            tikaStream.setOpenContainer(opcPackage);
        } else {
            // Open from the stream itself, wrapped in a close shield
            opcPackage = OPCPackage.open(new CloseShieldInputStream(stream));
        }

        // Get the type, and ensure it's one we handle
        final MediaType detectedType = ZipContainerDetector.detectOfficeOpenXML(opcPackage);
        final boolean supported = detectedType != null
                && !OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(detectedType);
        if (!supported) {
            // Not a supported type, delegate to Empty Parser
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, detectedType.toString());

        // Have the appropriate OOXML text extractor picked.
        // The config has already been set by OOXMLParser's call to configure(),
        // so we can rely on it being non-null.
        final OfficeParserConfig officeConfig = context.get(OfficeParserConfig.class);
        POIXMLTextExtractor textExtractor =
                officeConfig.getUseSAXDocxExtractor() ? trySXWPF(opcPackage) : null;
        if (textExtractor == null && officeConfig.getUseSAXPptxExtractor()) {
            textExtractor = trySXSLF(opcPackage);
        }
        if (textExtractor == null) {
            textExtractor = ExtractorFactory.createExtractor(opcPackage);
        }

        // Choose the decorator: event-based extractors are matched first,
        // then user-model extractors are matched by their document type.
        final POIXMLDocument document = textExtractor.getDocument();
        final OOXMLExtractor decorator;
        if (textExtractor instanceof XSSFBEventBasedExcelExtractor) {
            decorator = new XSSFBExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XSSFEventBasedExcelExtractor) {
            decorator = new XSSFExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XWPFEventBasedWordExtractor) {
            decorator = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) textExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (textExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            decorator = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) textExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (document == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. " + "The extractor returned was a " + textExtractor);
        } else if (document instanceof XMLSlideShow) {
            decorator = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) textExtractor);
        } else if (document instanceof XWPFDocument) {
            decorator = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) textExtractor);
        } else {
            decorator = new POIXMLTextExtractorDecorator(context, textExtractor);
        }

        // Get the bulk of the metadata first, so that it's accessible during
        // parsing if desired by the client (see TIKA-1109)
        decorator.getMetadataExtractor().extract(metadata);
        // Extract the text, along with any in-document metadata
        decorator.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        final String message = e.getMessage();
        if (message != null && message.startsWith("No supported documents found")) {
            throw new TikaException("TIKA-418: RuntimeException while getting content" + " for thmx and xps file types", e);
        }
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException | XmlException e) {
        // InvalidFormatException is an OpenXML4JException, so it lands here too
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in the Apache Tika project.
Class SXSLFPowerPointExtractorDecorator, method buildXHTML.
/**
 * Writes the presentation content to {@code xhtml}: comment authors are
 * loaded first, then every slide related to the main document is handled,
 * then the slide masters and finally the handout masters.
 * <p>
 * An {@code InvalidFormatException} (or, for a single slide, a
 * {@code ZipException}) is recorded in the metadata as a warning stack trace
 * instead of aborting the build.
 *
 * @param xhtml the handler receiving the generated content
 * @throws SAXException propagated from the called handlers
 * @throws IOException  propagated from the called helpers
 * @see XSLFPowerPointExtractor#getText()
 */
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException, IOException {
    loadCommentAuthors();

    PackageRelationshipCollection slideRels = null;
    try {
        slideRels = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }

    if (slideRels != null) {
        // An empty collection simply yields zero iterations.
        for (int slideIdx = 0; slideIdx < slideRels.size(); slideIdx++) {
            try {
                PackagePart slidePart = mainDocument.getRelatedPart(slideRels.getRelationship(slideIdx));
                handleSlidePart(slidePart, xhtml);
            } catch (InvalidFormatException | ZipException e) {
                // a broken slide is reported and the remaining slides still run
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
        }
    }

    // Slide masters: same text handler as below, wrapped in a PlaceHolderSkipper.
    String masterRelation = XSLFRelation.SLIDE_MASTER.getRelation();
    OOXMLWordAndPowerPointTextHandler masterText =
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>());
    handleBasicRelatedParts(masterRelation, "slide-master", mainDocument, new PlaceHolderSkipper(masterText));

    // Handout masters: passed to the text handler directly.
    OOXMLWordAndPowerPointTextHandler handoutText =
            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml), new HashMap<String, String>());
    handleBasicRelatedParts(HANDOUT_MASTER, "slide-handout-master", mainDocument, handoutText);
}
Use of org.apache.poi.openxml4j.exceptions.InvalidFormatException in the Apache Tika project.
Class SXSLFPowerPointExtractorDecorator, method getMainDocumentParts.
/**
 * Collects the package parts that should be scanned for this presentation.
 * <p>
 * In PowerPoint files, slides have things embedded in them, and slide
 * drawings hold the images. The result is built in this order: whatever
 * {@code addSlideParts} contributes for each resolvable slide, then the main
 * document part, then the slide-master parts and the handout-master parts.
 * <p>
 * An {@code InvalidFormatException} while resolving a relationship is
 * recorded in the metadata as a warning stack trace and the affected slide
 * or master is skipped; it never aborts the collection.
 *
 * @return the collected parts, in the order described above
 */
@Override
protected List<PackagePart> getMainDocumentParts() {
    List<PackagePart> parts = new ArrayList<>();
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?
    //TODO: consider: getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?
    PackageRelationshipCollection slidePRC = null;
    try {
        slidePRC = mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
    } catch (InvalidFormatException e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
    }
    if (slidePRC != null) {
        for (int i = 0; i < slidePRC.size(); i++) {
            PackagePart slidePart = null;
            try {
                slidePart = mainDocument.getRelatedPart(slidePRC.getRelationship(i));
            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
            // A slide that could not be resolved has already been reported above;
            // skip it instead of passing null on, mirroring the master loop below.
            if (slidePart != null) {
                addSlideParts(slidePart, parts);
            }
        }
    }
    parts.add(mainDocument);
    for (String rel : new String[] { XSLFRelation.SLIDE_MASTER.getRelation(), HANDOUT_MASTER }) {
        PackageRelationshipCollection prc = null;
        try {
            prc = mainDocument.getRelationshipsByType(rel);
        } catch (InvalidFormatException e) {
            metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
        }
        if (prc == null) {
            continue;
        }
        for (int i = 0; i < prc.size(); i++) {
            PackagePart pp = null;
            try {
                pp = mainDocument.getRelatedPart(prc.getRelationship(i));
            } catch (InvalidFormatException e) {
                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, ExceptionUtils.getStackTrace(e));
            }
            if (pp != null) {
                parts.add(pp);
            }
        }
    }
    return parts;
}
Aggregations