Search in sources :

Example 1 with XWPFEventBasedWordExtractor

Use of org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor in the Apache Tika project.

Class XSLFEventBasedPowerPointExtractor, method main.

/**
 * Command-line entry point: extracts the text of the file named by the first
 * argument and prints it to standard output.
 *
 * <p>When no argument is supplied, a usage message is written to standard error
 * and the JVM exits with status 1.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the file to read
 * @throws Exception if the extractor cannot be created or text extraction fails
 */
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Use:");
        System.err.println("  XSLFEventBasedPowerPointExtractor <filename.pptx>");
        System.exit(1);
    }
    // NOTE(review): the usage text advertises a .pptx extractor, yet a Word
    // extractor is constructed here - confirm which one is intended.
    // try-with-resources guarantees the extractor is closed even when getText() throws.
    try (XWPFEventBasedWordExtractor extractor = new XWPFEventBasedWordExtractor(args[0])) {
        System.out.println(extractor.getText());
    }
}
Also used : XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor)

Example 2 with XWPFEventBasedWordExtractor

Use of org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor in the Apache Tika project.

Class OOXMLExtractorFactory, method parse.

/**
 * Parses an OOXML document: obtains an {@link OPCPackage} for the input, picks a
 * POI text extractor for it, wraps that extractor in the matching decorator, and
 * then emits metadata followed by XHTML content.
 *
 * <p>If the detected type is {@code null} or listed in
 * {@code OOXMLParser.UNSUPPORTED_OOXML_TYPES}, parsing is handed to
 * {@code EmptyParser.INSTANCE} and this method returns.
 *
 * @param stream      the document input; a {@link TikaInputStream} allows an already
 *                    open container or a backing file to be reused
 * @param baseHandler receives the XHTML output
 * @param metadata    populated with the content type and extracted metadata
 * @param context     supplies the {@link Locale} (defaulting to the JVM default) and
 *                    the {@code OfficeParserConfig}
 * @throws IOException   declared; may propagate from the calls made here
 * @throws SAXException  declared; may propagate from the calls made here
 * @throws TikaException when the extractor cannot be created, or when a user-model
 *                       extractor was expected but returned no document
 */
public static void parse(InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final Locale locale = context.get(Locale.class, Locale.getDefault());
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        // Obtain the package: reuse an open container, else open the backing file,
        // else read from the raw stream (shielded so the caller's stream is not closed).
        final TikaInputStream tikaStream = TikaInputStream.cast(stream);
        final OPCPackage opcPackage;
        if (tikaStream != null && tikaStream.getOpenContainer() instanceof OPCPackage) {
            opcPackage = (OPCPackage) tikaStream.getOpenContainer();
        } else if (tikaStream != null && tikaStream.hasFile()) {
            opcPackage = OPCPackage.open(tikaStream.getFile().getPath(), PackageAccess.READ);
            tikaStream.setOpenContainer(opcPackage);
        } else {
            opcPackage = OPCPackage.open(new CloseShieldInputStream(stream));
        }

        // Detect the concrete OOXML type and bail out early for anything unsupported.
        final MediaType detectedType = ZipContainerDetector.detectOfficeOpenXML(opcPackage);
        final boolean supported = detectedType != null
                && !OOXMLParser.UNSUPPORTED_OOXML_TYPES.contains(detectedType);
        if (!supported) {
            EmptyParser.INSTANCE.parse(stream, baseHandler, metadata, context);
            return;
        }
        metadata.set(Metadata.CONTENT_TYPE, detectedType.toString());

        // The config was already placed in the context by OOXMLParser's call to
        // configure(), so it is relied upon to be non-null here.
        final OfficeParserConfig officeConfig = context.get(OfficeParserConfig.class);

        // Try the SAX-based extractors first (when enabled), then fall back to POI's factory.
        POIXMLTextExtractor textExtractor =
                officeConfig.getUseSAXDocxExtractor() ? trySXWPF(opcPackage) : null;
        if (textExtractor == null && officeConfig.getUseSAXPptxExtractor()) {
            textExtractor = trySXSLF(opcPackage);
        }
        if (textExtractor == null) {
            textExtractor = ExtractorFactory.createExtractor(opcPackage);
        }

        // Choose the decorator. Event-based extractors are matched by their own type
        // first; only afterwards is the user-model document inspected.
        final POIXMLDocument userModelDocument = textExtractor.getDocument();
        final OOXMLExtractor decorated;
        if (textExtractor instanceof XSSFBEventBasedExcelExtractor) {
            decorated = new XSSFBExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XSSFEventBasedExcelExtractor) {
            decorated = new XSSFExcelExtractorDecorator(context, textExtractor, locale);
        } else if (textExtractor instanceof XWPFEventBasedWordExtractor) {
            decorated = new SXWPFWordExtractorDecorator(metadata, context, (XWPFEventBasedWordExtractor) textExtractor);
            metadata.add("X-Parsed-By", XWPFEventBasedWordExtractor.class.getCanonicalName());
        } else if (textExtractor instanceof XSLFEventBasedPowerPointExtractor) {
            decorated = new SXSLFPowerPointExtractorDecorator(metadata, context, (XSLFEventBasedPowerPointExtractor) textExtractor);
            metadata.add("X-Parsed-By", XSLFEventBasedPowerPointExtractor.class.getCanonicalName());
        } else if (userModelDocument == null) {
            throw new TikaException("Expecting UserModel based POI OOXML extractor with a document, but none found. The extractor returned was a " + textExtractor);
        } else if (userModelDocument instanceof XMLSlideShow) {
            decorated = new XSLFPowerPointExtractorDecorator(context, (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) textExtractor);
        } else if (userModelDocument instanceof XWPFDocument) {
            decorated = new XWPFWordExtractorDecorator(context, (XWPFWordExtractor) textExtractor);
        } else {
            decorated = new POIXMLTextExtractorDecorator(context, textExtractor);
        }

        // Pull the bulk of the metadata first so it is available to the client
        // while the content is being parsed (see TIKA-1109).
        decorated.getMetadataExtractor().extract(metadata);
        // Then emit the text together with any in-document metadata.
        decorated.getXHTML(baseHandler, metadata, context);
    } catch (IllegalArgumentException e) {
        final String reason = e.getMessage();
        final boolean noSupportedDocuments =
                reason != null && reason.startsWith("No supported documents found");
        if (noSupportedDocuments) {
            throw new TikaException("TIKA-418: RuntimeException while getting content for thmx and xps file types", e);
        }
        throw new TikaException("Error creating OOXML extractor", e);
    } catch (OpenXML4JException | XmlException e) {
        // InvalidFormatException is an OpenXML4JException, so this one handler covers it too.
        throw new TikaException("Error creating OOXML extractor", e);
    }
}
Also used : Locale(java.util.Locale) TikaInputStream(org.apache.tika.io.TikaInputStream) XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor) InvalidFormatException(org.apache.poi.openxml4j.exceptions.InvalidFormatException) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) MediaType(org.apache.tika.mime.MediaType) XWPFDocument(org.apache.poi.xwpf.usermodel.XWPFDocument) XSLFEventBasedPowerPointExtractor(org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor) TikaException(org.apache.tika.exception.TikaException) XSSFBEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIXMLDocument(org.apache.poi.POIXMLDocument) POIXMLTextExtractor(org.apache.poi.POIXMLTextExtractor) XmlException(org.apache.xmlbeans.XmlException) XMLSlideShow(org.apache.poi.xslf.usermodel.XMLSlideShow) OPCPackage(org.apache.poi.openxml4j.opc.OPCPackage) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 3 with XWPFEventBasedWordExtractor

Use of org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor in the Apache Tika project.

Class OOXMLExtractorFactory, method trySXWPF.

/**
 * Attempts to create an event-based Word extractor for the given package.
 *
 * <p>The package's office-document relationship is looked up under two relationship
 * type URIs in turn. The content type of the part it targets is then compared with
 * the content types of {@code XWPFWordExtractor.SUPPORTED_TYPES}.
 *
 * @param pkg the opened package to inspect
 * @return a new {@link XWPFEventBasedWordExtractor} when the main part's content type
 *         matches a supported Word relation; {@code null} when no office-document
 *         relationship exists, the targeted part is missing, or the type does not match
 * @throws XmlException       declared; may propagate from the extractor constructor
 * @throws OpenXML4JException declared; may propagate from the package calls
 * @throws IOException        declared; may propagate from the extractor constructor
 */
private static POIXMLTextExtractor trySXWPF(OPCPackage pkg) throws XmlException, OpenXML4JException, IOException {
    PackageRelationshipCollection coreRelationships = pkg.getRelationshipsByType("http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument");
    if (coreRelationships.size() == 0) {
        // Nothing under the first URI: retry with the alternate relationship type URI.
        coreRelationships = pkg.getRelationshipsByType("http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument");
    }
    if (coreRelationships.size() == 0) {
        return null;
    }
    PackagePart corePart = pkg.getPart(coreRelationships.getRelationship(0));
    if (corePart == null) {
        // The relationship targets a part that is absent from the package; report
        // "not handled" so the caller can fall back, rather than failing with an NPE.
        return null;
    }
    String targetContentType = corePart.getContentType();
    for (XWPFRelation relation : XWPFWordExtractor.SUPPORTED_TYPES) {
        if (targetContentType.equals(relation.getContentType())) {
            return new XWPFEventBasedWordExtractor(pkg);
        }
    }
    return null;
}
Also used : XWPFRelation(org.apache.poi.xwpf.usermodel.XWPFRelation) PackageRelationshipCollection(org.apache.poi.openxml4j.opc.PackageRelationshipCollection) PackagePart(org.apache.poi.openxml4j.opc.PackagePart) XWPFEventBasedWordExtractor(org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor)

Aggregations

XWPFEventBasedWordExtractor (org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor)3 InputStream (java.io.InputStream)1 Locale (java.util.Locale)1 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)1 POIXMLDocument (org.apache.poi.POIXMLDocument)1 POIXMLTextExtractor (org.apache.poi.POIXMLTextExtractor)1 InvalidFormatException (org.apache.poi.openxml4j.exceptions.InvalidFormatException)1 OpenXML4JException (org.apache.poi.openxml4j.exceptions.OpenXML4JException)1 OPCPackage (org.apache.poi.openxml4j.opc.OPCPackage)1 PackagePart (org.apache.poi.openxml4j.opc.PackagePart)1 PackageRelationshipCollection (org.apache.poi.openxml4j.opc.PackageRelationshipCollection)1 XMLSlideShow (org.apache.poi.xslf.usermodel.XMLSlideShow)1 XSSFBEventBasedExcelExtractor (org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor)1 XSSFEventBasedExcelExtractor (org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor)1 XWPFWordExtractor (org.apache.poi.xwpf.extractor.XWPFWordExtractor)1 XWPFDocument (org.apache.poi.xwpf.usermodel.XWPFDocument)1 XWPFRelation (org.apache.poi.xwpf.usermodel.XWPFRelation)1 TikaException (org.apache.tika.exception.TikaException)1 TikaInputStream (org.apache.tika.io.TikaInputStream)1 MediaType (org.apache.tika.mime.MediaType)1