Search in sources :

Example 1 with OOXMLParser

use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.

the class TikaParsersTest method testGetHTML.

/**
 * Requests the parser listing as HTML, once without and once with details,
 * and checks which parser names and mime types appear in the returned page.
 */
@Test
public void testGetHTML() throws Exception {
    final boolean[] detailModes = { false, true };
    for (boolean withDetails : detailModes) {
        String url = endPoint + getPath(withDetails);
        Response httpResponse = WebClient.create(url)
                .type("text/html")
                .accept("text/html")
                .get();
        InputStream body = (InputStream) httpResponse.getEntity();
        String html = getStringFromInputStream(body);

        // Parser headings and class names are expected in both modes
        String[] alwaysPresent = {
            "<h2>DefaultParser</h2>",
            "Composite",
            "<h3>OpusParser",
            "<h3>PackageParser",
            "<h3>OOXMLParser",
            OpusParser.class.getName(),
            PackageParser.class.getName(),
            OOXMLParser.class.getName()
        };
        for (String fragment : alwaysPresent) {
            assertContains(fragment, html);
        }

        if (!withDetails) {
            // Without details, no mime types should be listed at all
            assertNotFound("text/plain", html);
            assertNotFound("application/pdf", html);
            assertNotFound("audio/ogg", html);
        } else {
            // With details, the handled mime types appear as list items
            assertContains("<li>text/plain", html);
            assertContains("<li>application/pdf", html);
            assertContains("<li>audio/ogg", html);
        }
    }
}
Also used : Response(javax.ws.rs.core.Response) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) PackageParser(org.apache.tika.parser.pkg.PackageParser) OpusParser(org.gagravarr.tika.OpusParser) Test(org.junit.Test)

Example 2 with OOXMLParser

use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.

the class TikaToXMP method initialize.

/**
 * Registers one converter class per supported document family, keyed by the
 * media types that the corresponding parser reports as supported.
 */
private static void initialize() {
    // The parsers are only queried for their supported types,
    // so a plain, empty context is sufficient
    final ParseContext ctx = new ParseContext();

    // MS Office Binary File Format
    OfficeParser officeBinaryParser = new OfficeParser();
    addConverter(officeBinaryParser.getSupportedTypes(ctx), MSOfficeBinaryConverter.class);

    // Rich Text Format
    RTFParser rtfParser = new RTFParser();
    addConverter(rtfParser.getSupportedTypes(ctx), RTFConverter.class);

    // MS Open XML Format
    OOXMLParser ooxmlParser = new OOXMLParser();
    addConverter(ooxmlParser.getSupportedTypes(ctx), MSOfficeXMLConverter.class);

    // Open document format
    OpenDocumentParser odfParser = new OpenDocumentParser();
    addConverter(odfParser.getSupportedTypes(ctx), OpenDocumentConverter.class);
}
Also used : RTFParser(org.apache.tika.parser.rtf.RTFParser) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) OpenDocumentParser(org.apache.tika.parser.odf.OpenDocumentParser) OfficeParser(org.apache.tika.parser.microsoft.OfficeParser) ParseContext(org.apache.tika.parser.ParseContext)

Example 3 with OOXMLParser

use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.

the class ExcelParserTest method testExcel95.

/**
 * Excel 5 and Excel 95 are legacy formats with only basic support: checks
 * type detection, which parser claims the type, and the extracted text and
 * metadata for one sample file of each format.
 */
@Test
public void testExcel95() throws Exception {
    final Detector typeDetector = new DefaultDetector();
    final AutoDetectParser autoParser = new AutoDetectParser();
    MediaType detected;

    // Detection, Excel 5
    Metadata meta = new Metadata();
    meta.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
    try (InputStream in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        detected = typeDetector.detect(in, meta);
        assertEquals("application/vnd.ms-excel", detected.toString());
    }

    // Detection, Excel 95
    meta = new Metadata();
    meta.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
    try (InputStream in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        detected = typeDetector.detect(in, meta);
        assertEquals("application/vnd.ms-excel", detected.toString());
    }

    // The binary Office parser claims the detected type ...
    boolean handledByOfficeParser = new OfficeParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(true, handledByOfficeParser);
    // ... whereas the OOXML parser does not
    boolean handledByOoxmlParser = new OOXMLParser().getSupportedTypes(new ParseContext()).contains(detected);
    assertEquals(false, handledByOoxmlParser);

    // Full parse, Excel 5
    meta = new Metadata();
    try (InputStream in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) {
        ContentHandler textHandler = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        autoParser.parse(in, textHandler, meta, parseContext);
        String extracted = textHandler.toString();

        // Sheet names
        assertContains("Feuil1", extracted);
        assertContains("Feuil3", extracted);
        // Cell text
        assertContains("Sample Excel", extracted);
        assertContains("Number", extracted);
        // Cell numbers
        assertContains("15", extracted);
        assertContains("225", extracted);
        // Document metadata is populated as well
        assertEquals("Simple Excel document", meta.get(TikaCoreProperties.TITLE));
        assertEquals("Keith Bennett", meta.get(TikaCoreProperties.CREATOR));
    }

    // Full parse, Excel 95
    meta = new Metadata();
    try (InputStream in = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) {
        ContentHandler textHandler = new BodyContentHandler(-1);
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);
        autoParser.parse(in, textHandler, meta, parseContext);
        String extracted = textHandler.toString();

        // Only the sheet name is checked: the file has no real text or numbers
        assertContains("Foglio1", extracted);
        // Document metadata is populated as well (this file carries no title)
        assertEquals(null, meta.get(TikaCoreProperties.TITLE));
        assertEquals("Marco Quaranta", meta.get(Office.LAST_AUTHOR));
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Detector(org.apache.tika.detect.Detector) DefaultDetector(org.apache.tika.detect.DefaultDetector) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 4 with OOXMLParser

use of org.apache.tika.parser.microsoft.ooxml.OOXMLParser in project tika by apache.

the class OfficeParser method parse.

/**
 * Parses the document held in the given POIFS directory tree: summary
 * metadata is extracted first, then the detected document type selects the
 * extractor that writes the content to {@code xhtml}.
 *
 * @param root     root directory node of the document to parse
 * @param context  parse context; consulted for a {@link Locale} (workbooks)
 *                 and a {@link PasswordProvider} (encrypted documents)
 * @param metadata metadata object populated during parsing
 * @param xhtml    handler receiving the extracted content
 * @throws IOException   if reading or closing an underlying stream fails
 * @throws SAXException  if the content handler rejects an event
 * @throws TikaException if extraction fails
 * @throws EncryptedDocumentException if the document is encrypted and the
 *         password cannot be verified, or decryption fails with a
 *         {@link GeneralSecurityException}
 */
protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml) throws IOException, SAXException, TikaException {
    // Parse summary entries first, to make metadata available early
    new SummaryExtractor(metadata).parseSummaries(root);
    // Parse remaining document entries
    POIFSDocumentType type = POIFSDocumentType.detectType(root);
    if (type != POIFSDocumentType.UNKNOWN) {
        setType(metadata, type.getType());
    }
    switch(type) {
        case SOLIDWORKS_PART:
        case SOLIDWORKS_ASSEMBLY:
        case SOLIDWORKS_DRAWING:
            // No content extraction is performed for these types
            break;
        case PUBLISHER:
            PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root);
            xhtml.element("p", publisherTextExtractor.getText());
            break;
        case WORDDOCUMENT:
            new WordExtractor(context, metadata).parse(root, xhtml);
            break;
        case POWERPOINT:
            new HSLFExtractor(context, metadata).parse(root, xhtml);
            break;
        case WORKBOOK:
        case XLR:
            // Use the caller-supplied locale if present, else the JVM default
            Locale locale = context.get(Locale.class, Locale.getDefault());
            new ExcelExtractor(context, metadata).parse(root, xhtml, locale);
            break;
        case PROJECT:
            // We currently can't do anything beyond the metadata
            break;
        case VISIO:
            VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root);
            for (String text : visioTextExtractor.getAllText()) {
                xhtml.element("p", text);
            }
            break;
        case OUTLOOK:
            OutlookExtractor extractor = new OutlookExtractor(root, context);
            extractor.parse(xhtml, metadata);
            break;
        case ENCRYPTED:
            EncryptionInfo info = new EncryptionInfo(root);
            Decryptor d = Decryptor.getInstance(info);
            try {
                // By default, use the default Office Password
                String password = Decryptor.DEFAULT_PASSWORD;
                // If they supplied a Password Provider, ask that for the password,
                //  and use the provider given one if available (stick with default if not)
                PasswordProvider passwordProvider = context.get(PasswordProvider.class);
                if (passwordProvider != null) {
                    String suppliedPassword = passwordProvider.getPassword(metadata);
                    if (suppliedPassword != null) {
                        password = suppliedPassword;
                    }
                }
                // Check if we've the right password or not
                if (!d.verifyPassword(password)) {
                    throw new EncryptedDocumentException();
                }
                // Decrypt the OLE2 stream, and delegate the resulting OOXML
                //  file to the regular OOXML parser for normal handling
                OOXMLParser parser = new OOXMLParser();
                // Close the decrypted stream once parsing is done (or fails),
                //  so that it is not leaked
                try (java.io.InputStream decrypted = d.getDataStream(root)) {
                    parser.parse(decrypted, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), metadata, context);
                }
            } catch (GeneralSecurityException ex) {
                throw new EncryptedDocumentException(ex);
            }
            // Explicit break: do not rely on falling through into default
            break;
        default:
            // For any other type, only the summary metadata
            //  is extracted, which happened above
            break;
    }
}
Also used : Locale(java.util.Locale) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Decryptor(org.apache.poi.poifs.crypt.Decryptor) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) EncryptionInfo(org.apache.poi.poifs.crypt.EncryptionInfo) GeneralSecurityException(java.security.GeneralSecurityException) PublisherTextExtractor(org.apache.poi.hpbf.extractor.PublisherTextExtractor) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) PasswordProvider(org.apache.tika.parser.PasswordProvider) OOXMLParser(org.apache.tika.parser.microsoft.ooxml.OOXMLParser) VisioTextExtractor(org.apache.poi.hdgf.extractor.VisioTextExtractor)

Aggregations

OOXMLParser (org.apache.tika.parser.microsoft.ooxml.OOXMLParser)4 ParseContext (org.apache.tika.parser.ParseContext)2 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)2 Test (org.junit.Test)2 InputStream (java.io.InputStream)1 GeneralSecurityException (java.security.GeneralSecurityException)1 Locale (java.util.Locale)1 Response (javax.ws.rs.core.Response)1 VisioTextExtractor (org.apache.poi.hdgf.extractor.VisioTextExtractor)1 PublisherTextExtractor (org.apache.poi.hpbf.extractor.PublisherTextExtractor)1 Decryptor (org.apache.poi.poifs.crypt.Decryptor)1 EncryptionInfo (org.apache.poi.poifs.crypt.EncryptionInfo)1 TikaTest (org.apache.tika.TikaTest)1 DefaultDetector (org.apache.tika.detect.DefaultDetector)1 Detector (org.apache.tika.detect.Detector)1 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)1 Metadata (org.apache.tika.metadata.Metadata)1 MediaType (org.apache.tika.mime.MediaType)1 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)1 PasswordProvider (org.apache.tika.parser.PasswordProvider)1