Search in sources:

Example 1 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class SXSLFExtractorTest method testMacrosInPptm.

/**
 * Exercises macro handling for {@code testPPT_macros.pptm} in three passes: with the
 * shared {@code parseContext}, with an explicit {@link OfficeParserConfig} that turns
 * macro extraction on, and with a parser built from {@code tika-config-sax-macros.xml}.
 * Every pass must also report the event-based PowerPoint extractor in {@code X-Parsed-By}.
 *
 * @throws Exception if parsing or config loading fails
 */
@Test
public void testMacrosInPptm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");
    // NOTE(review): parseContext is defined outside this method; presumably it selects the
    // SAX pptx extractor and leaves macro extraction at its default -- confirm in setup.
    List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm", parseContext);
    //test default is "don't extract macros"
    for (Metadata metadata : metadataList) {
        // Constant-first equals: an entry lacking a Content-Type is not a macro and must
        // not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);
    //now test that they are extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    // Minimum metadata that some entry in the result list is expected to contain.
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", context);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 2 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class SXSLFExtractorTest method testPowerPointCustomProperties.

/**
 * Parses {@code testPPT_custom_props.pptx} with the SAX pptx extractor enabled and
 * {@link Locale#US} set in the parse context, then checks the content type, the core
 * and office properties, and the {@code custom:} properties written into the metadata.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testPowerPointCustomProperties() throws Exception {
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    context.set(Locale.class, Locale.US);
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    // Parse with the context configured above. The original passed the externally
    // defined parseContext here, which left the locale and extractor settings built
    // in this method unused.
    getXML("testPPT_custom_props.pptx", metadata, context);
    assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation", metadata.get(Metadata.CONTENT_TYPE));
    // Authorship
    assertEquals("JOUVIN ETIENNE", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("EJ04325S", metadata.get(TikaCoreProperties.MODIFIER));
    assertEquals("EJ04325S", metadata.get(Metadata.LAST_AUTHOR));
    // Creation / modification timestamps, checked under both property names
    assertEquals("2011-08-22T13:30:53Z", metadata.get(TikaCoreProperties.CREATED));
    assertEquals("2011-08-22T13:30:53Z", metadata.get(Metadata.CREATION_DATE));
    assertEquals("2011-08-22T13:32:49Z", metadata.get(TikaCoreProperties.MODIFIED));
    assertEquals("2011-08-22T13:32:49Z", metadata.get(Metadata.DATE));
    // Document statistics and title
    assertEquals("1", metadata.get(Office.SLIDE_COUNT));
    assertEquals("3", metadata.get(Office.WORD_COUNT));
    assertEquals("Test extraction properties pptx", metadata.get(TikaCoreProperties.TITLE));
    // User-defined properties are exposed under the "custom:" prefix
    assertEquals("true", metadata.get("custom:myCustomBoolean"));
    assertEquals("3", metadata.get("custom:myCustomNumber"));
    assertEquals("MyStringValue", metadata.get("custom:MyCustomString"));
    assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
    assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
Also used : Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 3 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class SXWPFExtractorTest method testEncrypted.

/**
 * Checks that a password-protected .docx yields its text when a
 * {@link PasswordProvider} supplying the password is present in the parse context, and
 * that parsing the same file with {@code parseContext} raises an
 * {@link EncryptedDocumentException}.
 *
 * @throws Exception if parsing fails for a reason other than the expected encryption error
 */
@Test
public void testEncrypted() throws Exception {
    // file name -> text that must appear in the extracted XML
    Map<String, String> tests = new HashMap<String, String>();
    tests.put("testWORD_protected_passtika.docx", "This is an encrypted Word 2007 File");
    PasswordProvider passwordProvider = new PasswordProvider() {

        @Override
        public String getPassword(Metadata metadata) {
            return "tika";
        }
    };
    OfficeParserConfig opc = new OfficeParserConfig();
    opc.setUseSAXDocxExtractor(true);
    ParseContext passwordContext = new ParseContext();
    passwordContext.set(org.apache.tika.parser.PasswordProvider.class, passwordProvider);
    passwordContext.set(OfficeParserConfig.class, opc);
    for (Map.Entry<String, String> e : tests.entrySet()) {
        assertContains(e.getValue(), getXML(e.getKey(), passwordContext).xml);
    }
    //now try with no password
    // NOTE(review): parseContext is defined outside this method; presumably it carries
    // no PasswordProvider -- confirm in setup.
    for (Map.Entry<String, String> e : tests.entrySet()) {
        boolean exc = false;
        try {
            getXML(e.getKey(), parseContext);
        } catch (EncryptedDocumentException ex) {
            exc = true;
        }
        assertTrue("Expected EncryptedDocumentException for " + e.getKey(), exc);
    }
}
Also used : EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) PasswordProvider(org.apache.tika.parser.PasswordProvider) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) ParseContext(org.apache.tika.parser.ParseContext) Map(java.util.Map) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 4 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class OOXMLParserTest method testMacrosInPptm.

/**
 * Exercises macro handling for {@code testPPT_macros.pptm} in three passes: with no
 * explicit configuration, with an {@link OfficeParserConfig} that turns macro
 * extraction on, and with a parser built from {@code tika-config-dom-macros.xml}.
 *
 * @throws Exception if parsing or config loading fails
 */
@Test
public void testMacrosInPptm() throws Exception {
    //test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
        // Constant-first equals: an entry lacking a Content-Type is not a macro and must
        // not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    // Minimum metadata that some entry in the result list is expected to contain.
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", context));
    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-dom-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parser));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 5 with OfficeParserConfig

use of org.apache.tika.parser.microsoft.OfficeParserConfig in project tika by apache.

the class SXWPFExtractorTest method testMacrosInDocm.

/**
 * Exercises macro handling for {@code testWORD_macros.docm} in three passes: with the
 * shared {@code parseContext}, with an explicit {@link OfficeParserConfig} that turns
 * macro extraction on, and with a parser built from {@code tika-config-sax-macros.xml}.
 * Every pass must also report the event-based Word extractor in {@code X-Parsed-By}.
 *
 * @throws Exception if parsing or config loading fails
 */
@Test
public void testMacrosInDocm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");
    //test default is "don't extract macros"
    // NOTE(review): parseContext is defined outside this method; presumably it selects the
    // SAX docx extractor and leaves macro extraction at its default -- confirm in setup.
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
    for (Metadata metadata : metadataList) {
        // Constant-first equals: an entry lacking a Content-Type is not a macro and must
        // not abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);
    //now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXDocxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", context);
    //check that content came out of the .docm file
    assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    // Minimum metadata that some entry in the result list is expected to contain.
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, metadataList);
    // Checked once per pass; the original repeated this call on the same list.
    assertContainsAtLeast(parsedBy, metadataList);
    //test configuring via config file
    TikaConfig tikaConfig = new TikaConfig(this.getClass().getResourceAsStream("tika-config-sax-macros.xml"));
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

OfficeParserConfig (org.apache.tika.parser.microsoft.OfficeParserConfig)16 ParseContext (org.apache.tika.parser.ParseContext)15 TikaTest (org.apache.tika.TikaTest)13 Test (org.junit.Test)13 Metadata (org.apache.tika.metadata.Metadata)9 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)6 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)6 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)6 TikaConfig (org.apache.tika.config.TikaConfig)5 InputStream (java.io.InputStream)2 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)2 TikaInputStream (org.apache.tika.io.TikaInputStream)2 File (java.io.File)1 Date (java.util.Date)1 HashMap (java.util.HashMap)1 Locale (java.util.Locale)1 Map (java.util.Map)1 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)1 POIXMLDocument (org.apache.poi.POIXMLDocument)1 POIXMLTextExtractor (org.apache.poi.POIXMLTextExtractor)1