Search in sources:

Example 11 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class TikaCLI method configure.

/**
 * Loads a Tika configuration from the given file path and re-derives the parser,
 * detector, and parse-context entry from it.
 *
 * <p>The parser is an {@link AutoDetectParser} built on the loaded configuration; when a
 * {@code digester} is set, that parser is wrapped in a {@link DigestingParser}.
 *
 * @param configFilePath path of the configuration file to load
 * @throws Exception propagated from building the configuration or the parsers
 */
private void configure(String configFilePath) throws Exception {
    // Recorded before loading, so the field holds the requested path even if loading throws.
    this.configFilePath = configFilePath;

    final File configFile = new File(configFilePath);
    config = new TikaConfig(configFile);

    final Parser autoDetecting = new AutoDetectParser(config);
    parser = (digester == null)
            ? autoDetecting
            : new DigestingParser(autoDetecting, digester);

    detector = config.getDetector();
    // Publish the (possibly digest-wrapped) parser through the shared context.
    context.set(Parser.class, parser);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) File(java.io.File)

Example 12 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class SXSLFExtractorTest method testMacrosInPptm.

/**
 * Exercises macro handling for {@code testPPT_macros.pptm} in three passes:
 * <ol>
 *   <li>with the class-level {@code parseContext}: no {@code text/x-vbasic} entry may appear;</li>
 *   <li>with an {@link OfficeParserConfig} that enables macro extraction and the SAX pptx
 *       extractor: the macro text and macro metadata must be present;</li>
 *   <li>with a parser built from {@code tika-config-sax-macros.xml}: same expectations as
 *       the second pass.</li>
 * </ol>
 * Every pass also asserts the {@code X-Parsed-By} value of the event-based PowerPoint extractor.
 */
@Test
public void testMacrosInPptm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");
    List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm", parseContext);
    // test default is "don't extract macros"
    for (Metadata metadata : metadataList) {
        // Constant-first equals: an entry without a content type must not surface as an NPE.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    // now test that they are extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", context);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);

    // test configuring via config file
    TikaConfig tikaConfig;
    // Close the classpath stream explicitly rather than relying on the XML parser to do it.
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 13 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class OOXMLParserTest method testMacrosInPptm.

/**
 * Exercises macro handling for {@code testPPT_macros.pptm} in three passes:
 * <ol>
 *   <li>with no extra configuration: no {@code text/x-vbasic} entry may appear;</li>
 *   <li>with an {@link OfficeParserConfig} that enables macro extraction: the macro text and
 *       macro metadata must be present;</li>
 *   <li>with a parser built from {@code tika-config-dom-macros.xml}: same expectations as
 *       the second pass.</li>
 * </ol>
 */
@Test
public void testMacrosInPptm() throws Exception {
    // test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testPPT_macros.pptm")) {
        // Constant-first equals: an entry without a content type must not surface as an NPE.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    // now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", context));

    // test configuring via config file
    TikaConfig tikaConfig;
    // Close the classpath stream explicitly rather than relying on the XML parser to do it.
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testPPT_macros.pptm", parser));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Example 14 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class SXWPFExtractorTest method testMacrosInDocm.

/**
 * Exercises macro handling for {@code testWORD_macros.docm} in three passes:
 * <ol>
 *   <li>with the class-level {@code parseContext}: no {@code text/x-vbasic} entry may appear;</li>
 *   <li>with an {@link OfficeParserConfig} that enables macro extraction and the SAX docx
 *       extractor: the document text, the macro text, and the macro metadata must be present;</li>
 *   <li>with a parser built from {@code tika-config-sax-macros.xml}: the macro expectations
 *       of the second pass must hold again.</li>
 * </ol>
 * Every pass also asserts the {@code X-Parsed-By} value of the event-based Word extractor.
 */
@Test
public void testMacrosInDocm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor");
    // test default is "don't extract macros"
    List<Metadata> metadataList = getRecursiveMetadata("testWORD_macros.docm", parseContext);
    for (Metadata metadata : metadataList) {
        // Constant-first equals: an entry without a content type must not surface as an NPE.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);

    // now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXDocxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", context);
    // check that content came out of the .docm file
    assertContains("quick", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    assertContainsAtLeast(parsedBy, metadataList);
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);

    // test configuring via config file
    TikaConfig tikaConfig;
    // Close the classpath stream explicitly rather than relying on the XML parser to do it.
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    metadataList = getRecursiveMetadata("testWORD_macros.docm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 15 with TikaConfig

use of org.apache.tika.config.TikaConfig in project tika by apache.

the class OOXMLParserTest method testMacrosInDocm.

/**
 * Exercises macro handling for {@code testWORD_macros.docm} in three passes:
 * <ol>
 *   <li>with no extra configuration: no {@code text/x-vbasic} entry may appear;</li>
 *   <li>with an {@link OfficeParserConfig} that enables macro extraction: the macro text and
 *       macro metadata must be present;</li>
 *   <li>with a parser built from {@code tika-config-dom-macros.xml}: same expectations as
 *       the second pass.</li>
 * </ol>
 */
@Test
public void testMacrosInDocm() throws Exception {
    // test default is "don't extract macros"
    for (Metadata metadata : getRecursiveMetadata("testWORD_macros.docm")) {
        // Constant-first equals: an entry without a content type must not surface as an NPE.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }

    // now test that they were extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", context));

    // test configuring via config file
    TikaConfig tikaConfig;
    // Close the classpath stream explicitly rather than relying on the XML parser to do it.
    try (java.io.InputStream configStream =
                 this.getClass().getResourceAsStream("tika-config-dom-macros.xml")) {
        tikaConfig = new TikaConfig(configStream);
    }
    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    assertContainsAtLeast(minExpected, getRecursiveMetadata("testWORD_macros.docm", parser));
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ExcelParserTest(org.apache.tika.parser.microsoft.ExcelParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest) WordParserTest(org.apache.tika.parser.microsoft.WordParserTest)

Aggregations

TikaConfig (org.apache.tika.config.TikaConfig)62 Test (org.junit.Test)32 Metadata (org.apache.tika.metadata.Metadata)26 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)20 TikaTest (org.apache.tika.TikaTest)16 InputStream (java.io.InputStream)12 Tika (org.apache.tika.Tika)12 IOException (java.io.IOException)10 URL (java.net.URL)10 TikaException (org.apache.tika.exception.TikaException)9 TikaInputStream (org.apache.tika.io.TikaInputStream)9 ParseContext (org.apache.tika.parser.ParseContext)9 Parser (org.apache.tika.parser.Parser)9 MediaType (org.apache.tika.mime.MediaType)8 CompositeParser (org.apache.tika.parser.CompositeParser)8 ByteArrayInputStream (java.io.ByteArrayInputStream)7 File (java.io.File)6 TikaConfigTest (org.apache.tika.config.TikaConfigTest)6 HashSet (java.util.HashSet)5 SAXException (org.xml.sax.SAXException)5