Search in sources :

Example 16 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class TikaCLI method configure.

/**
 * Loads the Tika configuration from the given file and rebuilds the
 * parser and detector from it.
 *
 * The parser is an {@link AutoDetectParser} built from the loaded config;
 * when a digester has been set, it is wrapped in a {@link DigestingParser}.
 * The resulting parser is also registered in the parse context under
 * {@code Parser.class}.
 *
 * @param configFilePath path of the Tika configuration file to load
 * @throws Exception if the configuration cannot be loaded or the parser built
 */
private void configure(String configFilePath) throws Exception {
    this.configFilePath = configFilePath;
    config = new TikaConfig(new File(configFilePath));
    final AutoDetectParser autoDetecting = new AutoDetectParser(config);
    // Only wrap with the digesting decorator when a digester was requested.
    parser = (digester == null) ? autoDetecting : new DigestingParser(autoDetecting, digester);
    detector = config.getDetector();
    context.set(Parser.class, parser);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) File(java.io.File)

Example 17 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class DigestingAutoDetectParserFactory method getParser.

/**
 * Builds an {@link AutoDetectParser} from the given config, wrapped in a
 * {@link DigestingParser} when this factory has a digester.
 *
 * @param config configuration used to construct the auto-detecting parser
 * @return the plain auto-detecting parser if no digester is set, otherwise
 *         the digesting wrapper around it
 */
@Override
public Parser getParser(TikaConfig config) {
    final Parser autoDetecting = new AutoDetectParser(config);
    if (digester != null) {
        return new DigestingParser(autoDetecting, digester);
    }
    return autoDetecting;
}
Also used : AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) Parser(org.apache.tika.parser.Parser)

Example 18 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class SXSLFExtractorTest method testMacrosInPptm.

/**
 * Checks macro handling for a macro-enabled PowerPoint file in three setups:
 * the shared default parse context (macros must not be extracted), an
 * explicit {@link OfficeParserConfig} with macro extraction and the SAX pptx
 * extractor enabled, and a parser built from a Tika config file.
 *
 * In every setup the "X-Parsed-By" metadata is expected to include the
 * event-based PowerPoint extractor.
 */
@Test
public void testMacrosInPptm() throws Exception {
    Metadata parsedBy = new Metadata();
    parsedBy.add("X-Parsed-By", "org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor");
    List<Metadata> metadataList = getRecursiveMetadata("testPPT_macros.pptm", parseContext);
    //test default is "don't extract macros"
    for (Metadata metadata : metadataList) {
        // Constant-first equals so an entry without a content type cannot
        // abort the test with a NullPointerException.
        if ("text/x-vbasic".equals(metadata.get(Metadata.CONTENT_TYPE))) {
            fail("Shouldn't have extracted macros as default");
        }
    }
    assertContainsAtLeast(parsedBy, metadataList);
    //now test that they are extracted
    ParseContext context = new ParseContext();
    OfficeParserConfig officeParserConfig = new OfficeParserConfig();
    officeParserConfig.setExtractMacros(true);
    officeParserConfig.setUseSAXPptxExtractor(true);
    context.set(OfficeParserConfig.class, officeParserConfig);
    Metadata minExpected = new Metadata();
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Embolden()");
    minExpected.add(RecursiveParserWrapper.TIKA_CONTENT.getName(), "Sub Italicize()");
    minExpected.add(Metadata.CONTENT_TYPE, "text/x-vbasic");
    minExpected.add(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", context);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
    //test configuring via config file
    AutoDetectParser parser;
    // Close the config resource stream once the parser has been built from it;
    // the type is fully qualified so this block needs no new import.
    try (java.io.InputStream configStream = this.getClass().getResourceAsStream("tika-config-sax-macros.xml")) {
        TikaConfig tikaConfig = new TikaConfig(configStream);
        parser = new AutoDetectParser(tikaConfig);
    }
    metadataList = getRecursiveMetadata("testPPT_macros.pptm", parser);
    assertContainsAtLeast(minExpected, metadataList);
    assertContainsAtLeast(parsedBy, metadataList);
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) OfficeParserConfig(org.apache.tika.parser.microsoft.OfficeParserConfig) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 19 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class SXSLFExtractorTest method testPowerPoint.

/**
 * We have a number of different powerpoint files,
 * such as presentation, macro-enabled etc.
 *
 * For each extension the matching "testPPT.&lt;extension&gt;" test document is
 * parsed and checked for its expected mime type, title, creator/author
 * metadata and body text. The two arrays are parallel: {@code mimeTypes[i]}
 * is the expected content type for {@code extensions[i]}.
 */
@Test
public void testPowerPoint() throws Exception {
    String[] extensions = new String[] { "pptx", "pptm", "ppsm", "ppsx", "potm" };
    String[] mimeTypes = new String[] { "application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.ms-powerpoint.presentation.macroenabled.12", "application/vnd.ms-powerpoint.slideshow.macroenabled.12", "application/vnd.openxmlformats-officedocument.presentationml.slideshow", "application/vnd.ms-powerpoint.template.macroenabled.12" };
    for (int i = 0; i < extensions.length; i++) {
        String extension = extensions[i];
        String filename = "testPPT." + extension;
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        try (InputStream input = getResourceAsStream("/test-documents/" + filename)) {
            parser.parse(input, handler, metadata, parseContext);
            assertEquals("Mime-type checking for " + filename, mimeTypes[i], metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("Attachment Test", metadata.get(TikaCoreProperties.TITLE));
            assertEquals("Rajiv", metadata.get(TikaCoreProperties.CREATOR));
            assertEquals("Rajiv", metadata.get(Metadata.AUTHOR));
            String content = handler.toString();
            // None of the extensions listed above is a theme ("thmx") file, so
            // every document is expected to contain the body text; the former
            // empty-content branch for themes could never be reached here.
            String missingText = "Text missing for " + filename + "\n" + content;
            assertTrue(missingText, content.contains("Attachment Test"));
            assertTrue(missingText, content.contains("This is a test file data with the same content"));
            assertTrue(missingText, content.contains("content parsing"));
            assertTrue(missingText, content.contains("Different words to test against"));
            assertTrue(missingText, content.contains("Mystery"));
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 20 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class SXSLFExtractorTest method testUnsupportedPowerPoint.

/**
 * Parses the PowerPoint-related formats that are not currently supported
 * and makes sure nothing breaks: each "testPPT.&lt;extension&gt;" document must
 * parse without error and report the expected content type. No further
 * content or metadata is checked.
 */
@Test
public void testUnsupportedPowerPoint() throws Exception {
    final String[] suffixes = { "xps", "thmx" };
    final String[] expectedTypes = { "application/vnd.ms-xpsdocument", // Is this right?
    "application/vnd.openxmlformats-officedocument" };
    for (int idx = 0; idx < suffixes.length; idx++) {
        final String docName = "testPPT." + suffixes[idx];
        final Parser autoDetect = new AutoDetectParser();
        final Metadata meta = new Metadata();
        // Give the parser the file name alongside the stream content.
        meta.set(Metadata.RESOURCE_NAME_KEY, docName);
        final ContentHandler bodyHandler = new BodyContentHandler();
        try (InputStream stream = getResourceAsStream("/test-documents/" + docName)) {
            autoDetect.parse(stream, bodyHandler, meta, parseContext);
            // The content type is the only thing expected from these formats.
            final String reportedType = meta.get(Metadata.CONTENT_TYPE);
            assertEquals("Mime-type checking for " + docName, expectedTypes[idx], reportedType);
        }
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 167, Metadata (org.apache.tika.metadata.Metadata): 139, Test (org.junit.Test): 122, InputStream (java.io.InputStream): 117, Parser (org.apache.tika.parser.Parser): 112, ParseContext (org.apache.tika.parser.ParseContext): 104, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 97, ContentHandler (org.xml.sax.ContentHandler): 91, TikaTest (org.apache.tika.TikaTest): 82, TikaInputStream (org.apache.tika.io.TikaInputStream): 63, ByteArrayInputStream (java.io.ByteArrayInputStream): 34, CompositeParser (org.apache.tika.parser.CompositeParser): 28, TikaConfig (org.apache.tika.config.TikaConfig): 18, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17, TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15, TikaException (org.apache.tika.exception.TikaException): 13, EmptyParser (org.apache.tika.parser.EmptyParser): 13, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12