Search in sources:

Example 1 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaConfigTest method testUnknownParser.

/**
 * Checks that the {@code LoadErrorHandler} handed to the service loader
 * decides how a config naming an unloadable parser class is treated:
 * with IGNORE and WARN the config still loads and exposes exactly one
 * component parser, while with THROW construction must fail with a
 * {@code TikaException}.
 */
@Test
public void testUnknownParser() throws Exception {
    ClassLoader classLoader = getClass().getClassLoader();
    ServiceLoader lenientLoader = new ServiceLoader(classLoader, LoadErrorHandler.IGNORE);
    ServiceLoader warningLoader = new ServiceLoader(classLoader, LoadErrorHandler.WARN);
    ServiceLoader strictLoader = new ServiceLoader(classLoader, LoadErrorHandler.THROW);

    URI configUri = new URI(getConfigPath("TIKA-1700-unknown-parser.xml"));
    Path configPath = Paths.get(configUri);

    // IGNORE handler: config is usable and holds a single component parser
    TikaConfig ignoringConfig = new TikaConfig(configPath, lenientLoader);
    assertNotNull(ignoringConfig);
    assertNotNull(ignoringConfig.getParser());
    CompositeParser ignoringParser = (CompositeParser) ignoringConfig.getParser();
    assertEquals(1, ignoringParser.getAllComponentParsers().size());

    // WARN handler: same observable outcome as IGNORE
    TikaConfig warningConfig = new TikaConfig(configPath, warningLoader);
    assertNotNull(warningConfig);
    assertNotNull(warningConfig.getParser());
    CompositeParser warningParser = (CompositeParser) warningConfig.getParser();
    assertEquals(1, warningParser.getAllComponentParsers().size());

    // THROW handler: building the config has to be rejected
    try {
        new TikaConfig(configPath, strictLoader);
        fail("Shouldn't get here, invalid parser class");
    } catch (TikaException expected) {
        // Intentionally empty: the exception is the outcome under test.
    }
}
Also used : Path(java.nio.file.Path) TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) URI(java.net.URI) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 2 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaConfigTest method defaultParserWithExcludes.

/**
 * TIKA-1445 It should be possible to exclude DefaultParser from
 * certain types, so another parser explicitly listed will take them.
 *
 * <p>Any {@code TikaException} raised while loading the config is left
 * to propagate, so the test runner reports it with its full stack trace
 * instead of a flattened failure message.
 *
 * @throws Exception if the config cannot be loaded
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    TikaConfig config = getConfig("TIKA-1445-default-except.xml");
    CompositeParser cp = (CompositeParser) config.getParser();
    List<Parser> parsers = cp.getAllComponentParsers();

    // Will be the three parsers defined in the xml
    assertEquals(3, parsers.size());

    // Should have a wrapped DefaultParser, not the main DefaultParser,
    //  as it is excluded from handling certain classes
    Parser p = parsers.get(0);
    assertTrue(p.toString(), p instanceof ParserDecorator);
    assertEquals(DefaultParser.class, ((ParserDecorator) p).getWrappedParser().getClass());

    // Should have two others which claim things, which they wouldn't
    //  otherwise handle
    p = parsers.get(1);
    assertTrue(p.toString(), p instanceof ParserDecorator);
    assertEquals(EmptyParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
    assertEquals("hello/world", p.getSupportedTypes(null).iterator().next().toString());

    p = parsers.get(2);
    assertTrue(p.toString(), p instanceof ParserDecorator);
    assertEquals(ErrorParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
    assertEquals("fail/world", p.getSupportedTypes(null).iterator().next().toString());
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 3 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class ExternalParsersFactory method attachExternalParsers.

/**
 * Unfinished stub: looks up the parser configured on {@code config} and,
 * when it is a {@link CompositeParser}, reads its media-type-to-parser map.
 * Nothing is done with that map yet and {@code parsers} is not used.
 *
 * @param parsers the external parsers to attach (currently ignored)
 * @param config  the configuration whose parser is inspected
 */
public static void attachExternalParsers(List<ExternalParser> parsers, TikaConfig config) {
    Parser configured = config.getParser();
    if (configured instanceof CompositeParser) {
        // Fetched but not yet consumed; see the TODO below.
        Map<MediaType, Parser> parsersByType = ((CompositeParser) configured).getParsers();
    }
    // TODO
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser)

Example 4 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaParserConfigTest method testMimeExcludeInclude.

/**
 * Loads the TIKA-1558 blacklist config and checks that it produces a
 * CompositeParser with two decorated parsers: a DefaultParser whose
 * decorator hides PDF and JPEG, and an EmptyParser whose decorator
 * claims only PDF.
 */
@Test
public void testMimeExcludeInclude() throws Exception {
    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    Parser topLevel = config.getParser();
    MediaType pdf = MediaType.application("pdf");
    MediaType jpeg = MediaType.image("jpeg");

    // The top-level parser is a plain composite made of two parsers
    assertEquals(CompositeParser.class, topLevel.getClass());
    CompositeParser composite = (CompositeParser) topLevel;
    assertEquals(2, composite.getAllComponentParsers().size());

    // Each of them must be a decorator
    Parser first = composite.getAllComponentParsers().get(0);
    Parser second = composite.getAllComponentParsers().get(1);
    assertTrue(first instanceof ParserDecorator);
    assertTrue(second instanceof ParserDecorator);
    ParserDecorator decoratedDefault = (ParserDecorator) first;
    ParserDecorator decoratedEmpty = (ParserDecorator) second;

    // First: DefaultParser, with PDF and JPEG excluded by the decorator
    // while the wrapped parser itself still supports both
    Parser wrappedDefault = decoratedDefault.getWrappedParser();
    assertEquals(DefaultParser.class, wrappedDefault.getClass());
    assertNotContained(pdf, decoratedDefault.getSupportedTypes(context));
    assertContains(pdf, decoratedDefault.getWrappedParser().getSupportedTypes(context));
    assertNotContained(jpeg, decoratedDefault.getSupportedTypes(context));
    assertContains(jpeg, decoratedDefault.getWrappedParser().getSupportedTypes(context));

    // Second: EmptyParser, claiming exactly PDF through the decorator only
    Parser wrappedEmpty = decoratedEmpty.getWrappedParser();
    assertEquals(EmptyParser.class, wrappedEmpty.getClass());
    assertEquals(1, decoratedEmpty.getSupportedTypes(context).size());
    assertContains(pdf, decoratedEmpty.getSupportedTypes(context));
    assertNotContained(pdf, decoratedEmpty.getWrappedParser().getSupportedTypes(context));
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Example 5 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaParserConfigTest method defaultParserBlacklist.

/**
 * TIKA-1558: parsers can be excluded from the set picked up by
 * DefaultParser. The default config must contain an XMLParser; the
 * custom blacklist config must contain neither XMLParser nor any
 * subclass of it.
 */
@Test
public void defaultParserBlacklist() throws Exception {
    // Baseline: the out-of-the-box configuration
    TikaConfig defaultConfig = new TikaConfig();
    assertNotNull(defaultConfig.getParser());
    assertNotNull(defaultConfig.getDetector());
    CompositeParser defaultComposite = (CompositeParser) defaultConfig.getParser();
    List<Parser> defaultParsers = defaultComposite.getAllComponentParsers();

    boolean xmlParserFound = false;
    for (Parser candidate : defaultParsers) {
        if (candidate instanceof XMLParser) {
            xmlParserFound = true;
            break;
        }
    }
    assertTrue("Default config should include an XMLParser.", xmlParserFound);

    // This custom TikaConfig should exclude XMLParser and all of its subclasses.
    TikaConfig blacklistConfig = getConfig("TIKA-1558-blacklistsub.xml");
    CompositeParser blacklistComposite = (CompositeParser) blacklistConfig.getParser();
    List<Parser> remainingParsers = blacklistComposite.getAllComponentParsers();
    for (Parser candidate : remainingParsers) {
        if (candidate instanceof XMLParser) {
            fail("Custom config should not include an XMLParser (" + candidate.getClass() + ").");
        }
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Aggregations

CompositeParser (org.apache.tika.parser.CompositeParser)17 Parser (org.apache.tika.parser.Parser)16 Test (org.junit.Test)10 MediaType (org.apache.tika.mime.MediaType)9 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)9 DefaultParser (org.apache.tika.parser.DefaultParser)7 ParserDecorator (org.apache.tika.parser.ParserDecorator)7 TikaConfig (org.apache.tika.config.TikaConfig)6 EmptyParser (org.apache.tika.parser.EmptyParser)5 XMLParser (org.apache.tika.parser.xml.XMLParser)4 InputStream (java.io.InputStream)3 TikaConfigTest (org.apache.tika.config.TikaConfigTest)3 TikaException (org.apache.tika.exception.TikaException)3 ForkParser (org.apache.tika.fork.ForkParser)3 TikaInputStream (org.apache.tika.io.TikaInputStream)3 ExecutableParser (org.apache.tika.parser.executable.ExecutableParser)3 TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser)3 TikaTest (org.apache.tika.TikaTest)2 Metadata (org.apache.tika.metadata.Metadata)2 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)2