Search in sources :

Example 1 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class TikaConfigTest, method defaultParserWithExcludes.

/**
 * TIKA-1445: it should be possible to exclude DefaultParser from certain
 * types, so that another, explicitly listed parser takes them instead.
 * <p>
 * Loads "TIKA-1445-default-except.xml" and checks each of the component
 * parsers of the resulting composite parser in turn.
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    try {
        TikaConfig tikaConfig = getConfig("TIKA-1445-default-except.xml");
        CompositeParser composite = (CompositeParser) tikaConfig.getParser();
        List<Parser> components = composite.getAllComponentParsers();

        // Exactly the three parsers defined in the xml are expected
        assertEquals(3, components.size());

        // First entry: DefaultParser behind a decorator rather than the
        // plain DefaultParser, since it is excluded from certain types
        Parser first = components.get(0);
        assertTrue(first.toString(), first instanceof ParserDecorator);
        ParserDecorator defaultWrapper = (ParserDecorator) first;
        assertEquals(DefaultParser.class, defaultWrapper.getWrappedParser().getClass());

        // Second entry: a decorated EmptyParser claiming a type it
        // would not otherwise handle
        Parser second = components.get(1);
        assertTrue(second.toString(), second instanceof ParserDecorator);
        ParserDecorator emptyWrapper = (ParserDecorator) second;
        assertEquals(EmptyParser.class, emptyWrapper.getWrappedParser().getClass());
        assertEquals("hello/world", second.getSupportedTypes(null).iterator().next().toString());

        // Third entry: a decorated ErrorParser, likewise claiming a type
        // it would not otherwise handle
        Parser third = components.get(2);
        assertTrue(third.toString(), third instanceof ParserDecorator);
        ParserDecorator errorWrapper = (ParserDecorator) third;
        assertEquals(ErrorParser.class, errorWrapper.getWrappedParser().getClass());
        assertEquals("fail/world", third.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 2 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class TikaParserConfigTest, method testMimeExcludeInclude.

/**
 * Loads "TIKA-1558-blacklist.xml" and checks the resulting parser layout:
 * a decorated DefaultParser that no longer claims PDF or JPEG, plus a
 * decorated EmptyParser that claims PDF only.
 */
@Test
public void testMimeExcludeInclude() throws Exception {
    TikaConfig tikaConfig = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(tikaConfig.getParser());
    assertNotNull(tikaConfig.getDetector());

    Parser topLevel = tikaConfig.getParser();
    MediaType pdfType = MediaType.application("pdf");
    MediaType jpegType = MediaType.image("jpeg");

    // The top-level parser is a plain composite holding two parsers
    assertEquals(CompositeParser.class, topLevel.getClass());
    CompositeParser composite = (CompositeParser) topLevel;
    assertEquals(2, composite.getAllComponentParsers().size());

    // Each of the two is a decorator
    Parser first = composite.getAllComponentParsers().get(0);
    assertTrue(first instanceof ParserDecorator);
    Parser second = composite.getAllComponentParsers().get(1);
    assertTrue(second instanceof ParserDecorator);
    ParserDecorator defaultWrapper = (ParserDecorator) first;
    ParserDecorator pdfWrapper = (ParserDecorator) second;

    // The decorator around DefaultParser hides PDF and JPEG, although the
    // wrapped DefaultParser itself still reports both
    assertEquals(DefaultParser.class, defaultWrapper.getWrappedParser().getClass());
    assertNotContained(pdfType, defaultWrapper.getSupportedTypes(context));
    assertContains(pdfType, defaultWrapper.getWrappedParser().getSupportedTypes(context));
    assertNotContained(jpegType, defaultWrapper.getSupportedTypes(context));
    assertContains(jpegType, defaultWrapper.getWrappedParser().getSupportedTypes(context));

    // The decorator around EmptyParser claims PDF and nothing else, while
    // the wrapped EmptyParser does not report PDF
    assertEquals(EmptyParser.class, pdfWrapper.getWrappedParser().getClass());
    assertEquals(1, pdfWrapper.getSupportedTypes(context).size());
    assertContains(pdfType, pdfWrapper.getSupportedTypes(context));
    assertNotContained(pdfType, pdfWrapper.getWrappedParser().getSupportedTypes(context));
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Example 3 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class TikaCLI, method displayParser.

/**
 * Prints a parser to standard output and, when it is a composite parser,
 * recursively prints its sub-parsers as well.
 * <p>
 * A {@code ParserDecorator} is unwrapped first; the decoration name is
 * appended to the printed class name in non-apt output only.
 *
 * @param p                the parser to display
 * @param includeMimeTypes whether to list the supported media types of
 *                         non-composite parsers
 * @param apt              whether to print apt-style output (bulleted
 *                         entries with link markup, each followed by a
 *                         blank line) instead of plain text
 * @param i                indentation amount handed to {@code indent(int)}
 */
private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) {
    final String bullet = apt ? "* " : "";

    // Look through a decorator, remembering how to describe it
    Parser target = p;
    String wrapperNote = null;
    if (target instanceof ParserDecorator) {
        ParserDecorator decorator = (ParserDecorator) target;
        wrapperNote = " (Wrapped by " + decorator.getDecorationName() + ")";
        target = decorator.getWrappedParser();
    }
    final boolean composite = target instanceof CompositeParser;

    String label = target.getClass().getName();
    if (apt) {
        // Split the class name after its last dot and wrap the trailing
        // part in apt link markup pointing below ./api/
        int afterLastDot = label.lastIndexOf(".") + 1;
        String prefix = label.substring(0, afterLastDot);
        String simpleName = label.substring(afterLastDot);
        label = prefix + "{{{./api/" + label.replace(".", "/") + "}" + simpleName + "}}";
    } else if (wrapperNote != null) {
        label += wrapperNote;
    }

    // Composite parsers themselves are left out of the apt output
    if (!apt || !composite) {
        String suffix = composite ? " (Composite Parser):" : "";
        System.out.println(indent(i) + bullet + label + suffix);
        if (apt) {
            System.out.println();
        }
        if (includeMimeTypes && !composite) {
            for (MediaType type : target.getSupportedTypes(context)) {
                System.out.println(indent(i + 3) + bullet + type);
                if (apt) {
                    System.out.println();
                }
            }
        }
    }

    if (!composite) {
        return;
    }
    // Children of a composite are not indented any further in apt output
    int childIndent = apt ? i : i + 3;
    Parser[] children = sortParsers(invertMediaTypeMap(((CompositeParser) target).getParsers()));
    for (Parser child : children) {
        displayParser(child, includeMimeTypes, apt, childIndent);
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) NetworkParser(org.apache.tika.parser.NetworkParser) ForkParser(org.apache.tika.fork.ForkParser)

Example 4 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class EmbeddedDocumentUtil, method tryToFindExistingLeafParser.

/**
 * Tries to find an existing parser within the ParseContext.
 * It looks inside of CompositeParsers and ParserDecorators, including
 * decorators that wrap other decorators.
 * The use case is when a parser needs to parse an internal stream
 * that is _part_ of the document, e.g. rtf body inside an msg.
 * <p/>
 * Can return <code>null</code> if the context contains no parser or
 * the correct parser can't be found.
 *
 * @param clazz parser class to search for
 * @param context parse context whose registered {@link Parser} is the
 *                starting point of the search
 * @return the first parser accepted by this class's
 *         <code>equals(Parser, Class)</code> check for <code>clazz</code>,
 *         or <code>null</code> if none is found
 */
public static Parser tryToFindExistingLeafParser(Class clazz, ParseContext context) {
    Parser p = context.get(Parser.class);
    if (equals(p, clazz)) {
        return p;
    }
    // Peel off every decorator layer, not just the outermost one, testing
    // each wrapped parser on the way down. instanceof is false for null,
    // so a missing parser or a decorator wrapping nothing ends the loop.
    while (p instanceof ParserDecorator) {
        p = ((ParserDecorator) p).getWrappedParser();
        if (equals(p, clazz)) {
            return p;
        }
    }
    if (p instanceof CompositeParser) {
        Parser found = findInComposite((CompositeParser) p, clazz, context);
        // Re-check the candidate rather than trusting the helper's result
        if (found != null && equals(found, clazz)) {
            return found;
        }
    }
    return null;
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 5 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class TikaConfigSerializer, method addParser.

/**
 * Adds the given parser below {@code rootElement} and then recurses into
 * the component parsers of a composite parser.
 * <p>
 * When the parser itself is output, its children are added below the
 * element returned for it; otherwise they are added directly below
 * {@code rootElement}.
 *
 * @param mode        serialization mode; decides whether a DefaultParser
 *                    is output itself, expanded into its children, or both
 * @param rootElement element under which the parser is added
 * @param doc         document handed on to the element-creating overload
 * @param parser      parser to add, possibly decorated
 * @throws Exception if the element-creating overload or a recursive call fails
 */
private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
    // Only decorators whose class name starts with ParserDecorator's own
    // name plus "$" are unwrapped; the wrapped parser is then output with
    // the decoration passed alongside it
    ParserDecorator decoration = null;
    Parser target = parser;
    if (target instanceof ParserDecorator
            && target.getClass().getName().startsWith(ParserDecorator.class.getName() + "$")) {
        decoration = (ParserDecorator) target;
        target = decoration.getWrappedParser();
    }

    final boolean isDefault = target instanceof DefaultParser;
    // In CURRENT mode a DefaultParser is output on its own, without children
    final boolean defaultOnly = mode == Mode.CURRENT && isDefault;

    boolean emitSelf = true;
    List<Parser> components = Collections.emptyList();
    if (!defaultOnly && target instanceof CompositeParser) {
        components = ((CompositeParser) target).getAllComponentParsers();
        // A naked composite is not output itself, only its children
        boolean nakedComposite = target.getClass().equals(CompositeParser.class);
        // Making Default static: likewise replaced by its children
        boolean staticDefault = isDefault && (mode == Mode.STATIC || mode == Mode.STATIC_FULL);
        emitSelf = !(nakedComposite || staticDefault);
    }

    Element parent = rootElement;
    if (emitSelf) {
        parent = addParser(mode, rootElement, doc, target, decoration);
    }
    for (Parser component : components) {
        addParser(mode, parent, doc, component);
    }
    // TODO Parser Exclusions
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) DefaultParser(org.apache.tika.parser.DefaultParser) DefaultParser(org.apache.tika.parser.DefaultParser)

Aggregations

CompositeParser (org.apache.tika.parser.CompositeParser)8 Parser (org.apache.tika.parser.Parser)8 ParserDecorator (org.apache.tika.parser.ParserDecorator)8 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)5 DefaultParser (org.apache.tika.parser.DefaultParser)5 EmptyParser (org.apache.tika.parser.EmptyParser)4 Test (org.junit.Test)4 TikaException (org.apache.tika.exception.TikaException)3 MediaType (org.apache.tika.mime.MediaType)3 XMLParser (org.apache.tika.parser.xml.XMLParser)3 TikaConfig (org.apache.tika.config.TikaConfig)2 TikaConfigTest (org.apache.tika.config.TikaConfigTest)2 ErrorParser (org.apache.tika.parser.ErrorParser)2 ExecutableParser (org.apache.tika.parser.executable.ExecutableParser)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 ForkParser (org.apache.tika.fork.ForkParser)1