Search in sources :

Example 6 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaCLI method displayParser.

/**
 * Prints one parser (and, for composites, its sub-parsers recursively) to standard output.
 * <p>
 * A {@link ParserDecorator} is unwrapped one level before display; its decoration
 * name is appended to the printed name only in non-apt output. In apt output the class
 * name is rendered as an apt link under {@code ./api/}, every printed line is followed
 * by a blank line, and composite parsers themselves are not printed (only their children,
 * without extra indentation).
 *
 * @param p                the parser to display
 * @param includeMimeTypes whether to list the supported media types of non-composite parsers
 * @param apt              whether to emit apt-formatted output
 * @param i                the current indentation level, passed to {@code indent}
 */
private void displayParser(Parser p, boolean includeMimeTypes, boolean apt, int i) {
    String wrapperNote = null;
    if (p instanceof ParserDecorator) {
        ParserDecorator decorator = (ParserDecorator) p;
        wrapperNote = " (Wrapped by " + decorator.getDecorationName() + ")";
        p = decorator.getWrappedParser();
    }

    final boolean composite = p instanceof CompositeParser;
    final String bullet = apt ? "* " : "";

    String label = p.getClass().getName();
    if (apt) {
        // Split "pkg.Simple" into "pkg." and "Simple" so only the simple name becomes the link text.
        int simpleNameStart = label.lastIndexOf('.') + 1;
        String packagePrefix = label.substring(0, simpleNameStart);
        String simpleName = label.substring(simpleNameStart);
        label = packagePrefix + "{{{./api/" + label.replace('.', '/') + "}" + simpleName + "}}";
    } else if (wrapperNote != null) {
        label += wrapperNote;
    }

    // Composite parsers are not displayed in the apt output.
    if (!apt || !composite) {
        String compositeSuffix = composite ? " (Composite Parser):" : "";
        System.out.println(indent(i) + bullet + label + compositeSuffix);
        if (apt) {
            System.out.println();
        }
        if (includeMimeTypes && !composite) {
            for (MediaType type : p.getSupportedTypes(context)) {
                System.out.println(indent(i + 3) + bullet + type);
                if (apt) {
                    System.out.println();
                }
            }
        }
    }

    if (!composite) {
        return;
    }

    // Children of a composite are not indented further in apt output.
    final int childIndent = apt ? i : i + 3;
    Parser[] children = sortParsers(invertMediaTypeMap(((CompositeParser) p).getParsers()));
    for (Parser child : children) {
        displayParser(child, includeMimeTypes, apt, childIndent);
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) NetworkParser(org.apache.tika.parser.NetworkParser) ForkParser(org.apache.tika.fork.ForkParser)

Example 7 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class BundleIT method testBundleParsers.

/**
 * Checks that the parser class names exposed through the OSGi parser service match
 * those reachable from {@code defaultParser}, expanding any nested
 * {@link DefaultParser} into its component parsers.
 */
@Test
public void testBundleParsers() throws Exception {
    // Class names of the parsers visible through the OSGi service
    ServiceReference<Parser> serviceRef = bc.getServiceReference(Parser.class);
    DefaultParser osgiDefault = (DefaultParser) bc.getService(serviceRef);
    Set<String> osgiParsers = new HashSet<>();
    for (Parser component : osgiDefault.getAllComponentParsers()) {
        osgiParsers.add(component.getClass().getName());
    }

    // Sanity check: we should have found a reasonable number of them
    int osgiCount = osgiParsers.size();
    assertTrue("Should have lots Parser names, found " + osgiCount, osgiCount > 15);

    // Class names of the parsers found by the traditional service loading mechanism
    CompositeParser composite = (CompositeParser) defaultParser;
    Set<String> rawParsers = new HashSet<>();
    for (Parser component : composite.getAllComponentParsers()) {
        if (!(component instanceof DefaultParser)) {
            rawParsers.add(component.getClass().getName());
            continue;
        }
        // A nested DefaultParser contributes its children rather than itself
        for (Parser nested : ((DefaultParser) component).getAllComponentParsers()) {
            rawParsers.add(nested.getClass().getName());
        }
    }

    assertEquals(rawParsers, osgiParsers);
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) DefaultParser(org.apache.tika.parser.DefaultParser) ForkParser(org.apache.tika.fork.ForkParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) DefaultParser(org.apache.tika.parser.DefaultParser) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 8 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class EmbeddedDocumentUtil method tryToFindExistingLeafParser.

/**
 * Looks for a parser of the requested class starting from the {@link Parser}
 * registered in the given {@link ParseContext}.
 * <p>
 * The context's parser is checked first; if it is a {@link ParserDecorator} its
 * wrapped parser is checked next; and if that is a {@link CompositeParser} the
 * search continues inside the composite. Typical use: a parser needs to handle an
 * internal stream that is <em>part</em> of the document, e.g. an rtf body inside an msg.
 *
 * @param clazz   parser class to search for
 * @param context parse context from which the starting parser is obtained
 * @return the matching parser, or <code>null</code> if the context holds no parser
 *         or no matching parser could be found
 */
public static Parser tryToFindExistingLeafParser(Class clazz, ParseContext context) {
    Parser candidate = context.get(Parser.class);
    if (equals(candidate, clazz)) {
        return candidate;
    }
    if (candidate == null) {
        return null;
    }

    // Look through one level of decoration before matching again.
    if (candidate instanceof ParserDecorator) {
        candidate = ((ParserDecorator) candidate).getWrappedParser();
    }
    if (equals(candidate, clazz)) {
        return candidate;
    }
    if (!(candidate instanceof CompositeParser)) {
        return null;
    }

    Parser found = findInComposite((CompositeParser) candidate, clazz, context);
    return (found != null && equals(found, clazz)) ? found : null;
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 9 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class DumpTikaConfigExampleTest method testDump.

/**
 * Serializes the default Tika configuration to {@code configFile} for every
 * combination of charset (UTF-8, UTF-16LE) and serializer mode, then reloads the
 * file and checks that it yields a composite parser and detector with the expected
 * minimum number of components, and that an {@link AutoDetectParser} can be built from it.
 */
@Test
public void testDump() throws Exception {
    DumpTikaConfigExample ex = new DumpTikaConfigExample();
    for (Charset charset : new Charset[] { UTF_8, UTF_16LE }) {
        for (TikaConfigSerializer.Mode mode : TikaConfigSerializer.Mode.values()) {
            // try-with-resources guarantees the file handle is released even if
            // serialization throws; close() also flushes the writer.
            try (Writer writer = new OutputStreamWriter(new FileOutputStream(configFile), charset)) {
                TikaConfigSerializer.serialize(TikaConfig.getDefaultConfig(), mode, writer, charset);
            }
            TikaConfig c = new TikaConfig(configFile);
            assertTrue(c.getParser().toString(), c.getParser() instanceof CompositeParser);
            assertTrue(c.getDetector().toString(), c.getDetector() instanceof CompositeDetector);
            CompositeParser p = (CompositeParser) c.getParser();
            assertTrue("enough parsers?", p.getParsers().size() > 130);
            CompositeDetector d = (CompositeDetector) c.getDetector();
            assertTrue("enough detectors?", d.getDetectors().size() > 3);
            //just try to load it into autodetect to make sure no errors are thrown
            Parser auto = new AutoDetectParser(c);
            assertNotNull(auto);
        }
    }
}
Also used : CompositeDetector(org.apache.tika.detect.CompositeDetector) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) FileOutputStream(java.io.FileOutputStream) Charset(java.nio.charset.Charset) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) OutputStreamWriter(java.io.OutputStreamWriter) TikaConfigSerializer(org.apache.tika.config.TikaConfigSerializer) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 10 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaConfigSerializer method addParser.

/**
 * Adds the given parser, and recursively any component parsers it contributes,
 * to the configuration document beneath {@code rootElement}.
 * <p>
 * A decorator whose class is nested in {@link ParserDecorator} is unwrapped and
 * passed along as the decoration of the wrapped parser. A {@link DefaultParser}
 * in {@code CURRENT} mode is output without its children. Any other
 * {@link CompositeParser} has its children output; the composite itself is omitted
 * when it is a plain {@code CompositeParser}, or a {@code DefaultParser} in one of
 * the static modes.
 *
 * @param mode        the serialization mode
 * @param rootElement the element under which parser elements are added
 * @param doc         the document being built
 * @param parser      the parser to add
 */
private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
    // Only decorators declared as nested classes of ParserDecorator are unwrapped,
    // so that the wrapped parser is the one that gets output.
    ParserDecorator decoration = null;
    boolean unwrappable = parser instanceof ParserDecorator
            && parser.getClass().getName().startsWith(ParserDecorator.class.getName() + "$");
    if (unwrappable) {
        decoration = (ParserDecorator) parser;
        parser = decoration.getWrappedParser();
    }

    final boolean isDefault = parser instanceof DefaultParser;
    // In CURRENT mode a DefaultParser is output on its own, without its children.
    final boolean defaultWithoutChildren = (mode == Mode.CURRENT) && isDefault;

    boolean outputParser = true;
    List<Parser> children = Collections.emptyList();
    if (!defaultWithoutChildren && parser instanceof CompositeParser) {
        children = ((CompositeParser) parser).getAllComponentParsers();
        // A naked composite is never output itself
        boolean nakedComposite = parser.getClass().equals(CompositeParser.class);
        // Turning Default into a static list means dropping the Default entry
        boolean defaultMadeStatic = isDefault && (mode == Mode.STATIC || mode == Mode.STATIC_FULL);
        outputParser = !(nakedComposite || defaultMadeStatic);
    }

    if (outputParser) {
        rootElement = addParser(mode, rootElement, doc, parser, decoration);
    }
    for (Parser child : children) {
        addParser(mode, rootElement, doc, child);
    }
    // TODO Parser Exclusions
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) DefaultParser(org.apache.tika.parser.DefaultParser) DefaultParser(org.apache.tika.parser.DefaultParser)

Aggregations

CompositeParser (org.apache.tika.parser.CompositeParser)17 Parser (org.apache.tika.parser.Parser)16 Test (org.junit.Test)10 MediaType (org.apache.tika.mime.MediaType)9 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)9 DefaultParser (org.apache.tika.parser.DefaultParser)7 ParserDecorator (org.apache.tika.parser.ParserDecorator)7 TikaConfig (org.apache.tika.config.TikaConfig)6 EmptyParser (org.apache.tika.parser.EmptyParser)5 XMLParser (org.apache.tika.parser.xml.XMLParser)4 InputStream (java.io.InputStream)3 TikaConfigTest (org.apache.tika.config.TikaConfigTest)3 TikaException (org.apache.tika.exception.TikaException)3 ForkParser (org.apache.tika.fork.ForkParser)3 TikaInputStream (org.apache.tika.io.TikaInputStream)3 ExecutableParser (org.apache.tika.parser.executable.ExecutableParser)3 TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser)3 TikaTest (org.apache.tika.TikaTest)2 Metadata (org.apache.tika.metadata.Metadata)2 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)2