Search in sources :

Example 11 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaParserConfigTest method testParserExcludeFromDefault.

/**
 * Verifies that the parser configured by {@code TIKA-1558-blacklist.xml} no longer
 * offers the executable media types or an {@link ExecutableParser}, while a freshly
 * constructed {@link DefaultParser} still does.
 */
@Test
public void testParserExcludeFromDefault() throws Exception {
    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    MediaType peExe = MediaType.application("x-msdownload");
    MediaType elf = MediaType.application("x-elf");

    // Pull the configured DefaultParser out from behind its decorator
    CompositeParser topLevel = (CompositeParser) config.getParser();
    ParserDecorator decorated = (ParserDecorator) topLevel.getParsers().get(MediaType.APPLICATION_XML);
    assertNotNull(decorated);
    DefaultParser configured = (DefaultParser) decorated.getWrappedParser();

    // Baseline: an unconfigured DefaultParser built on the same registry
    DefaultParser baseline = new DefaultParser(config.getMediaTypeRegistry());

    // The baseline advertises both executable types and contains the Executable Parser
    assertContains(peExe, baseline.getSupportedTypes(context));
    assertContains(elf, baseline.getSupportedTypes(context));
    boolean baselineHasExecutable = false;
    for (Parser candidate : baseline.getParsers().values()) {
        baselineHasExecutable = candidate instanceof ExecutableParser;
        if (baselineHasExecutable) {
            break;
        }
    }
    assertTrue(baselineHasExecutable);

    // The configured parser must expose neither the types nor the parser itself
    assertNotContained(peExe, configured.getSupportedTypes(context));
    assertNotContained(elf, configured.getSupportedTypes(context));
    for (Parser candidate : configured.getParsers().values()) {
        if (candidate instanceof ExecutableParser) {
            fail("Shouldn't have the Executable Parser from config");
        }
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) DefaultParser(org.apache.tika.parser.DefaultParser) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Example 12 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaCLI method displaySupportedTypes.

/**
 * Prints all the known media types, aliases and matching parser classes.
 * <p>
 * For each type in the registry of a fresh {@link AutoDetectParser} this writes to
 * standard output: the type, each of its aliases, its supertype (when one exists)
 * and the class name of the parser registered for it (when one exists). When the
 * registered parser is a {@link CompositeParser}, the child it holds for the type
 * is reported instead; if the composite has no entry for the type, the composite
 * itself is reported.
 */
private void displaySupportedTypes() {
    AutoDetectParser parser = new AutoDetectParser();
    MediaTypeRegistry registry = parser.getMediaTypeRegistry();
    Map<MediaType, Parser> parsers = parser.getParsers();
    for (MediaType type : registry.getTypes()) {
        System.out.println(type);
        for (MediaType alias : registry.getAliases(type)) {
            System.out.println("  alias:     " + alias);
        }
        MediaType supertype = registry.getSupertype(type);
        if (supertype != null) {
            System.out.println("  supertype: " + supertype);
        }
        Parser p = parsers.get(type);
        if (p != null) {
            if (p instanceof CompositeParser) {
                // The nested map lookup may return null; keep the composite in that
                // case rather than dereferencing null below.
                Parser child = ((CompositeParser) p).getParsers().get(type);
                if (child != null) {
                    p = child;
                }
            }
            System.out.println("  parser:    " + p.getClass().getName());
        }
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser) NetworkParser(org.apache.tika.parser.NetworkParser) ForkParser(org.apache.tika.fork.ForkParser)

Example 13 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaMimeTypes method getMediaTypes.

/**
 * Builds one {@code MediaTypeDetails} entry per media type known to the registry
 * of the current {@code TikaResource} configuration.
 * <p>
 * Each entry carries the type, its aliases, its supertype (left unset when there is
 * none or when it is {@code MediaType.OCTET_STREAM}) and the class name of the parser
 * registered for the type (left unset when there is none). When the registered parser
 * is a {@link CompositeParser}, the child it holds for the type is reported instead;
 * if the composite has no entry for the type, the composite itself is reported.
 *
 * @return the details for every registered media type, in registry iteration order
 */
protected List<MediaTypeDetails> getMediaTypes() {
    MediaTypeRegistry registry = TikaResource.getConfig().getMediaTypeRegistry();
    Map<MediaType, Parser> parsers = ((CompositeParser) TikaResource.getConfig().getParser()).getParsers();
    List<MediaTypeDetails> types = new ArrayList<TikaMimeTypes.MediaTypeDetails>(registry.getTypes().size());
    for (MediaType type : registry.getTypes()) {
        MediaTypeDetails details = new MediaTypeDetails();
        details.type = type;
        details.aliases = registry.getAliases(type).toArray(new MediaType[0]);
        MediaType supertype = registry.getSupertype(type);
        if (supertype != null && !MediaType.OCTET_STREAM.equals(supertype)) {
            details.supertype = supertype;
        }
        Parser p = parsers.get(type);
        if (p != null) {
            if (p instanceof CompositeParser) {
                // The nested map lookup may return null; keep the composite in that
                // case rather than dereferencing null below.
                Parser child = ((CompositeParser) p).getParsers().get(type);
                if (child != null) {
                    p = child;
                }
            }
            details.parser = p.getClass().getName();
        }
        types.add(details);
    }
    return types;
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ArrayList(java.util.ArrayList) MediaType(org.apache.tika.mime.MediaType) MediaTypeRegistry(org.apache.tika.mime.MediaTypeRegistry) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser)

Example 14 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TikaConfigTest method parserWithChildParsers.

/**
 * TIKA-1653: when a parser declares child parsers, those children must only be
 * reachable through their parent and must not be listed again at the top level.
 */
@Test
public void parserWithChildParsers() throws Exception {
    try {
        TikaConfig config = getConfig("TIKA-1653-norepeat.xml");
        CompositeParser root = (CompositeParser) config.getParser();
        List<Parser> topLevel = root.getAllComponentParsers();

        // Exactly two parsers are expected at the top level
        assertEquals(2, topLevel.size());

        // First: a CompositeParser holding two children
        Parser composite = topLevel.get(0);
        assertTrue(composite.toString(), composite instanceof CompositeParser);
        assertEquals(2, ((CompositeParser) composite).getAllComponentParsers().size());

        // Second: a decorator around an EmptyParser, advertising hello/world
        Parser decorated = topLevel.get(1);
        assertTrue(decorated.toString(), decorated instanceof ParserDecorator);
        assertEquals(EmptyParser.class, ((ParserDecorator) decorated).getWrappedParser().getClass());
        assertEquals("hello/world", decorated.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 15 with CompositeParser

use of org.apache.tika.parser.CompositeParser in project tika by apache.

the class TIAParsingExample method useCompositeParser.

/**
 * Shows how to assemble a {@link CompositeParser} by hand: an HTML parser and an
 * XML parser are registered by media type, a {@link TXTParser} is set as the
 * fallback, and an empty stream whose metadata declares {@code text/html} is parsed.
 *
 * @throws Exception if building the parsers or parsing the stream fails
 */
public static void useCompositeParser() throws Exception {
    // Registry of media type -> parser handed to the composite
    Map<MediaType, Parser> byType = new HashMap<MediaType, Parser>();
    byType.put(MediaType.parse("text/html"), new HtmlParser());
    byType.put(MediaType.parse("application/xml"), new XMLParser());

    CompositeParser composite = new CompositeParser();
    composite.setParsers(byType);
    composite.setFallback(new TXTParser());

    // Declare the content type up front so the composite can pick a parser
    Metadata meta = new Metadata();
    meta.set(Metadata.CONTENT_TYPE, "text/html");

    InputStream input = new ByteArrayInputStream(new byte[0]);
    ContentHandler sink = new DefaultHandler();
    ParseContext parseContext = new ParseContext();
    composite.parse(input, sink, meta, parseContext);
}
Also used : HashMap(java.util.HashMap) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) CompositeParser(org.apache.tika.parser.CompositeParser) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) TXTParser(org.apache.tika.parser.txt.TXTParser) MediaType(org.apache.tika.mime.MediaType) XMLParser(org.apache.tika.parser.xml.XMLParser)

Aggregations

CompositeParser (org.apache.tika.parser.CompositeParser)17 Parser (org.apache.tika.parser.Parser)16 Test (org.junit.Test)10 MediaType (org.apache.tika.mime.MediaType)9 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)9 DefaultParser (org.apache.tika.parser.DefaultParser)7 ParserDecorator (org.apache.tika.parser.ParserDecorator)7 TikaConfig (org.apache.tika.config.TikaConfig)6 EmptyParser (org.apache.tika.parser.EmptyParser)5 XMLParser (org.apache.tika.parser.xml.XMLParser)4 InputStream (java.io.InputStream)3 TikaConfigTest (org.apache.tika.config.TikaConfigTest)3 TikaException (org.apache.tika.exception.TikaException)3 ForkParser (org.apache.tika.fork.ForkParser)3 TikaInputStream (org.apache.tika.io.TikaInputStream)3 ExecutableParser (org.apache.tika.parser.executable.ExecutableParser)3 TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser)3 TikaTest (org.apache.tika.TikaTest)2 Metadata (org.apache.tika.metadata.Metadata)2 MediaTypeRegistry (org.apache.tika.mime.MediaTypeRegistry)2