Search in sources :

Example 6 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class TIAParsingExample, method testCompositeDocument.

/**
 * Example: registers a custom {@link Parser} in the {@link ParseContext} and
 * then parses an empty in-memory stream with an {@link AutoDetectParser}.
 *
 * <p>The registered parser is an anonymous {@link ParserDecorator} around the
 * auto-detect parser; its {@code parse} override is the place where custom
 * handling of a component document would go (left empty in this example).
 *
 * @throws Exception if parsing fails
 */
public static void testCompositeDocument() throws Exception {
    Parser autoDetect = new AutoDetectParser();

    // Hook parser: wraps the auto-detect parser but replaces its parse step.
    Parser componentHook = new ParserDecorator(autoDetect) {

        private static final long serialVersionUID = 4424210691523343833L;

        @Override
        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
            // custom processing of the component document
        }
    };

    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, componentHook);

    // Empty input, a no-op SAX handler and fresh metadata for the outer parse.
    InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
    ContentHandler noOpHandler = new DefaultHandler();
    Metadata freshMetadata = new Metadata();

    autoDetect.parse(emptyInput, noOpHandler, freshMetadata, parseContext);
}
Also used : TikaException(org.apache.tika.exception.TikaException) GZIPInputStream(java.util.zip.GZIPInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) ParserDecorator(org.apache.tika.parser.ParserDecorator) Metadata(org.apache.tika.metadata.Metadata) IOException(java.io.IOException) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) LinkContentHandler(org.apache.tika.sax.LinkContentHandler) TeeContentHandler(org.apache.tika.sax.TeeContentHandler) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) XMLParser(org.apache.tika.parser.xml.XMLParser) HtmlParser(org.apache.tika.parser.html.HtmlParser) TXTParser(org.apache.tika.parser.txt.TXTParser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) SAXException(org.xml.sax.SAXException) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 7 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class TikaConfigSerializer, method addParser.

/**
 * Adds the given parser, and recursively any component parsers it exposes,
 * beneath {@code rootElement}.
 *
 * <p>A parser that is an instance of a class nested inside
 * {@link ParserDecorator} is unwrapped first; the decorator is passed along
 * to the five-argument {@code addParser} overload together with the wrapped
 * parser.
 *
 * @param mode        serialization mode controlling what is output
 * @param rootElement element under which the parser is added
 * @param doc         document passed through to the element-creating overload
 * @param parser      parser to add
 * @throws Exception if the overload or a recursive call fails
 */
private static void addParser(Mode mode, Element rootElement, Document doc, Parser parser) throws Exception {
    // If the parser is decorated, is it a kind where we output the parser inside?
    ParserDecorator decoration = null;
    Parser target = parser;
    if (parser instanceof ParserDecorator
            && parser.getClass().getName().startsWith(ParserDecorator.class.getName() + "$")) {
        decoration = (ParserDecorator) parser;
        target = decoration.getWrappedParser();
    }

    // In CURRENT mode a DefaultParser is output on its own, without its children.
    boolean defaultInCurrentMode = mode == Mode.CURRENT && target instanceof DefaultParser;

    boolean outputParser = true;
    List<Parser> children = Collections.emptyList();
    if (!defaultInCurrentMode && target instanceof CompositeParser) {
        children = ((CompositeParser) target).getAllComponentParsers();

        // A naked composite is not output itself, only its children are.
        boolean nakedComposite = target.getClass().equals(CompositeParser.class);
        // Making Default static: skip the DefaultParser and list its children instead.
        boolean defaultMadeStatic = target instanceof DefaultParser
                && (mode == Mode.STATIC || mode == Mode.STATIC_FULL);

        outputParser = !(nakedComposite || defaultMadeStatic);
    }

    // Children go under the element returned for this parser when it is output,
    // otherwise directly under the element we were given.
    Element parent = rootElement;
    if (outputParser) {
        parent = addParser(mode, rootElement, doc, target, decoration);
    }
    for (Parser child : children) {
        addParser(mode, parent, doc, child);
    }
    // TODO Parser Exclusions
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) DefaultParser(org.apache.tika.parser.DefaultParser) DefaultParser(org.apache.tika.parser.DefaultParser)

Example 8 with ParserDecorator

Use of org.apache.tika.parser.ParserDecorator in the Apache Tika project.

From the class TikaParserConfigTest, method testParserExcludeFromDefault.

/**
 * Checks that the DefaultParser obtained from the
 * {@code TIKA-1558-blacklist.xml} config neither advertises the executable
 * media types nor contains an {@link ExecutableParser}, whereas a freshly
 * constructed DefaultParser does both.
 */
@Test
public void testParserExcludeFromDefault() throws Exception {
    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    MediaType peExe = MediaType.application("x-msdownload");
    MediaType elf = MediaType.application("x-elf");

    // Configured side: look up the decorator registered for XML and unwrap it
    // to reach the DefaultParser built from the config.
    CompositeParser configuredRoot = (CompositeParser) config.getParser();
    ParserDecorator configuredWrapper = (ParserDecorator) configuredRoot.getParsers().get(MediaType.APPLICATION_XML);
    assertNotNull(configuredWrapper);
    DefaultParser configuredDefault = (DefaultParser) configuredWrapper.getWrappedParser();

    // Baseline side: a fresh DefaultParser built on the same media type registry.
    DefaultParser freshDefault = new DefaultParser(config.getMediaTypeRegistry());

    // The baseline advertises both executable types ...
    assertContains(peExe, freshDefault.getSupportedTypes(context));
    assertContains(elf, freshDefault.getSupportedTypes(context));

    // ... and holds at least one ExecutableParser.
    boolean foundExecutable = false;
    for (Parser candidate : freshDefault.getParsers().values()) {
        if (candidate instanceof ExecutableParser) {
            foundExecutable = true;
            break;
        }
    }
    assertTrue(foundExecutable);

    // The configured parser advertises neither type ...
    assertNotContained(peExe, configuredDefault.getSupportedTypes(context));
    assertNotContained(elf, configuredDefault.getSupportedTypes(context));

    // ... and must not hold any ExecutableParser.
    for (Parser candidate : configuredDefault.getParsers().values()) {
        if (candidate instanceof ExecutableParser) {
            fail("Shouldn't have the Executable Parser from config");
        }
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) DefaultParser(org.apache.tika.parser.DefaultParser) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Aggregations

CompositeParser (org.apache.tika.parser.CompositeParser)8 Parser (org.apache.tika.parser.Parser)8 ParserDecorator (org.apache.tika.parser.ParserDecorator)8 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)5 DefaultParser (org.apache.tika.parser.DefaultParser)5 EmptyParser (org.apache.tika.parser.EmptyParser)4 Test (org.junit.Test)4 TikaException (org.apache.tika.exception.TikaException)3 MediaType (org.apache.tika.mime.MediaType)3 XMLParser (org.apache.tika.parser.xml.XMLParser)3 TikaConfig (org.apache.tika.config.TikaConfig)2 TikaConfigTest (org.apache.tika.config.TikaConfigTest)2 ErrorParser (org.apache.tika.parser.ErrorParser)2 ExecutableParser (org.apache.tika.parser.executable.ExecutableParser)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 GZIPInputStream (java.util.zip.GZIPInputStream)1 ForkParser (org.apache.tika.fork.ForkParser)1