Search in sources :

Example 1 with DefaultParser

Use of org.apache.tika.parser.DefaultParser in the Apache lucene-solr project.

Class ExtractingDocumentLoader, method load.

/**
 * Runs a Tika parser over {@code stream} and either indexes the extracted document or,
 * in extract-only mode, adds the serialized content and its metadata to the response.
 *
 * <p>The parser is looked up by the {@code ExtractingParams.STREAM_TYPE} request parameter
 * when it is present; otherwise the loader's auto-detecting parser is used.
 *
 * @param req       the request; supplies the stream type, resource name, schema and core
 * @param rsp       the response; receives the extracted content and a
 *                  {@code <stream name>_metadata} entry when extract-only mode is enabled
 * @param stream    the content to parse; its stream is closed quietly before returning
 * @param processor not referenced directly by this method
 * @throws SolrException with {@code BAD_REQUEST} when no parser is available, or with
 *                       {@code SERVER_ERROR} wrapping a {@code SAXException} or a
 *                       {@code TikaException} (the latter only when Tika exceptions are
 *                       not configured to be ignored)
 * @throws Exception     any other failure raised while reading or parsing the stream
 */
@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser == null) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }

    Metadata metadata = new Metadata();
    // If you specify the resource name (the filename, roughly) with this parameter,
    // then Tika can make use of it in guessing the appropriate MIME type:
    String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
    if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
    }
    // Provide stream's content type as hint for auto detection
    if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
    }
    InputStream inputStream = null;
    try {
        inputStream = stream.getStream();
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
            metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }
        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
        ContentHandler parsingHandler = handler;
        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly) {
            // Extract-only: parse into a serializer backed by an in-memory writer.
            String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
            writer = new StringWriter();
            if (extractFormat.equals(TEXT_FORMAT)) {
                serializer = new TextSerializer();
                serializer.setOutputCharStream(writer);
                serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
            } else {
                serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
            }
            if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                //The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
                serializer.startDocument();
                parsingHandler = new MatchingContentHandler(serializer, matcher);
            } else {
                parsingHandler = serializer;
            }
        } else if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            parsingHandler = new MatchingContentHandler(handler, matcher);
        }
        try {
            //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
            ParseContext context = parseContextConfig.create();
            context.set(Parser.class, parser);
            context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
            // Password handling
            RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
            String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
            if (pwMapFile != null && pwMapFile.length() > 0) {
                InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                if (is != null) {
                    try {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    } finally {
                        // This method opened the resource, so it releases it; closing again is
                        // harmless should parse() already have closed the stream.
                        IOUtils.closeQuietly(is);
                    }
                }
            }
            context.set(PasswordProvider.class, epp);
            String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
            if (resourcePassword != null) {
                epp.setExplicitPassword(resourcePassword);
                log.debug("Literal password supplied for file " + resourceName);
            }
            parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
            if (ignoreTikaException) {
                // Best effort: keep going with whatever was collected before the failure.
                log.warn("skip extracting text due to " + e.getLocalizedMessage() + ". metadata=" + metadata.toString());
            } else {
                throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
            }
        }
        if (!extractOnly) {
            addDoc(handler);
        } else {
            //serializer is not null, so we need to call endDoc on it if using xpath
            if (xpathExpr != null) {
                serializer.endDocument();
            }
            rsp.add(stream.getName(), writer.toString());
            writer.close();
            // Report every metadata name with all of its values.
            NamedList<String[]> metadataNL = new NamedList<>();
            for (String name : metadata.names()) {
                metadataNL.add(name, metadata.getValues(name));
            }
            rsp.add(stream.getName() + "_metadata", metadataNL);
        }
    } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
    } finally {
        IOUtils.closeQuietly(inputStream);
    }
}
Also used : Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) Metadata(org.apache.tika.metadata.Metadata) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) StringWriter(java.io.StringWriter) MediaType(org.apache.tika.mime.MediaType) SolrException(org.apache.solr.common.SolrException) DefaultParser(org.apache.tika.parser.DefaultParser) XMLSerializer(org.apache.xml.serialize.XMLSerializer) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) NamedList(org.apache.solr.common.util.NamedList) BaseMarkupSerializer(org.apache.xml.serialize.BaseMarkupSerializer) OutputFormat(org.apache.xml.serialize.OutputFormat) Parser(org.apache.tika.parser.Parser) XPathParser(org.apache.tika.sax.xpath.XPathParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) TextSerializer(org.apache.xml.serialize.TextSerializer) ParseContext(org.apache.tika.parser.ParseContext)

Example 2 with DefaultParser

Use of org.apache.tika.parser.DefaultParser in the Apache tika project.

Class TikaConfigTest, method defaultParserWithExcludes.

/**
 * TIKA-1445: it should be possible to exclude DefaultParser from certain
 * types, so that another, explicitly listed parser takes them instead.
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    try {
        TikaConfig config = getConfig("TIKA-1445-default-except.xml");
        CompositeParser composite = (CompositeParser) config.getParser();
        List<Parser> components = composite.getAllComponentParsers();

        // Exactly the three parsers defined in the xml are expected
        assertEquals(3, components.size());

        // First comes a decorated DefaultParser rather than the plain one,
        // because it is excluded from handling certain classes
        Parser decoratedDefault = components.get(0);
        assertTrue(decoratedDefault.toString(), decoratedDefault instanceof ParserDecorator);
        assertEquals(DefaultParser.class, ((ParserDecorator) decoratedDefault).getWrappedParser().getClass());

        // The remaining two claim types which they wouldn't otherwise handle
        Parser decoratedEmpty = components.get(1);
        assertTrue(decoratedEmpty.toString(), decoratedEmpty instanceof ParserDecorator);
        assertEquals(EmptyParser.class, ((ParserDecorator) decoratedEmpty).getWrappedParser().getClass());
        assertEquals("hello/world", decoratedEmpty.getSupportedTypes(null).iterator().next().toString());

        Parser decoratedError = components.get(2);
        assertTrue(decoratedError.toString(), decoratedError instanceof ParserDecorator);
        assertEquals(ErrorParser.class, ((ParserDecorator) decoratedError).getWrappedParser().getClass());
        assertEquals("fail/world", decoratedError.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 3 with DefaultParser

Use of org.apache.tika.parser.DefaultParser in the Apache tika project.

Class TikaParserConfigTest, method testMimeExcludeInclude.

/**
 * Loads the "TIKA-1558-blacklist.xml" config and checks that it yields two decorated
 * parsers: a DefaultParser whose decorator hides PDF and JPEG, and an EmptyParser
 * whose decorator claims PDF only.
 */
@Test
public void testMimeExcludeInclude() throws Exception {
    final MediaType pdf = MediaType.application("pdf");
    final MediaType jpeg = MediaType.image("jpeg");

    TikaConfig config = getConfig("TIKA-1558-blacklist.xml");
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    // The top-level parser is a plain composite holding two parsers
    Parser topLevel = config.getParser();
    assertEquals(CompositeParser.class, topLevel.getClass());
    CompositeParser composite = (CompositeParser) topLevel;
    assertEquals(2, composite.getAllComponentParsers().size());

    // Both components must be decorators before they are cast
    assertTrue(composite.getAllComponentParsers().get(0) instanceof ParserDecorator);
    assertTrue(composite.getAllComponentParsers().get(1) instanceof ParserDecorator);
    ParserDecorator decoratedDefault = (ParserDecorator) composite.getAllComponentParsers().get(0);
    ParserDecorator decoratedEmpty = (ParserDecorator) composite.getAllComponentParsers().get(1);

    // The wrapped DefaultParser still supports the excluded types; its decorator does not
    assertEquals(DefaultParser.class, decoratedDefault.getWrappedParser().getClass());
    for (MediaType excluded : new MediaType[] { pdf, jpeg }) {
        assertNotContained(excluded, decoratedDefault.getSupportedTypes(context));
        assertContains(excluded, decoratedDefault.getWrappedParser().getSupportedTypes(context));
    }

    // PDF is claimed by the decorator around an EmptyParser instead
    assertEquals(EmptyParser.class, decoratedEmpty.getWrappedParser().getClass());
    assertEquals(1, decoratedEmpty.getSupportedTypes(context).size());
    assertContains(pdf, decoratedEmpty.getSupportedTypes(context));
    assertNotContained(pdf, decoratedEmpty.getWrappedParser().getSupportedTypes(context));
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Example 4 with DefaultParser

Use of org.apache.tika.parser.DefaultParser in the Apache tika project.

Class TikaParserConfigTest, method defaultParserBlacklist.

/**
 * TIKA-1558: it should be possible to exclude Parsers from being picked up
 * by DefaultParser.
 */
@Test
public void defaultParserBlacklist() throws Exception {
    TikaConfig config = new TikaConfig();
    assertNotNull(config.getParser());
    assertNotNull(config.getDetector());

    // Baseline: the default config is expected to contain an XMLParser
    List<Parser> defaultParsers = ((CompositeParser) config.getParser()).getAllComponentParsers();
    boolean foundXml = false;
    for (int i = 0; i < defaultParsers.size() && !foundXml; i++) {
        foundXml = defaultParsers.get(i) instanceof XMLParser;
    }
    assertTrue("Default config should include an XMLParser.", foundXml);

    // This custom TikaConfig should exclude XMLParser and all of its subclasses.
    TikaConfig custom = getConfig("TIKA-1558-blacklistsub.xml");
    List<Parser> customParsers = ((CompositeParser) custom.getParser()).getAllComponentParsers();
    for (Parser candidate : customParsers) {
        if (candidate instanceof XMLParser) {
            fail("Custom config should not include an XMLParser (" + candidate.getClass() + ").");
        }
    }
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) Parser(org.apache.tika.parser.Parser) ExecutableParser(org.apache.tika.parser.executable.ExecutableParser) CompositeParser(org.apache.tika.parser.CompositeParser) XMLParser(org.apache.tika.parser.xml.XMLParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test)

Example 5 with DefaultParser

Use of org.apache.tika.parser.DefaultParser in the Apache tika project.

Class BundleIT, method testBundleParsers.

/**
 * Compares the parser class names exposed by the OSGi Parser service with those
 * reachable from {@code defaultParser}; the two sets must be equal.
 */
@Test
public void testBundleParsers() throws Exception {
    // Collect the parser class names found within OSGi
    ServiceReference<Parser> reference = bc.getServiceReference(Parser.class);
    DefaultParser osgiService = (DefaultParser) bc.getService(reference);
    Set<String> namesFromOsgi = new HashSet<>();
    for (Parser component : osgiService.getAllComponentParsers()) {
        namesFromOsgi.add(component.getClass().getName());
    }

    // Check we did get a few, just in case...
    assertTrue("Should have lots Parser names, found " + namesFromOsgi.size(), namesFromOsgi.size() > 15);

    // Collect the raw parsers list from the traditional service loading mechanism,
    // expanding any nested DefaultParser into its own components
    Set<String> namesFromServiceLoader = new HashSet<>();
    for (Parser component : ((CompositeParser) defaultParser).getAllComponentParsers()) {
        if (!(component instanceof DefaultParser)) {
            namesFromServiceLoader.add(component.getClass().getName());
            continue;
        }
        for (Parser nested : ((DefaultParser) component).getAllComponentParsers()) {
            namesFromServiceLoader.add(nested.getClass().getName());
        }
    }

    assertEquals(namesFromServiceLoader, namesFromOsgi);
}
Also used : CompositeParser(org.apache.tika.parser.CompositeParser) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) DefaultParser(org.apache.tika.parser.DefaultParser) ForkParser(org.apache.tika.fork.ForkParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) DefaultParser(org.apache.tika.parser.DefaultParser) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

DefaultParser (org.apache.tika.parser.DefaultParser) 12, Parser (org.apache.tika.parser.Parser) 10, Test (org.junit.Test) 8, CompositeParser (org.apache.tika.parser.CompositeParser) 7, MediaType (org.apache.tika.mime.MediaType) 5, EmptyParser (org.apache.tika.parser.EmptyParser) 4, ParserDecorator (org.apache.tika.parser.ParserDecorator) 4, TikaTest (org.apache.tika.TikaTest) 3, ParseContext (org.apache.tika.parser.ParseContext) 3, ExecutableParser (org.apache.tika.parser.executable.ExecutableParser) 3, XMLParser (org.apache.tika.parser.xml.XMLParser) 3, TikaException (org.apache.tika.exception.TikaException) 2, AutoDetectParser (org.apache.tika.parser.AutoDetectParser) 2, InputStream (java.io.InputStream) 1, StringWriter (java.io.StringWriter) 1, HashSet (java.util.HashSet) 1, Properties (java.util.Properties) 1, SolrException (org.apache.solr.common.SolrException) 1, NamedList (org.apache.solr.common.util.NamedList) 1, TikaConfig (org.apache.tika.config.TikaConfig) 1