
Example 1 with Parser

Use of org.apache.tika.parser.Parser in project lucene-solr by apache.

The class TikaEntityProcessor, method nextRow.

@Override
public Map<String, Object> nextRow() {
    if (done)
        return null;
    Map<String, Object> row = new HashMap<>();
    DataSource<InputStream> dataSource = context.getDataSource();
    InputStream is = dataSource.getData(context.getResolvedEntityAttribute(URL));
    ContentHandler contentHandler = null;
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    try {
        if ("html".equals(format)) {
            contentHandler = getHtmlHandler(sw);
        } else if ("xml".equals(format)) {
            contentHandler = getXmlContentHandler(sw);
        } else if ("text".equals(format)) {
            contentHandler = getTextContentHandler(sw);
        } else if ("none".equals(format)) {
            contentHandler = new DefaultHandler();
        }
    } catch (TransformerConfigurationException e) {
        wrapAndThrow(SEVERE, e, "Unable to create content handler");
    }
    Parser tikaParser = null;
    if (parser.equals(AUTO_PARSER)) {
        tikaParser = new AutoDetectParser(tikaConfig);
    } else {
        tikaParser = context.getSolrCore().getResourceLoader().newInstance(parser, Parser.class);
    }
    try {
        ParseContext context = new ParseContext();
        if ("identity".equals(htmlMapper)) {
            context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
        }
        if (extractEmbedded) {
            context.set(Parser.class, tikaParser);
        }
        tikaParser.parse(is, contentHandler, metadata, context);
    } catch (Exception e) {
        if (SKIP.equals(onError)) {
            throw new DataImportHandlerException(DataImportHandlerException.SKIP_ROW, "Document skipped :" + e.getMessage());
        }
        wrapAndThrow(SEVERE, e, "Unable to read content");
    }
    IOUtils.closeQuietly(is);
    for (Map<String, String> field : context.getAllEntityFields()) {
        if (!"true".equals(field.get("meta")))
            continue;
        String col = field.get(COLUMN);
        String s = metadata.get(col);
        if (s != null)
            row.put(col, s);
    }
    if (!"none".equals(format))
        row.put("text", sw.toString());
    tryToAddLatLon(metadata, row);
    done = true;
    return row;
}
Also used: TransformerConfigurationException(javax.xml.transform.TransformerConfigurationException) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) StringWriter(java.io.StringWriter) ParseContext(org.apache.tika.parser.ParseContext)
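
The method above wires four things together: a format-specific ContentHandler, a Parser chosen per configuration, a Metadata object for extracted fields, and a ParseContext that, when extractEmbedded is set, registers the parser so embedded documents are parsed recursively. Below is a minimal standalone sketch of the same flow outside the DataImportHandler; the sample.pdf path is made up, and AutoDetectParser stands in for the AUTO_PARSER case.

import java.io.InputStream;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;

public class StandaloneTikaExtract {
    public static void main(String[] args) throws Exception {
        // AutoDetectParser plays the role of the AUTO_PARSER branch above.
        Parser tikaParser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        StringWriter sw = new StringWriter();
        ParseContext parseContext = new ParseContext();
        // Registering the parser in the ParseContext is what the extractEmbedded
        // branch does: it lets Tika recurse into embedded documents.
        parseContext.set(Parser.class, tikaParser);
        // sample.pdf is a hypothetical input file.
        try (InputStream is = Files.newInputStream(Paths.get("sample.pdf"))) {
            // BodyContentHandler(Writer) captures plain text, like the "text" format branch.
            tikaParser.parse(is, new BodyContentHandler(sw), metadata, parseContext);
        }
        System.out.println("text: " + sw);
        for (String name : metadata.names()) {
            System.out.println(name + " = " + metadata.get(name));
        }
    }
}

Registering the parser under Parser.class in the ParseContext is the conventional way to let container formats (archives, e-mail, Office documents) hand their attachments back to Tika for recursive extraction.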

Example 2 with Parser

Use of org.apache.tika.parser.Parser in project lucene-solr by apache.

The class ExtractingDocumentLoader, method load.

@Override
public void load(SolrQueryRequest req, SolrQueryResponse rsp, ContentStream stream, UpdateRequestProcessor processor) throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        //Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }
    if (parser != null) {
        Metadata metadata = new Metadata();
        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }
        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }
        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }
            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;
            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly == true) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    //The MatchingContentHandler does not invoke startDocument.  See http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            }
            try {
                //potentially use a wrapper handler for parsing, but we still need the SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);
                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }
                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException)
                    log.warn(new StringBuilder("skip extracting text due to ").append(e.getLocalizedMessage()).append(". metadata=").append(metadata.toString()).toString());
                else
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
            }
            if (extractOnly == false) {
                addDoc(handler);
            } else {
                //serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Stream type of " + streamType + " didn't match any known parsers.  Please supply the " + ExtractingParams.STREAM_TYPE + " parameter.");
    }
}
Also used: Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) Metadata(org.apache.tika.metadata.Metadata) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) StringWriter(java.io.StringWriter) MediaType(org.apache.tika.mime.MediaType) SolrException(org.apache.solr.common.SolrException) DefaultParser(org.apache.tika.parser.DefaultParser) XMLSerializer(org.apache.xml.serialize.XMLSerializer) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) NamedList(org.apache.solr.common.util.NamedList) BaseMarkupSerializer(org.apache.xml.serialize.BaseMarkupSerializer) OutputFormat(org.apache.xml.serialize.OutputFormat) Parser(org.apache.tika.parser.Parser) XPathParser(org.apache.tika.sax.xpath.XPathParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TextSerializer(org.apache.xml.serialize.TextSerializer) ParseContext(org.apache.tika.parser.ParseContext)
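
The least obvious part of load() is the xpath handling: the XPathParser behind the PARSER constant compiles a restricted XPath expression against Tika's XHTML SAX output, and MatchingContentHandler forwards only the matching events to the downstream handler. Below is a hedged sketch of that wiring outside Solr; the page.html path is made up, and the expression is the body-only form commonly quoted for Solr Cell (the exact subset of XPath accepted is defined by Tika's XPathParser).

import java.io.InputStream;
import java.io.StringWriter;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.xml.sax.ContentHandler;

public class XPathRestrictedExtract {
    public static void main(String[] args) throws Exception {
        // Same role as the static PARSER used by ExtractingDocumentLoader:
        // an evaluator for a limited XPath subset over Tika's XHTML output.
        XPathParser xpathParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
        // Keep only nodes under <body>; illustrative expression, note the
        // unusual single-colon axis syntax used with Tika's XPathParser.
        Matcher matcher = xpathParser.parse("/xhtml:html/xhtml:body/descendant:node()");

        StringWriter writer = new StringWriter();
        // MatchingContentHandler drops all SAX events outside the matched subtree.
        ContentHandler handler = new MatchingContentHandler(new ToTextContentHandler(writer), matcher);

        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // page.html is a hypothetical input file.
        try (InputStream is = Files.newInputStream(Paths.get("page.html"))) {
            parser.parse(is, handler, metadata, new ParseContext());
        }
        System.out.println(writer);
    }
}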

Example 3 with Parser

Use of org.apache.tika.parser.Parser in project tika by apache.

The class TikaConfigTest, method defaultParserWithExcludes.

/**
 * TIKA-1445 It should be possible to exclude DefaultParser from
 * certain types, so another parser explicitly listed will take them
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    try {
        TikaConfig config = getConfig("TIKA-1445-default-except.xml");
        CompositeParser cp = (CompositeParser) config.getParser();
        List<Parser> parsers = cp.getAllComponentParsers();
        Parser p;
        // Will be the three parsers defined in the xml
        assertEquals(3, parsers.size());
        // Should have a wrapped DefaultParser, not the main DefaultParser,
        //  as it is excluded from handling certain classes
        p = parsers.get(0);
        assertTrue(p.toString(), p instanceof ParserDecorator);
        assertEquals(DefaultParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
        // Should have two others which claim things, which they wouldn't
        //  otherwise handle
        p = parsers.get(1);
        assertTrue(p.toString(), p instanceof ParserDecorator);
        assertEquals(EmptyParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
        assertEquals("hello/world", p.getSupportedTypes(null).iterator().next().toString());
        p = parsers.get(2);
        assertTrue(p.toString(), p instanceof ParserDecorator);
        assertEquals(ErrorParser.class, ((ParserDecorator) p).getWrappedParser().getClass());
        assertEquals("fail/world", p.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        fail("Unexpected TikaException: " + e);
    }
}
Also used: TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)
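
What the TIKA-1445-default-except.xml fixture produces through configuration is also available programmatically through ParserDecorator: withoutTypes() stops DefaultParser from claiming certain media types, and withTypes() makes another parser claim them instead. A sketch of the equivalent arrangement built in code, reusing the test's hello/world type as an illustrative custom type:

import java.util.Collections;
import java.util.Set;

import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;

public class ExcludeTypesExample {
    public static void main(String[] args) {
        MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
        Set<MediaType> special = Collections.singleton(MediaType.parse("hello/world"));

        // Wrap DefaultParser so it no longer claims the special type...
        Parser defaults = ParserDecorator.withoutTypes(new DefaultParser(), special);
        // ...and wrap another parser so it explicitly claims that type instead.
        Parser override = ParserDecorator.withTypes(EmptyParser.INSTANCE, special);

        // Since the wrapped DefaultParser no longer advertises hello/world,
        // the composite routes that type to the wrapped EmptyParser.
        CompositeParser composite = new CompositeParser(registry, defaults, override);
        System.out.println(composite.getParsers().get(MediaType.parse("hello/world")));
    }
}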

Example 4 with Parser

Use of org.apache.tika.parser.Parser in project tika by apache.

The class TikaTest, method getRecursiveMetadata.

protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
    try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
        wrapper.parse(is, new DefaultHandler(), new Metadata(), context);
    }
    return wrapper.getMetadata();
}
Also used: BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)
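
RecursiveParserWrapper returns one Metadata object per parsed document: the container first, then each embedded document found during recursion. A hedged sketch of consuming that result outside the test helper; the container.docx path is made up, and the extracted text is read from the X-TIKA:content entry that this version of the wrapper stores in each Metadata.

import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.xml.sax.helpers.DefaultHandler;

public class RecursiveMetadataDump {
    public static void main(String[] args) throws Exception {
        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(
                new AutoDetectParser(),
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
        // container.docx is a hypothetical file with embedded attachments.
        try (InputStream is = Files.newInputStream(Paths.get("container.docx"))) {
            wrapper.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
        }
        // One Metadata per document: index 0 is the container,
        // the rest are the embedded documents found during recursion.
        List<Metadata> all = wrapper.getMetadata();
        for (int i = 0; i < all.size(); i++) {
            Metadata m = all.get(i);
            System.out.println("--- document " + i + " ---");
            // The extracted text of each document is kept in its metadata
            // (the X-TIKA:content key in this version of the API).
            System.out.println(m.get(RecursiveParserWrapper.TIKA_CONTENT));
        }
    }
}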

Example 5 with Parser

Use of org.apache.tika.parser.Parser in project tika by apache.

The class ExternalParsersFactory, method attachExternalParsers.

public static void attachExternalParsers(List<ExternalParser> parsers, TikaConfig config) {
    Parser parser = config.getParser();
    if (parser instanceof CompositeParser) {
        CompositeParser cParser = (CompositeParser) parser;
        Map<MediaType, Parser> parserMap = cParser.getParsers();
    }
// TODO
}
Also used: CompositeParser(org.apache.tika.parser.CompositeParser) MediaType(org.apache.tika.mime.MediaType) Parser(org.apache.tika.parser.Parser)
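
The method body is still a TODO in the source, but the map it retrieves is useful on its own: CompositeParser.getParsers() exposes which component parser is currently registered for each media type, which is exactly what an implementation attaching external parsers would have to adjust. A small sketch that only inspects that map:

import java.util.Map;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;

public class ListRegisteredParsers {
    public static void main(String[] args) throws Exception {
        TikaConfig config = TikaConfig.getDefaultConfig();
        Parser parser = config.getParser();
        if (parser instanceof CompositeParser) {
            // Same map the factory method above retrieves: media type -> responsible parser.
            Map<MediaType, Parser> parserMap = ((CompositeParser) parser).getParsers();
            parserMap.forEach((type, p) ->
                    System.out.println(type + " -> " + p.getClass().getName()));
        }
    }
}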

Aggregations

Parser (org.apache.tika.parser.Parser): 184 usages
Metadata (org.apache.tika.metadata.Metadata): 141 usages
AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 135 usages
Test (org.junit.Test): 123 usages
InputStream (java.io.InputStream): 122 usages
ParseContext (org.apache.tika.parser.ParseContext): 116 usages
ContentHandler (org.xml.sax.ContentHandler): 110 usages
BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 104 usages
TikaTest (org.apache.tika.TikaTest): 62 usages
TikaInputStream (org.apache.tika.io.TikaInputStream): 57 usages
CompositeParser (org.apache.tika.parser.CompositeParser): 51 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 44 usages
DefaultHandler (org.xml.sax.helpers.DefaultHandler): 23 usages
EmptyParser (org.apache.tika.parser.EmptyParser): 22 usages
TikaException (org.apache.tika.exception.TikaException): 21 usages
MediaType (org.apache.tika.mime.MediaType): 20 usages
FileInputStream (java.io.FileInputStream): 18 usages
TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 18 usages
HtmlParser (org.apache.tika.parser.html.HtmlParser): 16 usages
XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 16 usages
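
The counts above point at the canonical combination in both projects: an AutoDetectParser driven with a Metadata object, a ParseContext, and a BodyContentHandler, often fed from a TikaInputStream. A minimal hedged sketch of that combination; the report.pdf path and the 1 MB write limit are illustrative choices.

import java.nio.file.Paths;

import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class CommonCombination {
    public static void main(String[] args) throws Exception {
        AutoDetectParser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // BodyContentHandler with a write limit of roughly 1 MB of characters;
        // -1 disables the limit, as in several of the tests counted above.
        BodyContentHandler handler = new BodyContentHandler(1024 * 1024);
        // TikaInputStream adds mark/reset and temporary-file support, which helps
        // type detection and parsers that need random access. report.pdf is hypothetical.
        try (TikaInputStream stream = TikaInputStream.get(Paths.get("report.pdf"))) {
            parser.parse(stream, handler, metadata, new ParseContext());
        }
        System.out.println(metadata.get(Metadata.CONTENT_TYPE));
        System.out.println(handler.toString());
    }
}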