Search in sources:

Example 21 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class DIFParser method parse.

/**
 * Parses the stream as XML using the SAX parser supplied by the parse context and
 * wraps the emitted content in a single XHTML {@code <p>} element.
 * <p>
 * The paragraph and the document are closed in a {@code finally} block, so the end
 * events are emitted even when parsing fails.
 *
 * @param stream   the input to parse; it is shielded so the SAX parser cannot close it
 * @param handler  the handler that receives the XHTML events
 * @param metadata the metadata passed to the XHTML handler and to {@code getContentHandler}
 * @param context  the parse context that supplies the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if emitting events fails, or if the tagged handler rethrows a parse failure
 * @throws TikaException wrapping any other {@link SAXException} raised while parsing
 */
@Override
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("p");
    TaggedContentHandler taggedHandler = new TaggedContentHandler(handler);
    try {
        // The parser is obtained before its arguments are built, as in a single chained call.
        javax.xml.parsers.SAXParser saxParser = context.getSAXParser();
        // Shield the caller's stream so that the SAX parser cannot close it.
        InputStream shielded = new CloseShieldInputStream(stream);
        OfflineContentHandler sink = new OfflineContentHandler(new EmbeddedContentHandler(getContentHandler(taggedHandler, metadata, context)));
        saxParser.parse(shielded, sink);
    } catch (SAXException parseFailure) {
        // Give the tagged handler the chance to rethrow the failure itself;
        // anything it does not rethrow is reported as a TikaException.
        taggedHandler.throwIfCauseOf(parseFailure);
        throw new TikaException("XML parse error", parseFailure);
    } finally {
        xhtml.endElement("p");
        xhtml.endDocument();
    }
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) TikaException(org.apache.tika.exception.TikaException) TaggedContentHandler(org.apache.tika.sax.TaggedContentHandler) EmbeddedContentHandler(org.apache.tika.sax.EmbeddedContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) SAXException(org.xml.sax.SAXException)

Example 22 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class EnviHeaderParser method parse.

/**
 * Records the ENVI header MIME type and the detected character encoding in the
 * metadata, then emits every line of the stream as its own XHTML {@code <p>} element.
 *
 * @param stream   the header text to read; it is shielded so closing the reader
 *                 does not close the caller's stream
 * @param handler  the handler that receives the XHTML events
 * @param metadata receives {@code CONTENT_TYPE} and {@code CONTENT_ENCODING}
 * @param context  the parse context passed to {@code getEncodingDetector}
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if emitting the XHTML events fails
 * @throws TikaException declared by the signature; presumably raised by encoding
 *                       detection (confirm against {@code AutoDetectReader})
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // Only outputting the MIME type as metadata
    metadata.set(Metadata.CONTENT_TYPE, ENVI_MIME_TYPE);
    // Automatically detect the character encoding (approach taken from the TXTParser).
    // The reader is closed by try-with-resources; the shield keeps the caller's stream open.
    try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream), metadata, getEncodingDetector(context))) {
        Charset charset = reader.getCharset();
        // deprecated, see TIKA-431
        metadata.set(Metadata.CONTENT_ENCODING, charset.name());
        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
        xhtml.startDocument();
        // text contents of the xhtml: one paragraph per input line
        String line;
        while ((line = reader.readLine()) != null) {
            xhtml.startElement("p");
            xhtml.characters(line);
            xhtml.endElement("p");
        }
        xhtml.endDocument();
    }
}
Also used : TikaConfig(org.apache.tika.config.TikaConfig) AutoDetectReader(org.apache.tika.detect.AutoDetectReader) Charset(java.nio.charset.Charset) MediaType(org.apache.tika.mime.MediaType) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 23 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class EpubContentParser method parse.

/**
 * Feeds the stream to the SAX parser supplied by the parse context, with the events
 * going to the given handler wrapped in an {@code OfflineContentHandler}.
 *
 * @param stream   the XML input; it is shielded so the SAX parser cannot close it
 * @param handler  the handler that receives the parsed events
 * @param metadata not used by this method
 * @param context  the parse context that supplies the SAX parser
 * @throws IOException   if reading the stream fails
 * @throws SAXException  if the SAX parser reports an error
 * @throws TikaException declared by the signature; presumably raised when no SAX
 *                       parser can be obtained (confirm against {@code ParseContext})
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    context.getSAXParser().parse(new CloseShieldInputStream(stream), new OfflineContentHandler(handler));
}
Also used : OfflineContentHandler(org.apache.tika.sax.OfflineContentHandler) SAXParser(javax.xml.parsers.SAXParser) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 24 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class TikaCLI method process.

/**
 * Handles a single command-line argument.
 * <p>
 * A recognised option either updates the CLI's settings (output type, encoding,
 * password, parser, extraction directory, ...) or runs an informational action
 * (usage, version, listings, configuration dump). An argument that matches no
 * option is treated as the input to process: an integer for {@code TikaServer}
 * when server mode was requested, {@code -} for standard input, and otherwise an
 * existing file path or a URL.
 * <p>
 * The branches are tested in order and several are prefix matches ({@code -e},
 * {@code -p}, {@code -c}), so their relative order is significant.
 *
 * @param arg the command-line argument to handle
 * @throws Exception if the action or the input processing triggered by the argument fails
 */
public void process(String arg) throws Exception {
    if (arg.equals("-?") || arg.equals("--help")) {
        pipeMode = false;
        usage();
    } else if (arg.equals("-V") || arg.equals("--version")) {
        pipeMode = false;
        version();
    } else if (arg.equals("-v") || arg.equals("--verbose")) {
        org.apache.log4j.Logger.getRootLogger().setLevel(Level.DEBUG);
    } else if (arg.equals("-g") || arg.equals("--gui")) {
        pipeMode = false;
        // Pass the configuration file path on to the GUI when one has been set.
        if (configFilePath != null) {
            TikaGUI.main(new String[] { configFilePath });
        } else {
            TikaGUI.main(new String[0]);
        }
    } else if (arg.equals("--list-parser") || arg.equals("--list-parsers")) {
        pipeMode = false;
        displayParsers(false, false);
    } else if (arg.equals("--list-detector") || arg.equals("--list-detectors")) {
        pipeMode = false;
        displayDetectors();
    } else if (arg.equals("--list-parser-detail") || arg.equals("--list-parser-details")) {
        pipeMode = false;
        displayParsers(true, false);
    } else if (arg.equals("--list-parser-detail-apt") || arg.equals("--list-parser-details-apt")) {
        pipeMode = false;
        displayParsers(true, true);
    } else if (arg.equals("--list-met-models")) {
        pipeMode = false;
        displayMetModels();
    } else if (arg.equals("--list-supported-types")) {
        pipeMode = false;
        displaySupportedTypes();
    } else if (arg.startsWith("--compare-file-magic=")) {
        pipeMode = false;
        // Everything after the first '=' is passed on as the value.
        compareFileMagic(arg.substring(arg.indexOf('=') + 1));
    } else if (arg.equals("--dump-minimal-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.MINIMAL);
    } else if (arg.equals("--dump-current-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.CURRENT);
    } else if (arg.equals("--dump-static-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.STATIC);
    } else if (arg.equals("--dump-static-full-config")) {
        pipeMode = false;
        dumpConfig(TikaConfigSerializer.Mode.STATIC_FULL);
    } else if (arg.equals("--container-aware") || arg.equals("--container-aware-detector")) {
    // ignore, as container-aware detectors are now always used
    } else if (arg.equals("-f") || arg.equals("--fork")) {
        fork = true;
    } else if (arg.startsWith("--config=")) {
        configure(arg.substring("--config=".length()));
    } else if (arg.startsWith("--digest=")) {
        CommonsDigester.DigestAlgorithm[] algos = CommonsDigester.parse(arg.substring("--digest=".length()));
        digester = new CommonsDigester(MAX_MARK, algos);
        // Wrap the current parser so that the digester is applied as well.
        parser = new DigestingParser(parser, digester);
    } else if (arg.startsWith("-e")) {
        // Short form: the value follows the flag directly, e.g. "-eUTF-8".
        encoding = arg.substring("-e".length());
    } else if (arg.startsWith("--encoding=")) {
        encoding = arg.substring("--encoding=".length());
    } else if (arg.startsWith("-p") && !arg.equals("-p")) {
        // "-p<value>" sets the password; a bare "-p" is left for the server-mode branch below.
        password = arg.substring("-p".length());
    } else if (arg.startsWith("--password=")) {
        password = arg.substring("--password=".length());
    } else if (arg.equals("-j") || arg.equals("--json")) {
        type = JSON;
    } else if (arg.equals("-J") || arg.equals("--jsonRecursive")) {
        recursiveJSON = true;
    } else if (arg.equals("-y") || arg.equals("--xmp")) {
        type = XMP;
    } else if (arg.equals("-x") || arg.equals("--xml")) {
        type = XML;
    } else if (arg.equals("-h") || arg.equals("--html")) {
        type = HTML;
    } else if (arg.equals("-t") || arg.equals("--text")) {
        type = TEXT;
    } else if (arg.equals("-T") || arg.equals("--text-main")) {
        type = TEXT_MAIN;
    } else if (arg.equals("-m") || arg.equals("--metadata")) {
        type = METADATA;
    } else if (arg.equals("-l") || arg.equals("--language")) {
        type = LANGUAGE;
    } else if (arg.equals("-d") || arg.equals("--detect")) {
        type = DETECT;
    } else if (arg.startsWith("--extract-dir=")) {
        extractDir = new File(arg.substring("--extract-dir=".length()));
    } else if (arg.equals("-z") || arg.equals("--extract")) {
        type = NO_OUTPUT;
        context.set(EmbeddedDocumentExtractor.class, new FileEmbeddedDocumentExtractor());
    } else if (arg.equals("-r") || arg.equals("--pretty-print")) {
        prettyPrint = true;
    } else if (arg.equals("-p") || arg.equals("--port") || arg.equals("-s") || arg.equals("--server")) {
        // Only records server mode; the server is started when the next non-option argument arrives.
        serverMode = true;
        pipeMode = false;
    } else if (arg.startsWith("-c")) {
        // Short form: the URI follows the flag directly.
        URI uri = new URI(arg.substring("-c".length()));
        parser = new NetworkParser(uri);
    } else if (arg.startsWith("--client=")) {
        URI uri = new URI(arg.substring("--client=".length()));
        parser = new NetworkParser(uri);
    } else {
        // No option matched: treat the argument as the input to process.
        pipeMode = false;
        if (serverMode) {
            // The argument is parsed as an integer (presumably the port) and the server is started.
            new TikaServer(Integer.parseInt(arg)).start();
        } else if (arg.equals("-")) {
            // "-" reads from standard input; System.in is shielded so closing the wrapper leaves it open.
            try (InputStream stream = TikaInputStream.get(new CloseShieldInputStream(System.in))) {
                type.process(stream, System.out, new Metadata());
            }
        } else {
            // An existing regular file becomes a file URL; anything else must parse as a URL.
            URL url;
            File file = new File(arg);
            if (file.isFile()) {
                url = file.toURI().toURL();
            } else {
                url = new URL(arg);
            }
            if (recursiveJSON) {
                handleRecursiveJson(url, System.out);
            } else {
                Metadata metadata = new Metadata();
                try (InputStream input = TikaInputStream.get(url, metadata)) {
                    type.process(input, System.out, metadata);
                } finally {
                    // Flush standard output whether or not processing succeeded.
                    System.out.flush();
                }
            }
        }
    }
}
Also used : CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) DocumentInputStream(org.apache.poi.poifs.filesystem.DocumentInputStream) InputStream(java.io.InputStream) JsonMetadata(org.apache.tika.metadata.serialization.JsonMetadata) Metadata(org.apache.tika.metadata.Metadata) XMPMetadata(org.apache.tika.xmp.XMPMetadata) DigestingParser(org.apache.tika.parser.DigestingParser) URI(java.net.URI) NetworkParser(org.apache.tika.parser.NetworkParser) URL(java.net.URL) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) File(java.io.File) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Example 25 with CloseShieldInputStream

use of org.apache.commons.io.input.CloseShieldInputStream in project tika by apache.

the class OfficeParser method parse.

/**
 * Extracts properties and text from an MS Document input stream.
 * <p>
 * The root directory comes from the container already attached to a
 * {@code TikaInputStream} when there is one; otherwise a new file system is
 * opened. A file system opened for a plain (non-Tika) stream is closed before
 * this method returns.
 *
 * @param stream   the document to parse
 * @param handler  the handler that receives the XHTML events
 * @param metadata the metadata passed to the XHTML handler and to the root-level parse
 * @param context  the parse context; read for the {@code OfficeParserConfig}
 * @throws IOException   if the document cannot be read
 * @throws SAXException  if emitting the XHTML events fails
 * @throws TikaException if the document cannot be parsed
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    configure(context);
    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    TikaInputStream tikaStream = TikaInputStream.cast(stream);
    // Non-null only when the file system is opened here for a plain stream;
    // that instance is closed in the finally block.
    NPOIFSFileSystem ownedFileSystem = null;
    try {
        final DirectoryNode root;
        if (tikaStream != null) {
            final Object openContainer = tikaStream.getOpenContainer();
            if (openContainer instanceof NPOIFSFileSystem) {
                root = ((NPOIFSFileSystem) openContainer).getRoot();
            } else if (openContainer instanceof DirectoryNode) {
                root = (DirectoryNode) openContainer;
            } else {
                // Open from the backing file when there is one, otherwise from the shielded stream.
                NPOIFSFileSystem attachedFileSystem = tikaStream.hasFile()
                        ? new NPOIFSFileSystem(tikaStream.getFile(), true)
                        : new NPOIFSFileSystem(new CloseShieldInputStream(tikaStream));
                // Once attached as the open container, the stream closes the file system,
                // so it is not closed in the finally block.
                tikaStream.setOpenContainer(attachedFileSystem);
                root = attachedFileSystem.getRoot();
            }
        } else {
            ownedFileSystem = new NPOIFSFileSystem(new CloseShieldInputStream(stream));
            root = ownedFileSystem.getRoot();
        }
        parse(root, context, metadata, xhtml);
        if (context.get(OfficeParserConfig.class).getExtractMacros()) {
            // now try to get macros
            extractMacros(root.getNFileSystem(), xhtml, EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context));
        }
    } finally {
        IOUtils.closeQuietly(ownedFileSystem);
    }
    xhtml.endDocument();
}
Also used : NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) TikaInputStream(org.apache.tika.io.TikaInputStream) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 28, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13, TikaException (org.apache.tika.exception.TikaException): 12, OfflineContentHandler (org.apache.tika.sax.OfflineContentHandler): 8, InputStream (java.io.InputStream): 7, TikaInputStream (org.apache.tika.io.TikaInputStream): 7, AutoDetectReader (org.apache.tika.detect.AutoDetectReader): 6, MediaType (org.apache.tika.mime.MediaType): 6, EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler): 5, Charset (java.nio.charset.Charset): 4, TikaConfig (org.apache.tika.config.TikaConfig): 4, SAXException (org.xml.sax.SAXException): 4, BufferedInputStream (java.io.BufferedInputStream): 3, EmbeddedDocumentExtractor (org.apache.tika.extractor.EmbeddedDocumentExtractor): 3, Metadata (org.apache.tika.metadata.Metadata): 3, TaggedContentHandler (org.apache.tika.sax.TaggedContentHandler): 3, InputSource (org.xml.sax.InputSource): 3, IOException (java.io.IOException): 2, SAXParser (javax.xml.parsers.SAXParser): 2, ZipArchiveEntry (org.apache.commons.compress.archivers.zip.ZipArchiveEntry): 2