Search in sources :

Example 6 with CompressorInputStream

use of org.apache.commons.compress.compressors.CompressorInputStream in project cloudstack by apache.

In class VhdProcessor, method checkCompressed:

/**
 * Probes whether the named file can be opened as a compressed stream.
 *
 * <p>The file is opened through a buffered stream and handed to
 * {@code CompressorStreamFactory.createCompressorInputStream}. No data is
 * read by this method itself; whatever stream was opened is closed again
 * before returning.
 *
 * @param fileName path of the file to probe
 * @return {@code true} if a compressor stream could be created for the file;
 *         {@code false} if the factory raised a {@code CompressorException}
 *         or the file could not be found (the message is logged at warn level)
 * @throws IOException if an I/O error other than a missing file occurs,
 *         including a failure while closing the opened stream
 */
private boolean checkCompressed(String fileName) throws IOException {
    BufferedInputStream buffered = null;
    CompressorInputStream compressed = null;
    boolean recognised;
    try {
        buffered = new BufferedInputStream(new FileInputStream(fileName));
        compressed = new CompressorStreamFactory().createCompressorInputStream(buffered);
        recognised = true;
    } catch (CompressorException e) {
        // The factory could not create a compressor stream for this content.
        s_logger.warn(e.getMessage());
        recognised = false;
    } catch (FileNotFoundException e) {
        s_logger.warn(e.getMessage());
        recognised = false;
    } finally {
        // Close only the outermost stream that was actually created.
        if (compressed != null) {
            compressed.close();
        } else if (buffered != null) {
            buffered.close();
        }
    }
    return recognised;
}
Also used : BufferedInputStream(java.io.BufferedInputStream) CompressorException(org.apache.commons.compress.compressors.CompressorException) FileNotFoundException(java.io.FileNotFoundException) CompressorStreamFactory(org.apache.commons.compress.compressors.CompressorStreamFactory) CompressorInputStream(org.apache.commons.compress.compressors.CompressorInputStream) FileInputStream(java.io.FileInputStream)

Example 7 with CompressorInputStream

use of org.apache.commons.compress.compressors.CompressorInputStream in project tika by apache.

In class CompressorParser, method parse:

/**
 * Decompresses the given stream and delegates the uncompressed content to
 * the embedded document extractor obtained from the parse context.
 *
 * <p>The detected media type is recorded in {@code metadata} unless it is
 * {@code MediaType.OCTET_STREAM}. If {@code metadata} carries a resource
 * name, a name with the compression suffix removed is passed on to the
 * embedded document.
 *
 * @param stream   the compressed document stream; it is wrapped in a close
 *                 shield before the compressor stream is closed
 * @param handler  receiver of the XHTML SAX events
 * @param metadata document metadata; read for the resource name and updated
 *                 with the content type
 * @param context  parse context supplying the optional
 *                 {@code CompressorParserOptions} and the embedded extractor
 * @throws IOException   if reading or closing the stream fails
 * @throws SAXException  if emitting SAX events fails
 * @throws TikaException if the stream cannot be uncompressed; a
 *                       {@code TikaMemoryLimitException} is thrown when the
 *                       failure was caused by a {@code MemoryLimitException}
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    // The compressor stream is closed at the end of this method, but the
    // caller's underlying document stream should not be closed, hence the
    // close shield around it.
    final boolean markable = stream.markSupported();
    InputStream shielded = new CloseShieldInputStream(stream);
    if (!markable) {
        // Ensure that the stream supports the mark feature
        shielded = new BufferedInputStream(shielded);
    }

    CompressorInputStream decompressed;
    try {
        // Fallback options used when the context supplies none.
        CompressorParserOptions fallback = new CompressorParserOptions() {

            @Override
            public boolean decompressConcatenated(Metadata metadata) {
                return false;
            }
        };
        CompressorParserOptions options = context.get(CompressorParserOptions.class, fallback);
        boolean concatenated = options.decompressConcatenated(metadata);
        CompressorStreamFactory factory = new CompressorStreamFactory(concatenated, memoryLimitInKb);
        decompressed = factory.createCompressorInputStream(shielded);
    } catch (CompressorException e) {
        // instanceof is false for a null cause, so no separate null check is needed.
        if (e.getCause() instanceof MemoryLimitException) {
            throw new TikaMemoryLimitException(e.getMessage());
        }
        throw new TikaException("Unable to uncompress document stream", e);
    }

    MediaType detected = getMediaType(decompressed);
    if (!detected.equals(MediaType.OCTET_STREAM)) {
        metadata.set(CONTENT_TYPE, detected.toString());
    }

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    try {
        Metadata entrydata = new Metadata();
        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
        if (resourceName != null) {
            final int length = resourceName.length();
            String entryName = null;
            if (resourceName.endsWith(".tbz")) {
                entryName = resourceName.substring(0, length - 4) + ".tar";
            } else if (resourceName.endsWith(".tbz2")) {
                entryName = resourceName.substring(0, length - 5) + ".tar";
            } else {
                // Suffixes that are simply dropped from the end of the name.
                String[] plainSuffixes = { ".bz", ".bz2", ".xz", ".zlib", ".pack" };
                for (String suffix : plainSuffixes) {
                    if (resourceName.endsWith(suffix)) {
                        entryName = resourceName.substring(0, length - suffix.length());
                        break;
                    }
                }
            }
            if (entryName == null) {
                // No known suffix matched: defer to GzipUtils for non-empty
                // names and keep an empty name unchanged.
                entryName = length > 0 ? GzipUtils.getUncompressedFilename(resourceName) : resourceName;
            }
            entrydata.set(Metadata.RESOURCE_NAME_KEY, entryName);
        }

        // Use the delegate parser to parse the compressed document
        EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
        if (extractor.shouldParseEmbedded(entrydata)) {
            extractor.parseEmbedded(decompressed, xhtml, entrydata, true);
        }
    } finally {
        decompressed.close();
    }
    xhtml.endDocument();
}
Also used : TikaException(org.apache.tika.exception.TikaException) EmbeddedDocumentExtractor(org.apache.tika.extractor.EmbeddedDocumentExtractor) Metadata(org.apache.tika.metadata.Metadata) CompressorStreamFactory(org.apache.commons.compress.compressors.CompressorStreamFactory) CompressorInputStream(org.apache.commons.compress.compressors.CompressorInputStream) SnappyCompressorInputStream(org.apache.commons.compress.compressors.snappy.SnappyCompressorInputStream) XZCompressorInputStream(org.apache.commons.compress.compressors.xz.XZCompressorInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) DeflateCompressorInputStream(org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream) LZMACompressorInputStream(org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream) FramedSnappyCompressorInputStream(org.apache.commons.compress.compressors.snappy.FramedSnappyCompressorInputStream) ZCompressorInputStream(org.apache.commons.compress.compressors.z.ZCompressorInputStream) Pack200CompressorInputStream(org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) MemoryLimitException(org.apache.commons.compress.MemoryLimitException) TikaMemoryLimitException(org.apache.tika.exception.TikaMemoryLimitException) BufferedInputStream(java.io.BufferedInputStream) CompressorException(org.apache.commons.compress.compressors.CompressorException) TikaMemoryLimitException(org.apache.tika.exception.TikaMemoryLimitException) MediaType(org.apache.tika.mime.MediaType) CloseShieldInputStream(org.apache.commons.io.input.CloseShieldInputStream)

Aggregations

CompressorInputStream (org.apache.commons.compress.compressors.CompressorInputStream)7 CompressorStreamFactory (org.apache.commons.compress.compressors.CompressorStreamFactory)5 BufferedInputStream (java.io.BufferedInputStream)4 FileInputStream (java.io.FileInputStream)3 CompressorException (org.apache.commons.compress.compressors.CompressorException)3 BZip2CompressorInputStream (org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream)3 GzipCompressorInputStream (org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream)3 IOException (java.io.IOException)2 Pack200CompressorInputStream (org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream)2 XZCompressorInputStream (org.apache.commons.compress.compressors.xz.XZCompressorInputStream)2 MediaType (org.apache.tika.mime.MediaType)2 Configuration (com.alibaba.datax.common.util.Configuration)1 SnappyCodec (io.airlift.compress.snappy.SnappyCodec)1 SnappyFramedInputStream (io.airlift.compress.snappy.SnappyFramedInputStream)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 File (java.io.File)1 FileNotFoundException (java.io.FileNotFoundException)1 Path (java.nio.file.Path)1 MemoryLimitException (org.apache.commons.compress.MemoryLimitException)1 ArchiveInputStream (org.apache.commons.compress.archivers.ArchiveInputStream)1