
Example 31 with GzipCompressorInputStream

Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project apex-malhar by apache.

From the class HdfsTestSource, method start().

@Override
public void start() {
    super.start();
    emitTimer = new Timer();
    final ChannelProcessor channelProcessor = getChannelProcessor();
    emitTimer.scheduleAtFixedRate(new TimerTask() {

        @Override
        public void run() {
            int lineCount = 0;
            events.clear();
            try {
                while (lineCount < rate && !finished) {
                    String line = br.readLine();
                    if (line == null) {
                        logger.debug("completed file {}", currentFile);
                        br.close();
                        currentFile++;
                        if (currentFile == dataFiles.size()) {
                            logger.info("finished all files");
                            finished = true;
                            break;
                        }
                        Path filePath = new Path(dataFiles.get(currentFile));
                        br = new BufferedReader(new InputStreamReader(new GzipCompressorInputStream(fs.open(filePath))));
                        logger.info("opening file {}. {}", currentFile, filePath);
                        continue;
                    }
                    lineCount++;
                    Event flumeEvent = EventBuilder.withBody(line.getBytes());
                    events.add(flumeEvent);
                }
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
            if (events.size() > 0) {
                channelProcessor.processEventBatch(events);
            }
            if (finished) {
                emitTimer.cancel();
            }
        }
    }, 0, 1000);
}
Also used : Path(org.apache.hadoop.fs.Path) GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) Timer(java.util.Timer) TimerTask(java.util.TimerTask) InputStreamReader(java.io.InputStreamReader) BufferedReader(java.io.BufferedReader) Event(org.apache.flume.Event) IOException(java.io.IOException) ChannelProcessor(org.apache.flume.channel.ChannelProcessor)
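
For reference, a minimal standalone sketch of the same pattern — reading a gzip-compressed text file line by line through GzipCompressorInputStream — assuming a hypothetical local file path instead of the HDFS and Flume plumbing used above:

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class GzipLineReader {
    public static void main(String[] args) throws IOException {
        // Hypothetical local path; the example above reads from HDFS via fs.open(filePath)
        String path = "data/events.gz";
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new GzipCompressorInputStream(new FileInputStream(path)), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                // The Flume source wraps each line in an Event and batches them instead
                System.out.println(line);
            }
        }
    }
}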

Example 32 with GzipCompressorInputStream

Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project deeplearning4j by deeplearning4j.

From the class ArchiveUtils, method unzipFileTo().

/**
     * Extracts the given archive (.zip, .tar.gz, .tgz, or .gz) to the specified destination directory
     * @param file the path of the archive to extract
     * @param dest the destination directory
     * @throws IOException
     */
public static void unzipFileTo(String file, String dest) throws IOException {
    File target = new File(file);
    if (!target.exists())
        throw new IllegalArgumentException("Archive doesnt exist");
    FileInputStream fin = new FileInputStream(target);
    int BUFFER = 2048;
    byte[] data = new byte[BUFFER];
    if (file.endsWith(".zip")) {
        // get the zip file content
        ZipInputStream zis = new ZipInputStream(fin);
        // get the zipped file list entry
        ZipEntry ze = zis.getNextEntry();
        while (ze != null) {
            String fileName = ze.getName();
            File newFile = new File(dest + File.separator + fileName);
            log.info("file unzip : " + newFile.getAbsoluteFile());
            // create all non-existent parent folders,
            // otherwise you will hit a FileNotFoundException for a compressed folder
            new File(newFile.getParent()).mkdirs();
            FileOutputStream fos = new FileOutputStream(newFile);
            int len;
            while ((len = zis.read(data)) > 0) {
                fos.write(data, 0, len);
            }
            fos.close();
            ze = zis.getNextEntry();
        }
        zis.closeEntry();
        zis.close();
    } else if (file.endsWith(".tar.gz") || file.endsWith(".tgz")) {
        BufferedInputStream in = new BufferedInputStream(fin);
        GzipCompressorInputStream gzIn = new GzipCompressorInputStream(in);
        TarArchiveInputStream tarIn = new TarArchiveInputStream(gzIn);
        TarArchiveEntry entry = null;
        while ((entry = (TarArchiveEntry) tarIn.getNextEntry()) != null) {
            log.info("Extracting: " + entry.getName());
            if (entry.isDirectory()) {
                File f = new File(dest + File.separator + entry.getName());
                f.mkdirs();
            } else /**
                 * If the entry is a file, write the decompressed file to disk
                 * and close the destination stream.
                 **/
            {
                int count;
                FileOutputStream fos = new FileOutputStream(dest + File.separator + entry.getName());
                BufferedOutputStream destStream = new BufferedOutputStream(fos, BUFFER);
                while ((count = tarIn.read(data, 0, BUFFER)) != -1) {
                    destStream.write(data, 0, count);
                }
                destStream.flush();
                IOUtils.closeQuietly(destStream);
            }
        }
        /** Close the input stream **/
        tarIn.close();
    } else if (file.endsWith(".gz")) {
        GZIPInputStream is2 = new GZIPInputStream(fin);
        File extracted = new File(target.getParent(), target.getName().replace(".gz", ""));
        if (extracted.exists())
            extracted.delete();
        extracted.createNewFile();
        OutputStream fos = FileUtils.openOutputStream(extracted);
        IOUtils.copyLarge(is2, fos);
        is2.close();
        fos.flush();
        fos.close();
    }
    target.delete();
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) ZipEntry(java.util.zip.ZipEntry) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) GZIPInputStream(java.util.zip.GZIPInputStream) ZipInputStream(java.util.zip.ZipInputStream)
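
A minimal sketch of the .tar.gz branch above, rewritten with try-with-resources and a basic path check; the extractor class name and destination handling are illustrative assumptions, not the deeplearning4j implementation:

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class TarGzExtractor {
    public static void extract(File archive, File destDir) throws IOException {
        try (TarArchiveInputStream tarIn = new TarArchiveInputStream(
                new GzipCompressorInputStream(new BufferedInputStream(new FileInputStream(archive))))) {
            TarArchiveEntry entry;
            while ((entry = tarIn.getNextTarEntry()) != null) {
                File out = new File(destDir, entry.getName());
                // guard against entries that would escape the destination directory
                if (!out.getCanonicalPath().startsWith(destDir.getCanonicalPath() + File.separator)) {
                    throw new IOException("Entry outside destination: " + entry.getName());
                }
                if (entry.isDirectory()) {
                    out.mkdirs();
                } else {
                    out.getParentFile().mkdirs();
                    // copies only the current entry; the tar stream signals EOF at the entry boundary
                    Files.copy(tarIn, out.toPath(), StandardCopyOption.REPLACE_EXISTING);
                }
            }
        }
    }
}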

Example 33 with GzipCompressorInputStream

Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project neo4j by neo4j.

From the class Loader, method openArchiveIn().

private static ArchiveInputStream openArchiveIn(Path archive) throws IOException, IncorrectFormat {
    InputStream input = Files.newInputStream(archive);
    GzipCompressorInputStream compressor;
    try {
        compressor = new GzipCompressorInputStream(input);
    } catch (IOException e) {
        input.close();
        throw new IncorrectFormat(archive, e);
    }
    return new TarArchiveInputStream(compressor);
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) ArchiveInputStream(org.apache.commons.compress.archivers.ArchiveInputStream) InputStream(java.io.InputStream) IOException(java.io.IOException)
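
A hedged sketch of how a helper with this shape might be consumed — listing the entries of a gzipped tar archive. The file name and the simplified error handling (plain IOException rather than neo4j's IncorrectFormat) are assumptions:

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.commons.compress.archivers.ArchiveEntry;
import org.apache.commons.compress.archivers.ArchiveInputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class DumpLister {
    // Same shape as Loader.openArchiveIn, minus the IncorrectFormat wrapper
    static ArchiveInputStream openArchiveIn(Path archive) throws IOException {
        InputStream input = Files.newInputStream(archive);
        try {
            return new TarArchiveInputStream(new GzipCompressorInputStream(input));
        } catch (IOException e) {
            input.close(); // don't leak the file handle if the gzip header is invalid
            throw e;
        }
    }

    public static void main(String[] args) throws IOException {
        try (ArchiveInputStream in = openArchiveIn(Paths.get("archive.dump"))) { // hypothetical path
            ArchiveEntry entry;
            while ((entry = in.getNextEntry()) != null) {
                System.out.println(entry.getName() + " (" + entry.getSize() + " bytes)");
            }
        }
    }
}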

Example 34 with GzipCompressorInputStream

Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project nifi by apache.

From the class CompressContent, method onTrigger().

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final ComponentLog logger = getLogger();
    final long sizeBeforeCompression = flowFile.getSize();
    final String compressionMode = context.getProperty(MODE).getValue();
    String compressionFormatValue = context.getProperty(COMPRESSION_FORMAT).getValue();
    if (compressionFormatValue.equals(COMPRESSION_FORMAT_ATTRIBUTE)) {
        final String mimeType = flowFile.getAttribute(CoreAttributes.MIME_TYPE.key());
        if (mimeType == null) {
            logger.error("No {} attribute exists for {}; routing to failure", new Object[] { CoreAttributes.MIME_TYPE.key(), flowFile });
            session.transfer(flowFile, REL_FAILURE);
            return;
        }
        compressionFormatValue = compressionFormatMimeTypeMap.get(mimeType);
        if (compressionFormatValue == null) {
            logger.info("Mime Type of {} is '{}', which does not indicate a supported Compression Format; routing to success without decompressing", new Object[] { flowFile, mimeType });
            session.transfer(flowFile, REL_SUCCESS);
            return;
        }
    }
    final String compressionFormat = compressionFormatValue;
    final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null);
    final StopWatch stopWatch = new StopWatch(true);
    final String fileExtension;
    switch(compressionFormat.toLowerCase()) {
        case COMPRESSION_FORMAT_GZIP:
            fileExtension = ".gz";
            break;
        case COMPRESSION_FORMAT_LZMA:
            fileExtension = ".lzma";
            break;
        case COMPRESSION_FORMAT_XZ_LZMA2:
            fileExtension = ".xz";
            break;
        case COMPRESSION_FORMAT_BZIP2:
            fileExtension = ".bz2";
            break;
        case COMPRESSION_FORMAT_SNAPPY:
            fileExtension = ".snappy";
            break;
        case COMPRESSION_FORMAT_SNAPPY_FRAMED:
            fileExtension = ".sz";
            break;
        default:
            fileExtension = "";
            break;
    }
    try {
        flowFile = session.write(flowFile, new StreamCallback() {

            @Override
            public void process(final InputStream rawIn, final OutputStream rawOut) throws IOException {
                final OutputStream compressionOut;
                final InputStream compressionIn;
                final OutputStream bufferedOut = new BufferedOutputStream(rawOut, 65536);
                final InputStream bufferedIn = new BufferedInputStream(rawIn, 65536);
                try {
                    if (MODE_COMPRESS.equalsIgnoreCase(compressionMode)) {
                        compressionIn = bufferedIn;
                        switch(compressionFormat.toLowerCase()) {
                            case COMPRESSION_FORMAT_GZIP:
                                final int compressionLevel = context.getProperty(COMPRESSION_LEVEL).asInteger();
                                compressionOut = new GZIPOutputStream(bufferedOut, compressionLevel);
                                mimeTypeRef.set("application/gzip");
                                break;
                            case COMPRESSION_FORMAT_LZMA:
                                compressionOut = new LzmaOutputStream.Builder(bufferedOut).build();
                                mimeTypeRef.set("application/x-lzma");
                                break;
                            case COMPRESSION_FORMAT_XZ_LZMA2:
                                compressionOut = new XZOutputStream(bufferedOut, new LZMA2Options());
                                mimeTypeRef.set("application/x-xz");
                                break;
                            case COMPRESSION_FORMAT_SNAPPY:
                                compressionOut = new SnappyOutputStream(bufferedOut);
                                mimeTypeRef.set("application/x-snappy");
                                break;
                            case COMPRESSION_FORMAT_SNAPPY_FRAMED:
                                compressionOut = new SnappyFramedOutputStream(bufferedOut);
                                mimeTypeRef.set("application/x-snappy-framed");
                                break;
                            case COMPRESSION_FORMAT_BZIP2:
                            default:
                                mimeTypeRef.set("application/x-bzip2");
                                compressionOut = new CompressorStreamFactory().createCompressorOutputStream(compressionFormat.toLowerCase(), bufferedOut);
                                break;
                        }
                    } else {
                        compressionOut = bufferedOut;
                        switch(compressionFormat.toLowerCase()) {
                            case COMPRESSION_FORMAT_LZMA:
                                compressionIn = new LzmaInputStream(bufferedIn, new Decoder());
                                break;
                            case COMPRESSION_FORMAT_XZ_LZMA2:
                                compressionIn = new XZInputStream(bufferedIn);
                                break;
                            case COMPRESSION_FORMAT_BZIP2:
                                // need this two-arg constructor to support concatenated streams
                                compressionIn = new BZip2CompressorInputStream(bufferedIn, true);
                                break;
                            case COMPRESSION_FORMAT_GZIP:
                                compressionIn = new GzipCompressorInputStream(bufferedIn, true);
                                break;
                            case COMPRESSION_FORMAT_SNAPPY:
                                compressionIn = new SnappyInputStream(bufferedIn);
                                break;
                            case COMPRESSION_FORMAT_SNAPPY_FRAMED:
                                compressionIn = new SnappyFramedInputStream(bufferedIn);
                                break;
                            default:
                                compressionIn = new CompressorStreamFactory().createCompressorInputStream(compressionFormat.toLowerCase(), bufferedIn);
                        }
                    }
                } catch (final Exception e) {
                    closeQuietly(bufferedOut);
                    throw new IOException(e);
                }
                try (final InputStream in = compressionIn;
                    final OutputStream out = compressionOut) {
                    final byte[] buffer = new byte[8192];
                    int len;
                    while ((len = in.read(buffer)) > 0) {
                        out.write(buffer, 0, len);
                    }
                    out.flush();
                }
            }
        });
        stopWatch.stop();
        final long sizeAfterCompression = flowFile.getSize();
        if (MODE_DECOMPRESS.equalsIgnoreCase(compressionMode)) {
            flowFile = session.removeAttribute(flowFile, CoreAttributes.MIME_TYPE.key());
            if (context.getProperty(UPDATE_FILENAME).asBoolean()) {
                final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
                if (filename.toLowerCase().endsWith(fileExtension)) {
                    flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), filename.substring(0, filename.length() - fileExtension.length()));
                }
            }
        } else {
            flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeTypeRef.get());
            if (context.getProperty(UPDATE_FILENAME).asBoolean()) {
                final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
                flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), filename + fileExtension);
            }
        }
        logger.info("Successfully {}ed {} using {} compression format; size changed from {} to {} bytes", new Object[] { compressionMode.toLowerCase(), flowFile, compressionFormat, sizeBeforeCompression, sizeAfterCompression });
        session.getProvenanceReporter().modifyContent(flowFile, stopWatch.getDuration(TimeUnit.MILLISECONDS));
        session.transfer(flowFile, REL_SUCCESS);
    } catch (final ProcessException e) {
        logger.error("Unable to {} {} using {} compression format due to {}; routing to failure", new Object[] { compressionMode.toLowerCase(), flowFile, compressionFormat, e });
        session.transfer(flowFile, REL_FAILURE);
    }
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) SnappyFramedOutputStream(org.xerial.snappy.SnappyFramedOutputStream) BufferedOutputStream(org.apache.nifi.stream.io.BufferedOutputStream) GZIPOutputStream(org.apache.nifi.stream.io.GZIPOutputStream) SnappyOutputStream(org.xerial.snappy.SnappyOutputStream) OutputStream(java.io.OutputStream) XZOutputStream(org.tukaani.xz.XZOutputStream) LzmaOutputStream(lzma.streams.LzmaOutputStream) CompressorStreamFactory(org.apache.commons.compress.compressors.CompressorStreamFactory) Decoder(lzma.sdk.lzma.Decoder) LzmaInputStream(lzma.streams.LzmaInputStream) BZip2CompressorInputStream(org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) BufferedInputStream(org.apache.nifi.stream.io.BufferedInputStream) SnappyInputStream(org.xerial.snappy.SnappyInputStream) FlowFile(org.apache.nifi.flowfile.FlowFile) XZInputStream(org.tukaani.xz.XZInputStream) SnappyFramedInputStream(org.xerial.snappy.SnappyFramedInputStream) InputStream(java.io.InputStream) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) ComponentLog(org.apache.nifi.logging.ComponentLog) StreamCallback(org.apache.nifi.processor.io.StreamCallback) ProcessException(org.apache.nifi.processor.exception.ProcessException) StopWatch(org.apache.nifi.util.StopWatch) LZMA2Options(org.tukaani.xz.LZMA2Options)
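
To isolate just the gzip decompression branch used above: a minimal sketch of inflating a (possibly concatenated) gzip stream with the two-argument GzipCompressorInputStream constructor, outside of any NiFi session. The file names are hypothetical:

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class GunzipConcatenated {
    public static void main(String[] args) throws IOException {
        // "true" enables decompressConcatenated, matching the processor's gzip case above
        try (InputStream in = new GzipCompressorInputStream(
                     new BufferedInputStream(new FileInputStream("input.gz"), 65536), true);
             OutputStream out = new FileOutputStream("output")) {
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) > 0) {
                out.write(buffer, 0, len);
            }
            out.flush();
        }
    }
}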

Example 35 with GzipCompressorInputStream

Use of org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream in project mmtf-spark by sbl-sdsc.

From the class JpredDataset, method getDataset().

/**
 * Gets the JPred 4/JNet (v.2.3.1) secondary structure dataset.
 *
 * @return the secondary structure dataset
 * @throws IOException
 *             if the file cannot be downloaded or read
 */
public static Dataset<Row> getDataset() throws IOException {
    List<Row> res = new ArrayList<Row>();
    URL u = new URL(URL);
    URLConnection conn = u.openConnection();
    InputStream in = conn.getInputStream();
    BufferedInputStream fin = new BufferedInputStream(in);
    GzipCompressorInputStream gzIn = new GzipCompressorInputStream(fin);
    TarArchiveInputStream tarIn = new TarArchiveInputStream(gzIn);
    TarArchiveEntry entry = null;
    Set<String> scopIDs = new HashSet<>();
    Map<String, String> sequences = new HashMap<String, String>();
    Map<String, String> secondaryStructures = new HashMap<String, String>();
    Map<String, String> trained = new HashMap<String, String>();
    while ((entry = (TarArchiveEntry) tarIn.getNextEntry()) != null) {
        if (entry.isDirectory()) {
            continue;
        }
        BufferedReader br = new BufferedReader(new InputStreamReader(tarIn));
        if (entry.getName().contains(".dssp")) {
            String scopID = br.readLine().substring(1);
            String secondaryStructure = br.readLine();
            secondaryStructure = secondaryStructure.replace("-", "C");
            secondaryStructures.put(scopID, secondaryStructure);
        } else if (entry.getName().contains(".fasta")) {
            String scopID = br.readLine().substring(1);
            String sequence = br.readLine();
            scopIDs.add(scopID);
            sequences.put(scopID, sequence);
            if (entry.getName().contains("training/"))
                trained.put(scopID, "true");
            else if (entry.getName().contains("blind/"))
                trained.put(scopID, "false");
        }
    }
    tarIn.close();
    Iterator<String> iter = scopIDs.iterator();
    while (iter.hasNext()) {
        String scopID = iter.next();
        res.add(RowFactory.create(scopID, sequences.get(scopID), secondaryStructures.get(scopID), trained.get(scopID)));
    }
    SparkSession spark = SparkSession.builder().getOrCreate();
    @SuppressWarnings("resource") JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Row> data = sc.parallelize(res);
    return JavaRDDToDataset.getDataset(data, "scopID", "sequence", "secondaryStructure", "trained");
}
Also used : GzipCompressorInputStream(org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) SparkSession(org.apache.spark.sql.SparkSession) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) BufferedInputStream(java.io.BufferedInputStream) TarArchiveInputStream(org.apache.commons.compress.archivers.tar.TarArchiveInputStream) InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) URL(java.net.URL) URLConnection(java.net.URLConnection) TarArchiveEntry(org.apache.commons.compress.archivers.tar.TarArchiveEntry) BufferedReader(java.io.BufferedReader) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) HashSet(java.util.HashSet)
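
The download-and-parse pattern above (URL → GzipCompressorInputStream → TarArchiveInputStream → a BufferedReader per entry) can be reduced to a small sketch. The URL and the idea of printing the first line of each entry are illustrative assumptions, not the JPred parsing logic:

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;

public class RemoteTarGzPeek {
    public static void main(String[] args) throws IOException {
        URL url = new URL("https://example.org/dataset.tar.gz"); // hypothetical URL
        try (TarArchiveInputStream tarIn = new TarArchiveInputStream(
                new GzipCompressorInputStream(new BufferedInputStream(url.openStream())))) {
            TarArchiveEntry entry;
            while ((entry = tarIn.getNextTarEntry()) != null) {
                if (entry.isDirectory()) {
                    continue;
                }
                // Do not close this reader: closing it would close the shared tar stream.
                // The reader can only see bytes of the current entry, so read-ahead is safe.
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(tarIn, StandardCharsets.UTF_8));
                System.out.println(entry.getName() + " -> " + br.readLine());
            }
        }
    }
}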

Aggregations

GzipCompressorInputStream (org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream) 58
TarArchiveInputStream (org.apache.commons.compress.archivers.tar.TarArchiveInputStream) 46
TarArchiveEntry (org.apache.commons.compress.archivers.tar.TarArchiveEntry) 40
IOException (java.io.IOException) 29
FileInputStream (java.io.FileInputStream) 26
File (java.io.File) 23
BufferedInputStream (java.io.BufferedInputStream) 22
FileOutputStream (java.io.FileOutputStream) 20
InputStream (java.io.InputStream) 16
OutputStream (java.io.OutputStream) 10
Path (java.nio.file.Path) 9
ArrayList (java.util.ArrayList) 8
BufferedOutputStream (java.io.BufferedOutputStream) 7
ByteArrayInputStream (java.io.ByteArrayInputStream) 7
BZip2CompressorInputStream (org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream) 6
BufferedReader (java.io.BufferedReader) 4
ByteArrayOutputStream (java.io.ByteArrayOutputStream) 4
InputStreamReader (java.io.InputStreamReader) 4
URL (java.net.URL) 4
ArchiveEntry (org.apache.commons.compress.archivers.ArchiveEntry) 4