
Example 11 with CompressionOutputStream

Use of org.apache.hadoop.io.compress.CompressionOutputStream in project hadoop by apache.

From the class TestZStandardCompressorDecompressor, method testCompressingWithOneByteOutputBuffer.

@Test
public void testCompressingWithOneByteOutputBuffer() throws Exception {
    int uncompressedSize = (int) FileUtils.sizeOf(uncompressedFile);
    byte[] bytes = FileUtils.readFileToByteArray(uncompressedFile);
    assertEquals(uncompressedSize, bytes.length);
    Configuration conf = new Configuration();
    ZStandardCodec codec = new ZStandardCodec();
    codec.setConf(conf);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    Compressor compressor = new ZStandardCompressor(3, IO_FILE_BUFFER_SIZE_DEFAULT, 1);
    CompressionOutputStream outputStream = codec.createOutputStream(baos, compressor);
    for (byte aByte : bytes) {
        outputStream.write(aByte);
    }
    outputStream.finish();
    outputStream.close();
    assertEquals(uncompressedSize, compressor.getBytesRead());
    assertTrue(compressor.finished());
    // just make sure we can decompress the file
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    Decompressor decompressor = codec.createDecompressor();
    CompressionInputStream inputStream = codec.createInputStream(bais, decompressor);
    byte[] buffer = new byte[100];
    int n = buffer.length;
    while ((n = inputStream.read(buffer, 0, n)) != -1) {
        byteArrayOutputStream.write(buffer, 0, n);
    }
    assertArrayEquals(bytes, byteArrayOutputStream.toByteArray());
}
Also used: CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), Decompressor (org.apache.hadoop.io.compress.Decompressor), Configuration (org.apache.hadoop.conf.Configuration), ByteArrayInputStream (java.io.ByteArrayInputStream), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), Compressor (org.apache.hadoop.io.compress.Compressor), ZStandardCodec (org.apache.hadoop.io.compress.ZStandardCodec), ByteArrayOutputStream (java.io.ByteArrayOutputStream), Test (org.junit.Test)
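
As a follow-up, the manual read loop at the end of this test can be replaced with Hadoop's IOUtils.copyBytes. A minimal sketch of the same round-trip check, assuming the codec, baos, and bytes variables from the test above are in scope:

ByteArrayOutputStream decompressed = new ByteArrayOutputStream();
CompressionInputStream in = codec.createInputStream(
        new ByteArrayInputStream(baos.toByteArray()), codec.createDecompressor());
// copyBytes(in, out, bufferSize, close) copies until EOF, then closes both streams.
org.apache.hadoop.io.IOUtils.copyBytes(in, decompressed, 4096, true);
assertArrayEquals(bytes, decompressed.toByteArray());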

Example 12 with CompressionOutputStream

Use of org.apache.hadoop.io.compress.CompressionOutputStream in project ignite by apache.

From the class HadoopSnappyTest, method checkSnappy.

/**
 * Internal check routine.
 *
 * @throws Throwable If failed.
 */
public static void checkSnappy() throws Throwable {
    try {
        byte[] expBytes = new byte[BYTE_SIZE];
        byte[] actualBytes = new byte[BYTE_SIZE];
        for (int i = 0; i < expBytes.length; i++) expBytes[i] = (byte) ThreadLocalRandom.current().nextInt(16);
        SnappyCodec codec = new SnappyCodec();
        codec.setConf(new Configuration());
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (CompressionOutputStream cos = codec.createOutputStream(baos)) {
            cos.write(expBytes);
            cos.flush();
        }
        try (CompressionInputStream cis = codec.createInputStream(new ByteArrayInputStream(baos.toByteArray()))) {
            int read = cis.read(actualBytes, 0, actualBytes.length);
            assert read == actualBytes.length;
        }
        assert Arrays.equals(expBytes, actualBytes);
    } catch (Throwable e) {
        System.out.println("Snappy check failed:");
        System.out.println("### NativeCodeLoader.isNativeCodeLoaded:  " + NativeCodeLoader.isNativeCodeLoaded());
        System.out.println("### SnappyCompressor.isNativeCodeLoaded:  " + SnappyCompressor.isNativeCodeLoaded());
        throw e;
    }
}
Also used: CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), Configuration (org.apache.hadoop.conf.Configuration), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), ByteArrayInputStream (java.io.ByteArrayInputStream), ByteArrayOutputStream (java.io.ByteArrayOutputStream), SnappyCodec (org.apache.hadoop.io.compress.SnappyCodec)
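
Because checkSnappy() requires the native Hadoop Snappy bindings, a caller can skip the check rather than fail when they are absent. A minimal JUnit 4 sketch (the test name is illustrative) using the same probes the catch block above prints:

@Test
public void testSnappyRoundTripWhenNativeAvailable() throws Throwable {
    // Skip, rather than fail, when the native hadoop/snappy libraries are not loaded.
    org.junit.Assume.assumeTrue(NativeCodeLoader.isNativeCodeLoaded());
    org.junit.Assume.assumeTrue(SnappyCompressor.isNativeCodeLoaded());
    HadoopSnappyTest.checkSnappy();
}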

Example 13 with CompressionOutputStream

Use of org.apache.hadoop.io.compress.CompressionOutputStream in project mongo-hadoop by mongodb.

From the class BSONSplitter, method run.

/**
 * When run as a Tool, BSONSplitter can be used to pre-split and compress
 * BSON files. This can be especially useful before uploading large BSON
 * files to HDFS to save time. The compressed splits are written to the
 * given output path or to the directory containing the input file, if
 * the output path is unspecified. A ".splits" file is not generated, since
 * each output file is expected to be its own split.
 *
 * @param args command-line arguments. Run with zero arguments to see usage.
 * @return exit status
 * @throws Exception
 */
@Override
public int run(final String[] args) throws Exception {
    if (args.length < 1) {
        printUsage();
        return 1;
    }
    // Parse command-line arguments.
    Path filePath = new Path(args[0]);
    String compressorName = null, outputDirectoryStr = null;
    Path outputDirectory;
    CompressionCodec codec;
    Compressor compressor;
    for (int i = 1; i < args.length; ++i) {
        if ("-c".equals(args[i]) && args.length > i) {
            compressorName = args[++i];
        } else if ("-o".equals(args[i]) && args.length > i) {
            outputDirectoryStr = args[++i];
        } else {
            // CHECKSTYLE:OFF
            System.err.println("unrecognized option: " + args[i]);
            // CHECKSTYLE:ON
            printUsage();
            return 1;
        }
    }
    // Supply default values for unspecified arguments.
    if (null == outputDirectoryStr) {
        outputDirectory = filePath.getParent();
    } else {
        outputDirectory = new Path(outputDirectoryStr);
    }
    if (null == compressorName) {
        codec = new DefaultCodec();
    } else {
        Class<?> codecClass = Class.forName(compressorName);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, getConf());
    }
    if (codec instanceof Configurable) {
        ((Configurable) codec).setConf(getConf());
    }
    // Do not write a .splits file so as not to confuse BSONSplitter.
    // Each compressed file will be its own split.
    MongoConfigUtil.setBSONWriteSplits(getConf(), false);
    // Open the file.
    FileSystem inputFS = FileSystem.get(filePath.toUri(), getConf());
    FileSystem outputFS = FileSystem.get(outputDirectory.toUri(), getConf());
    FSDataInputStream inputStream = inputFS.open(filePath);
    // Use BSONSplitter to split the file.
    Path splitFilePath = getSplitsFilePath(filePath, getConf());
    try {
        loadSplitsFromSplitFile(inputFS.getFileStatus(filePath), splitFilePath);
    } catch (NoSplitFileException e) {
        LOG.info("did not find .splits file in " + splitFilePath.toUri());
        setInputPath(filePath);
        readSplits();
    }
    List<BSONFileSplit> splits = getAllSplits();
    LOG.info("compressing " + splits.size() + " splits.");
    byte[] buf = new byte[1024 * 1024];
    for (int i = 0; i < splits.size(); ++i) {
        // e.g., hdfs:///user/hive/warehouse/mongo/OutputFile-42.bz2
        Path splitOutputPath = new Path(outputDirectory, filePath.getName() + "-" + i + codec.getDefaultExtension());
        // Compress the split into a new file.
        compressor = CodecPool.getCompressor(codec);
        CompressionOutputStream compressionOutputStream = null;
        try {
            compressionOutputStream = codec.createOutputStream(outputFS.create(splitOutputPath), compressor);
            int totalBytes = 0, bytesRead = 0;
            BSONFileSplit split = splits.get(i);
            inputStream.seek(split.getStart());
            LOG.info("writing " + splitOutputPath.toUri() + ".");
            while (totalBytes < split.getLength() && bytesRead >= 0) {
                bytesRead = inputStream.read(buf, 0, (int) Math.min(buf.length, split.getLength() - totalBytes));
                if (bytesRead > 0) {
                    compressionOutputStream.write(buf, 0, bytesRead);
                    totalBytes += bytesRead;
                }
            }
        } finally {
            if (compressionOutputStream != null) {
                compressionOutputStream.close();
            }
            CodecPool.returnCompressor(compressor);
        }
    }
    LOG.info("done.");
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit), Compressor (org.apache.hadoop.io.compress.Compressor), DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec), Configurable (org.apache.hadoop.conf.Configurable), FileSystem (org.apache.hadoop.fs.FileSystem), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
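
Since run() carries an @Override, BSONSplitter evidently implements Hadoop's Tool interface and can be driven with ToolRunner. A hypothetical driver (the BSONSplitter package name and the HDFS paths below are illustrative assumptions, not taken from the source):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import com.mongodb.hadoop.splitter.BSONSplitter; // assumed package

public class SplitAndCompress {
    public static void main(String[] args) throws Exception {
        // Pre-split dump.bson and write bzip2-compressed splits to /data/compressed;
        // "-c" and "-o" are the options parsed in run() above.
        int status = ToolRunner.run(new Configuration(), new BSONSplitter(), new String[] {
            "hdfs:///data/dump.bson",
            "-c", "org.apache.hadoop.io.compress.BZip2Codec",
            "-o", "hdfs:///data/compressed"
        });
        System.exit(status);
    }
}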

Example 14 with CompressionOutputStream

Use of org.apache.hadoop.io.compress.CompressionOutputStream in project hbase by apache.

From the class DataBlockEncodingTool, method benchmarkAlgorithm.

/**
 * Benchmark the compression and decompression performance of a given algorithm and print the results.
 * @param algorithm Compression algorithm.
 * @param name Name of algorithm.
 * @param buffer Buffer to be compressed.
 * @param offset Position of the beginning of the data.
 * @param length Length of data in buffer.
 * @throws IOException
 */
public void benchmarkAlgorithm(Compression.Algorithm algorithm, String name, byte[] buffer, int offset, int length) throws IOException {
    System.out.println(name + ":");
    // compress it
    List<Long> compressDurations = new ArrayList<>();
    ByteArrayOutputStream compressedStream = new ByteArrayOutputStream();
    CompressionOutputStream compressingStream = algorithm.createPlainCompressionStream(compressedStream, compressor);
    try {
        for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) {
            final long startTime = System.nanoTime();
            // The compressedStream must be reset before compressingStream.resetState(),
            // since for GZ, resetState() writes a header to the output stream.
            compressedStream.reset();
            compressingStream.resetState();
            compressingStream.write(buffer, offset, length);
            compressingStream.flush();
            compressedStream.toByteArray();
            final long finishTime = System.nanoTime();
            // add time record
            if (itTime >= benchmarkNOmit) {
                compressDurations.add(finishTime - startTime);
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(String.format("Benchmark, or encoding algorithm '%s' cause some stream problems", name), e);
    }
    compressingStream.close();
    printBenchmarkResult(length, compressDurations, Manipulation.COMPRESSION);
    byte[] compBuffer = compressedStream.toByteArray();
    // uncompress it several times and measure performance
    List<Long> durations = new ArrayList<>();
    for (int itTime = 0; itTime < benchmarkNTimes; ++itTime) {
        final long startTime = System.nanoTime();
        byte[] newBuf = new byte[length + 1];
        try {
            ByteArrayInputStream downStream = new ByteArrayInputStream(compBuffer, 0, compBuffer.length);
            InputStream decompressedStream = algorithm.createDecompressionStream(downStream, decompressor, 0);
            int destOffset = 0;
            int nextChunk;
            while ((nextChunk = decompressedStream.available()) > 0) {
                destOffset += decompressedStream.read(newBuf, destOffset, nextChunk);
            }
            decompressedStream.close();
        } catch (IOException e) {
            throw new RuntimeException(String.format("Decoding path in '%s' algorithm cause exception ", name), e);
        }
        final long finishTime = System.nanoTime();
        // check correctness
        if (0 != Bytes.compareTo(buffer, 0, length, newBuf, 0, length)) {
            int prefix = 0;
            for (; prefix < buffer.length && prefix < newBuf.length; ++prefix) {
                if (buffer[prefix] != newBuf[prefix]) {
                    break;
                }
            }
            throw new RuntimeException(String.format("Algorithm '%s' is corrupting the data", name));
        }
        // add time record
        if (itTime >= benchmarkNOmit) {
            durations.add(finishTime - startTime);
        }
    }
    printBenchmarkResult(length, durations, Manipulation.DECOMPRESSION);
    System.out.println();
}
Also used: CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), ByteArrayInputStream (java.io.ByteArrayInputStream), InputStream (java.io.InputStream), ArrayList (java.util.ArrayList), ByteArrayOutputStream (java.io.ByteArrayOutputStream), IOException (java.io.IOException)
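
printBenchmarkResult is not shown here; a hypothetical helper sketching the kind of summary it presumably produces, converting the recorded nanosecond durations into mean throughput in MB/s:

// Hypothetical helper, not part of DataBlockEncodingTool as quoted above.
static double meanThroughputMBs(int lengthBytes, java.util.List<Long> durationsNs) {
    long totalNs = 0;
    for (long d : durationsNs) {
        totalNs += d;
    }
    // Mean duration per iteration in seconds, then bytes -> MB over that time.
    double meanSeconds = (double) totalNs / durationsNs.size() / 1e9;
    return (lengthBytes / (1024.0 * 1024.0)) / meanSeconds;
}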

Example 15 with CompressionOutputStream

Use of org.apache.hadoop.io.compress.CompressionOutputStream in project brisk by riptano.

From the class CompressionTests, method testSnappyCompression.

@Test
public void testSnappyCompression() throws IOException {
    SnappyCodec c = new SnappyCodec(new Configuration());
    byte[] inmsg = new byte[1024 * 1024 * 10];
    fillArray(inmsg);
    byte[] buffer = new byte[1024 * 1024];
    byte[] outmsg = new byte[1024 * 1024 * 16];
    for (int k = 0; k < 64; k++) {
        ByteArrayOutputStream bout = new ByteArrayOutputStream();
        CompressionOutputStream cout = c.createOutputStream(bout);
        cout.write(inmsg);
        cout.flush();
        ByteArrayInputStream bin = new ByteArrayInputStream(bout.toByteArray());
        CompressionInputStream cin = c.createInputStream(bin);
        int totaln = 0;
        while (cin.available() > 0) {
            int n = cin.read(buffer);
            if (n < 0)
                break;
            try {
                System.arraycopy(buffer, 0, outmsg, totaln, n);
            } catch (Throwable t) {
                System.err.println("n = " + n + " totaln " + totaln);
                throw new RuntimeException(t);
            }
            totaln += n;
        }
        assertEquals(inmsg.length, totaln);
        for (int i = 0; i < inmsg.length; i++) {
            assertEquals(inmsg[i], outmsg[i]);
        }
        assertEquals(new String(inmsg), new String(outmsg, 0, totaln));
    }
}
Also used: CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), Configuration (org.apache.hadoop.conf.Configuration), ByteArrayInputStream (java.io.ByteArrayInputStream), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), ByteArrayOutputStream (java.io.ByteArrayOutputStream), Test (org.junit.Test)
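
One caveat about this test's read loop: InputStream.available() is not a reliable end-of-stream indicator, and a compressed stream may legitimately report 0 available bytes before all input has been consumed, ending the loop early. A more defensive sketch keyed on read() returning -1, reusing the buffer, outmsg, and cin variables from the test above:

int totaln = 0;
int n;
// read() returns -1 only at true end of stream, unlike available().
while ((n = cin.read(buffer)) != -1) {
    System.arraycopy(buffer, 0, outmsg, totaln, n);
    totaln += n;
}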

Aggregations

CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream): 15
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 9
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 8
Configuration (org.apache.hadoop.conf.Configuration): 8
ByteArrayInputStream (java.io.ByteArrayInputStream): 7
Test (org.junit.Test): 7
BufferedOutputStream (java.io.BufferedOutputStream): 5
DataOutputStream (java.io.DataOutputStream): 5
IOException (java.io.IOException): 4
DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer): 4
Compressor (org.apache.hadoop.io.compress.Compressor): 4
BufferedInputStream (java.io.BufferedInputStream): 3
DataInputStream (java.io.DataInputStream): 3
DataInputBuffer (org.apache.hadoop.io.DataInputBuffer): 3
Decompressor (org.apache.hadoop.io.compress.Decompressor): 3
SnappyCodec (org.apache.hadoop.io.compress.SnappyCodec): 3
File (java.io.File): 2
FileInputStream (java.io.FileInputStream): 2
FileOutputStream (java.io.FileOutputStream): 2
BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec): 2