Example 6 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project hbase by apache.

From class TestHFileBlock, method createTestV1Block.

public byte[] createTestV1Block(Compression.Algorithm algo) throws IOException {
    Compressor compressor = algo.getCompressor();
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    OutputStream os = algo.createCompressionStream(baos, compressor, 0);
    DataOutputStream dos = new DataOutputStream(os);
    // Let's make this a meta block.
    BlockType.META.write(dos);
    writeTestBlockContents(dos);
    dos.flush();
    algo.returnCompressor(compressor);
    return baos.toByteArray();
}
Also used : DataOutputStream(java.io.DataOutputStream) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) OutputStream(java.io.OutputStream) Compressor(org.apache.hadoop.io.compress.Compressor)
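For contrast, here is a minimal sketch (not part of the HBase test) of reading such a block back through the same Compression.Algorithm API, using the decompression counterparts getDecompressor(), createDecompressionStream() and returnDecompressor(); the helper name and exact signatures are assumptions, so treat this as an illustration rather than project code.

// Hypothetical counterpart to createTestV1Block: decompress the bytes and
// read back the block type magic written by BlockType.META.write(dos).
public BlockType readTestV1BlockType(Compression.Algorithm algo, byte[] compressedBlock) throws IOException {
    Decompressor decompressor = algo.getDecompressor();
    try {
        InputStream is = algo.createDecompressionStream(
            new ByteArrayInputStream(compressedBlock), decompressor, 0);
        DataInputStream dis = new DataInputStream(is);
        // BlockType.read consumes the magic bytes at the start of the block.
        return BlockType.read(dis);
    } finally {
        algo.returnDecompressor(decompressor);
    }
}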

Example 7 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project mongo-hadoop by mongodb.

From class BSONSplitter, method run.

/**
     * When run as a Tool, BSONSplitter can be used to pre-split and compress
     * BSON files. This can be especially useful before uploading large BSON
     * files to HDFS to save time. The compressed splits are written to the
     * given output path or to the directory containing the input file, if
     * the output path is unspecified. A ".splits" file is not generated, since
     * each output file is expected to be its own split.
     *
     * @param args command-line arguments. Run with zero arguments to see usage.
     * @return exit status
     * @throws Exception
     */
@Override
public int run(final String[] args) throws Exception {
    if (args.length < 1) {
        printUsage();
        return 1;
    }
    // Parse command-line arguments.
    Path filePath = new Path(args[0]);
    String compressorName = null, outputDirectoryStr = null;
    Path outputDirectory;
    CompressionCodec codec;
    Compressor compressor;
    for (int i = 1; i < args.length; ++i) {
        if ("-c".equals(args[i]) && args.length > i) {
            compressorName = args[++i];
        } else if ("-o".equals(args[i]) && args.length > i) {
            outputDirectoryStr = args[++i];
        } else {
            // CHECKSTYLE:OFF
            System.err.println("unrecognized option: " + args[i]);
            // CHECKSTYLE:ON
            printUsage();
            return 1;
        }
    }
    // Supply default values for unspecified arguments.
    if (null == outputDirectoryStr) {
        outputDirectory = filePath.getParent();
    } else {
        outputDirectory = new Path(outputDirectoryStr);
    }
    if (null == compressorName) {
        codec = new DefaultCodec();
    } else {
        Class<?> codecClass = Class.forName(compressorName);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, getConf());
    }
    if (codec instanceof Configurable) {
        ((Configurable) codec).setConf(getConf());
    }
    // Do not write a .splits file so as not to confuse BSONSplitter.
    // Each compressed file will be its own split.
    MongoConfigUtil.setBSONWriteSplits(getConf(), false);
    // Open the file.
    FileSystem inputFS = FileSystem.get(filePath.toUri(), getConf());
    FileSystem outputFS = FileSystem.get(outputDirectory.toUri(), getConf());
    FSDataInputStream inputStream = inputFS.open(filePath);
    // Use BSONSplitter to split the file.
    Path splitFilePath = getSplitsFilePath(filePath, getConf());
    try {
        loadSplitsFromSplitFile(inputFS.getFileStatus(filePath), splitFilePath);
    } catch (NoSplitFileException e) {
        LOG.info("did not find .splits file in " + splitFilePath.toUri());
        setInputPath(filePath);
        readSplits();
    }
    List<BSONFileSplit> splits = getAllSplits();
    LOG.info("compressing " + splits.size() + " splits.");
    byte[] buf = new byte[1024 * 1024];
    for (int i = 0; i < splits.size(); ++i) {
        // e.g., hdfs:///user/hive/warehouse/mongo/OutputFile-42.bz2
        Path splitOutputPath = new Path(outputDirectory, filePath.getName() + "-" + i + codec.getDefaultExtension());
        // Compress the split into a new file.
        compressor = CodecPool.getCompressor(codec);
        CompressionOutputStream compressionOutputStream = null;
        try {
            compressionOutputStream = codec.createOutputStream(outputFS.create(splitOutputPath), compressor);
            int totalBytes = 0, bytesRead = 0;
            BSONFileSplit split = splits.get(i);
            inputStream.seek(split.getStart());
            LOG.info("writing " + splitOutputPath.toUri() + ".");
            while (totalBytes < split.getLength() && bytesRead >= 0) {
                bytesRead = inputStream.read(buf, 0, (int) Math.min(buf.length, split.getLength() - totalBytes));
                if (bytesRead > 0) {
                    compressionOutputStream.write(buf, 0, bytesRead);
                    totalBytes += bytesRead;
                }
            }
        } finally {
            if (compressionOutputStream != null) {
                compressionOutputStream.close();
            }
            CodecPool.returnCompressor(compressor);
        }
    }
    LOG.info("done.");
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) CompressionOutputStream(org.apache.hadoop.io.compress.CompressionOutputStream) BSONFileSplit(com.mongodb.hadoop.input.BSONFileSplit) Compressor(org.apache.hadoop.io.compress.Compressor) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) Configurable(org.apache.hadoop.conf.Configurable) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
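As a usage sketch (not taken from mongo-hadoop), the splitter can be launched through Hadoop's ToolRunner; the package name com.mongodb.hadoop.splitter and the example paths and codec below are assumptions, but the flags mirror the -c and -o options parsed in run() above.

// Hypothetical driver: pre-split and compress a BSON file before processing it in HDFS.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import com.mongodb.hadoop.splitter.BSONSplitter;

public class SplitAndCompressBson {
    public static void main(String[] args) throws Exception {
        String[] toolArgs = {
            "hdfs:///data/collection.bson",
            "-c", "org.apache.hadoop.io.compress.BZip2Codec",
            "-o", "hdfs:///data/compressed-splits"
        };
        // ToolRunner wires up the Configuration and invokes BSONSplitter.run(toolArgs).
        System.exit(ToolRunner.run(new Configuration(), new BSONSplitter(), toolArgs));
    }
}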

Example 8 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project hadoop by apache.

From class TestZlibCompressorDecompressor, method testZlibCompressorDecompressorSetDictionary.

@Test
public void testZlibCompressorDecompressorSetDictionary() {
    Configuration conf = new Configuration();
    if (ZlibFactory.isNativeZlibLoaded(conf)) {
        Compressor zlibCompressor = ZlibFactory.getZlibCompressor(conf);
        Decompressor zlibDecompressor = ZlibFactory.getZlibDecompressor(conf);
        checkSetDictionaryNullPointerException(zlibCompressor);
        checkSetDictionaryNullPointerException(zlibDecompressor);
        checkSetDictionaryArrayIndexOutOfBoundsException(zlibDecompressor);
        checkSetDictionaryArrayIndexOutOfBoundsException(zlibCompressor);
    } else {
        assertTrue("ZlibFactory is using native libs against request", ZlibFactory.isNativeZlibLoaded(conf));
    }
}
Also used : ZlibDirectDecompressor(org.apache.hadoop.io.compress.zlib.ZlibDecompressor.ZlibDirectDecompressor) Decompressor(org.apache.hadoop.io.compress.Decompressor) Configuration(org.apache.hadoop.conf.Configuration) Compressor(org.apache.hadoop.io.compress.Compressor) Test(org.junit.Test)
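The helper methods above are not shown in the excerpt; the following is a hedged sketch of what the Compressor-side checks presumably verify, assuming JUnit's Assert.fail and the setDictionary(byte[], int, int) signature shared by Compressor and Decompressor (the Decompressor overloads would look analogous). The real test's helpers may differ.

// Hypothetical reconstruction of the dictionary-validation helpers.
private void checkSetDictionaryNullPointerException(Compressor compressor) {
    try {
        compressor.setDictionary(null, 0, 1);
        fail("expected NullPointerException for a null dictionary");
    } catch (NullPointerException expected) {
        // expected: a null dictionary buffer must be rejected
    }
}

private void checkSetDictionaryArrayIndexOutOfBoundsException(Compressor compressor) {
    try {
        byte[] dictionary = new byte[4];
        // A length past the end of the buffer should be rejected.
        compressor.setDictionary(dictionary, 0, dictionary.length + 1);
        fail("expected ArrayIndexOutOfBoundsException for an out-of-range length");
    } catch (ArrayIndexOutOfBoundsException expected) {
        // expected: offset and length must stay within the dictionary buffer
    }
}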

Example 9 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project hadoop by apache.

From class TestZlibCompressorDecompressor, method testZlibCompressorDecompressorWithConfiguration.

@Test
public void testZlibCompressorDecompressorWithConfiguration() {
    Configuration conf = new Configuration();
    if (ZlibFactory.isNativeZlibLoaded(conf)) {
        byte[] rawData;
        int tryNumber = 5;
        int BYTE_SIZE = 10 * 1024;
        Compressor zlibCompressor = ZlibFactory.getZlibCompressor(conf);
        Decompressor zlibDecompressor = ZlibFactory.getZlibDecompressor(conf);
        rawData = generate(BYTE_SIZE);
        try {
            for (int i = 0; i < tryNumber; i++) compressDecompressZlib(rawData, (ZlibCompressor) zlibCompressor, (ZlibDecompressor) zlibDecompressor);
            zlibCompressor.reinit(conf);
        } catch (Exception ex) {
            fail("testZlibCompressorDecompressorWithConfiguration ex error " + ex);
        }
    } else {
        assertTrue("ZlibFactory is using native libs against request", ZlibFactory.isNativeZlibLoaded(conf));
    }
}
Also used : ZlibDirectDecompressor(org.apache.hadoop.io.compress.zlib.ZlibDecompressor.ZlibDirectDecompressor) Decompressor(org.apache.hadoop.io.compress.Decompressor) Configuration(org.apache.hadoop.conf.Configuration) Compressor(org.apache.hadoop.io.compress.Compressor) IOException(java.io.IOException) Test(org.junit.Test)
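The compressDecompressZlib helper is not shown either; below is a minimal round-trip sketch over the raw Compressor/Decompressor interfaces (setInput, finish, compress, decompress), assuming JUnit's static asserts. The buffer sizing is simplified for small, test-sized inputs and is not necessarily what the Hadoop helper does.

// Hypothetical round trip: compress rawData with the given Compressor, then
// decompress it again and check the bytes survive unchanged.
private void roundTrip(byte[] rawData, Compressor compressor, Decompressor decompressor) throws IOException {
    // Generous output buffer; fine for small test inputs, not for arbitrary data.
    byte[] compressed = new byte[rawData.length * 2 + 64];
    compressor.setInput(rawData, 0, rawData.length);
    compressor.finish();
    int compressedLen = 0;
    while (!compressor.finished()) {
        compressedLen += compressor.compress(compressed, compressedLen, compressed.length - compressedLen);
    }
    byte[] restored = new byte[rawData.length];
    decompressor.setInput(compressed, 0, compressedLen);
    int restoredLen = 0;
    while (!decompressor.finished() && restoredLen < restored.length) {
        restoredLen += decompressor.decompress(restored, restoredLen, restored.length - restoredLen);
    }
    assertEquals(rawData.length, restoredLen);
    assertArrayEquals(rawData, restored);
    compressor.reset();
    decompressor.reset();
}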

Example 10 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project hadoop by apache.

From class TestZStandardCompressorDecompressor, method testCompressionCompressesCorrectly.

@Test
public void testCompressionCompressesCorrectly() throws Exception {
    int uncompressedSize = (int) FileUtils.sizeOf(uncompressedFile);
    byte[] bytes = FileUtils.readFileToByteArray(uncompressedFile);
    assertEquals(uncompressedSize, bytes.length);
    Configuration conf = new Configuration();
    ZStandardCodec codec = new ZStandardCodec();
    codec.setConf(conf);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    Compressor compressor = codec.createCompressor();
    CompressionOutputStream outputStream = codec.createOutputStream(baos, compressor);
    for (byte aByte : bytes) {
        outputStream.write(aByte);
    }
    outputStream.finish();
    outputStream.close();
    assertEquals(uncompressedSize, compressor.getBytesRead());
    assertTrue(compressor.finished());
    // just make sure we can decompress the file
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    Decompressor decompressor = codec.createDecompressor();
    CompressionInputStream inputStream = codec.createInputStream(bais, decompressor);
    byte[] buffer = new byte[100];
    int n = buffer.length;
    while ((n = inputStream.read(buffer, 0, n)) != -1) {
        byteArrayOutputStream.write(buffer, 0, n);
    }
    assertArrayEquals(bytes, byteArrayOutputStream.toByteArray());
}
Also used : CompressionOutputStream(org.apache.hadoop.io.compress.CompressionOutputStream) Decompressor(org.apache.hadoop.io.compress.Decompressor) Configuration(org.apache.hadoop.conf.Configuration) ByteArrayInputStream(java.io.ByteArrayInputStream) CompressionInputStream(org.apache.hadoop.io.compress.CompressionInputStream) Compressor(org.apache.hadoop.io.compress.Compressor) ZStandardCodec(org.apache.hadoop.io.compress.ZStandardCodec) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Test(org.junit.Test)
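A related variant, sketched here as an illustration rather than code from the Hadoop test suite: the same zstd round trip with the Compressor and Decompressor borrowed from CodecPool so they can be reused across calls. It assumes the native zstd library is loadable; the method name and buffer size are arbitrary.

// Hypothetical helper: compress and decompress a buffer with ZStandardCodec,
// borrowing codec resources from CodecPool and returning them afterwards.
public static byte[] zstdRoundTrip(Configuration conf, byte[] input) throws IOException {
    ZStandardCodec codec = new ZStandardCodec();
    codec.setConf(conf);
    Compressor compressor = CodecPool.getCompressor(codec);
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    try {
        // Compress the whole input into an in-memory buffer.
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        CompressionOutputStream out = codec.createOutputStream(compressed, compressor);
        out.write(input);
        out.finish();
        out.close();
        // Decompress it again and hand back the restored bytes.
        ByteArrayOutputStream restored = new ByteArrayOutputStream();
        CompressionInputStream in = codec.createInputStream(
            new ByteArrayInputStream(compressed.toByteArray()), decompressor);
        IOUtils.copyBytes(in, restored, 4096, true);
        return restored.toByteArray();
    } finally {
        CodecPool.returnCompressor(compressor);
        CodecPool.returnDecompressor(decompressor);
    }
}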

Aggregations

Compressor (org.apache.hadoop.io.compress.Compressor): 13
Configuration (org.apache.hadoop.conf.Configuration): 5
Decompressor (org.apache.hadoop.io.compress.Decompressor): 5
Test (org.junit.Test): 5
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 3
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 3
CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream): 3
ZlibDirectDecompressor (org.apache.hadoop.io.compress.zlib.ZlibDecompressor.ZlibDirectDecompressor): 3
ByteArrayInputStream (java.io.ByteArrayInputStream): 2
IOException (java.io.IOException): 2
OutputStream (java.io.OutputStream): 2
Configurable (org.apache.hadoop.conf.Configurable): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
DoNotRetryIOException (org.apache.hadoop.hbase.DoNotRetryIOException): 2
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 2
ZStandardCodec (org.apache.hadoop.io.compress.ZStandardCodec): 2
JsonGenerator (com.fasterxml.jackson.core.JsonGenerator): 1
BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit): 1
DataOutputStream (java.io.DataOutputStream): 1
BufferOverflowException (java.nio.BufferOverflowException): 1