Example 11 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project hadoop by apache.

The class TestZStandardCompressorDecompressor, method testCompressingWithOneByteOutputBuffer.

@Test
public void testCompressingWithOneByteOutputBuffer() throws Exception {
    int uncompressedSize = (int) FileUtils.sizeOf(uncompressedFile);
    byte[] bytes = FileUtils.readFileToByteArray(uncompressedFile);
    assertEquals(uncompressedSize, bytes.length);
    Configuration conf = new Configuration();
    ZStandardCodec codec = new ZStandardCodec();
    codec.setConf(conf);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
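    // Level 3, the default input-buffer size, and a one-byte output buffer,
    // which forces the compressor to flush after every byte it produces.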
    Compressor compressor = new ZStandardCompressor(3, IO_FILE_BUFFER_SIZE_DEFAULT, 1);
    CompressionOutputStream outputStream = codec.createOutputStream(baos, compressor);
    for (byte aByte : bytes) {
        outputStream.write(aByte);
    }
    outputStream.finish();
    outputStream.close();
    assertEquals(uncompressedSize, compressor.getBytesRead());
    assertTrue(compressor.finished());
    // just make sure we can decompress the file
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
    Decompressor decompressor = codec.createDecompressor();
    CompressionInputStream inputStream = codec.createInputStream(bais, decompressor);
    byte[] buffer = new byte[100];
    int n;
    while ((n = inputStream.read(buffer, 0, buffer.length)) != -1) {
        byteArrayOutputStream.write(buffer, 0, n);
    }
    assertArrayEquals(bytes, byteArrayOutputStream.toByteArray());
}
Also used: CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), Decompressor (org.apache.hadoop.io.compress.Decompressor), Configuration (org.apache.hadoop.conf.Configuration), ByteArrayInputStream (java.io.ByteArrayInputStream), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), Compressor (org.apache.hadoop.io.compress.Compressor), ZStandardCodec (org.apache.hadoop.io.compress.ZStandardCodec), ByteArrayOutputStream (java.io.ByteArrayOutputStream), Test (org.junit.Test)
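
For contrast with the test above, which constructs a ZStandardCompressor by hand, everyday code usually borrows one from CodecPool and returns it afterwards. A minimal round-trip sketch, assuming native zstd support is loaded and using only the default configuration (the class name ZStandardRoundTrip is illustrative, not from the Hadoop sources):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.ZStandardCodec;

public class ZStandardRoundTrip {
    public static byte[] roundTrip(byte[] input) throws Exception {
        Configuration conf = new Configuration();
        ZStandardCodec codec = new ZStandardCodec();
        codec.setConf(conf);
        // Borrow a compressor from the pool instead of constructing one.
        Compressor compressor = CodecPool.getCompressor(codec);
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        try {
            CompressionOutputStream out = codec.createOutputStream(compressed, compressor);
            out.write(input);
            out.close(); // finishes the stream and flushes the final frame
        } finally {
            CodecPool.returnCompressor(compressor);
        }
        // Decompress with a pooled decompressor and collect the output.
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        ByteArrayOutputStream restored = new ByteArrayOutputStream();
        try {
            CompressionInputStream in = codec.createInputStream(
                    new ByteArrayInputStream(compressed.toByteArray()), decompressor);
            byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer, 0, buffer.length)) != -1) {
                restored.write(buffer, 0, n);
            }
            in.close();
        } finally {
            CodecPool.returnDecompressor(decompressor);
        }
        return restored.toByteArray();
    }
}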

Example 12 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project hadoop by apache.

The class Anonymizer, method createJsonGenerator.

// Creates a JSON generator for the given path, compressing the output when
// the path's extension maps to a registered codec.
private JsonGenerator createJsonGenerator(Configuration conf, Path path) throws IOException {
    FileSystem outFS = path.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    OutputStream output;
    Compressor compressor = null;
    if (codec != null) {
        compressor = CodecPool.getCompressor(codec);
        output = codec.createOutputStream(outFS.create(path), compressor);
    } else {
        output = outFS.create(path);
    }
    JsonGenerator outGen = outFactory.createGenerator(output, JsonEncoding.UTF8);
    outGen.useDefaultPrettyPrinter();
    return outGen;
}
Also used: CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), FileSystem (org.apache.hadoop.fs.FileSystem), OutputStream (java.io.OutputStream), Compressor (org.apache.hadoop.io.compress.Compressor), JsonGenerator (com.fasterxml.jackson.core.JsonGenerator), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
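
Note that this method borrows a pooled compressor but never returns it; the surrounding class presumably releases it once the generator is closed. A hedged sketch of that cleanup step (the helper class and method below are illustrative names, not part of the Anonymizer source):

import java.io.IOException;
import com.fasterxml.jackson.core.JsonGenerator;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Compressor;

final class JsonGeneratorCleanup {
    // Hypothetical helper: closing the generator also closes the wrapped
    // CompressionOutputStream (Jackson's AUTO_CLOSE_TARGET is on by default),
    // after which the pooled compressor can be handed back.
    static void close(JsonGenerator outGen, Compressor compressor) throws IOException {
        try {
            outGen.close();
        } finally {
            if (compressor != null) {
                CodecPool.returnCompressor(compressor);
            }
        }
    }
}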

Example 13 with Compressor

Use of org.apache.hadoop.io.compress.Compressor in project mongo-hadoop by mongodb.

The class BSONSplitter, method run.

/**
     * When run as a Tool, BSONSplitter can be used to pre-split and compress
     * BSON files. This can be especially useful before uploading large BSON
     * files to HDFS to save time. The compressed splits are written to the
     * given output path or to the directory containing the input file, if
     * the output path is unspecified. A ".splits" file is not generated, since
     * each output file is expected to be its own split.
     *
     * @param args command-line arguments. Run with zero arguments to see usage.
     * @return exit status
     * @throws Exception
     */
@Override
public int run(final String[] args) throws Exception {
    if (args.length < 1) {
        printUsage();
        return 1;
    }
    // Parse command-line arguments.
    Path filePath = new Path(args[0]);
    String compressorName = null, outputDirectoryStr = null;
    Path outputDirectory;
    CompressionCodec codec;
    Compressor compressor;
    for (int i = 1; i < args.length; ++i) {
        if ("-c".equals(args[i]) && args.length > i) {
            compressorName = args[++i];
        } else if ("-o".equals(args[i]) && args.length > i) {
            outputDirectoryStr = args[++i];
        } else {
            // CHECKSTYLE:OFF
            System.err.println("unrecognized option: " + args[i]);
            // CHECKSTYLE:ON
            printUsage();
            return 1;
        }
    }
    // Supply default values for unspecified arguments.
    if (null == outputDirectoryStr) {
        outputDirectory = filePath.getParent();
    } else {
        outputDirectory = new Path(outputDirectoryStr);
    }
    if (null == compressorName) {
        codec = new DefaultCodec();
    } else {
        Class<?> codecClass = Class.forName(compressorName);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, getConf());
    }
    if (codec instanceof Configurable) {
        ((Configurable) codec).setConf(getConf());
    }
    // Do not write a .splits file so as not to confuse BSONSplitter.
    // Each compressed file will be its own split.
    MongoConfigUtil.setBSONWriteSplits(getConf(), false);
    // Open the file.
    FileSystem inputFS = FileSystem.get(filePath.toUri(), getConf());
    FileSystem outputFS = FileSystem.get(outputDirectory.toUri(), getConf());
    FSDataInputStream inputStream = inputFS.open(filePath);
    // Use BSONSplitter to split the file.
    Path splitFilePath = getSplitsFilePath(filePath, getConf());
    try {
        loadSplitsFromSplitFile(inputFS.getFileStatus(filePath), splitFilePath);
    } catch (NoSplitFileException e) {
        LOG.info("did not find .splits file in " + splitFilePath.toUri());
        setInputPath(filePath);
        readSplits();
    }
    List<BSONFileSplit> splits = getAllSplits();
    LOG.info("compressing " + splits.size() + " splits.");
    byte[] buf = new byte[1024 * 1024];
    for (int i = 0; i < splits.size(); ++i) {
        // e.g., hdfs:///user/hive/warehouse/mongo/OutputFile-42.bz2
        Path splitOutputPath = new Path(outputDirectory, filePath.getName() + "-" + i + codec.getDefaultExtension());
        // Compress the split into a new file.
        compressor = CodecPool.getCompressor(codec);
        CompressionOutputStream compressionOutputStream = null;
        try {
            compressionOutputStream = codec.createOutputStream(outputFS.create(splitOutputPath), compressor);
            int totalBytes = 0, bytesRead = 0;
            BSONFileSplit split = splits.get(i);
            inputStream.seek(split.getStart());
            LOG.info("writing " + splitOutputPath.toUri() + ".");
            while (totalBytes < split.getLength() && bytesRead >= 0) {
                bytesRead = inputStream.read(buf, 0, (int) Math.min(buf.length, split.getLength() - totalBytes));
                if (bytesRead > 0) {
                    compressionOutputStream.write(buf, 0, bytesRead);
                    totalBytes += bytesRead;
                }
            }
        } finally {
            if (compressionOutputStream != null) {
                compressionOutputStream.close();
            }
            CodecPool.returnCompressor(compressor);
        }
    }
    LOG.info("done.");
    return 0;
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit), Compressor (org.apache.hadoop.io.compress.Compressor), DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec), Configurable (org.apache.hadoop.conf.Configurable), FileSystem (org.apache.hadoop.fs.FileSystem), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
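
Since run(String[]) above overrides the Tool contract, the splitter can be driven with ToolRunner. A minimal invocation sketch, assuming BSONSplitter implements Tool (as its @Override run(String[]) suggests) and lives in com.mongodb.hadoop.splitter (the package is not shown in the snippet); the -c and -o flags follow the argument parsing above, and the paths are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
import com.mongodb.hadoop.splitter.BSONSplitter; // package assumed

public class SplitAndCompress {
    public static void main(String[] args) throws Exception {
        // Pre-split and bzip2-compress large.bson; the parts land in /data/splits
        // as large.bson-0.bz2, large.bson-1.bz2, and so on.
        int exitCode = ToolRunner.run(new Configuration(), new BSONSplitter(),
                new String[] {
                        "hdfs:///data/large.bson",
                        "-c", "org.apache.hadoop.io.compress.BZip2Codec",
                        "-o", "hdfs:///data/splits"
                });
        System.exit(exitCode);
    }
}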

Aggregations

Compressor (org.apache.hadoop.io.compress.Compressor): 13 usages
Configuration (org.apache.hadoop.conf.Configuration): 5 usages
Decompressor (org.apache.hadoop.io.compress.Decompressor): 5 usages
Test (org.junit.Test): 5 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 3 usages
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 3 usages
CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream): 3 usages
ZlibDirectDecompressor (org.apache.hadoop.io.compress.zlib.ZlibDecompressor.ZlibDirectDecompressor): 3 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 2 usages
IOException (java.io.IOException): 2 usages
OutputStream (java.io.OutputStream): 2 usages
Configurable (org.apache.hadoop.conf.Configurable): 2 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 2 usages
DoNotRetryIOException (org.apache.hadoop.hbase.DoNotRetryIOException): 2 usages
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 2 usages
ZStandardCodec (org.apache.hadoop.io.compress.ZStandardCodec): 2 usages
JsonGenerator (com.fasterxml.jackson.core.JsonGenerator): 1 usage
BSONFileSplit (com.mongodb.hadoop.input.BSONFileSplit): 1 usage
DataOutputStream (java.io.DataOutputStream): 1 usage
BufferOverflowException (java.nio.BufferOverflowException): 1 usage