
Example 1 with CompressionInputStream

Use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.

From the class CompressionEmulationUtil, method getPossiblyDecompressedInputStream.

/**
   * Returns an {@link InputStream} for a file that might be compressed.
   */
static InputStream getPossiblyDecompressedInputStream(Path file, Configuration conf, long offset) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    if (isCompressionEmulationEnabled(conf) && isInputCompressionEmulationEnabled(conf)) {
        CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecs.getCodec(file);
        if (codec != null) {
            Decompressor decompressor = CodecPool.getDecompressor(codec);
            if (decompressor != null) {
                CompressionInputStream in = codec.createInputStream(fs.open(file), decompressor);
                // TODO: the offset is not applied to compressed input; use SplittableCompressionCodec?
                return (InputStream) in;
            }
        }
    }
    FSDataInputStream in = fs.open(file);
    in.seek(offset);
    return (InputStream) in;
}
Also used: Decompressor (org.apache.hadoop.io.compress.Decompressor), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), InputStream (java.io.InputStream), FileSystem (org.apache.hadoop.fs.FileSystem), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
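
The helper above borrows a Decompressor from CodecPool but never hands it back, since the stream's lifetime is managed by the caller. Below is a minimal standalone sketch of the same codec/decompressor pattern that also returns the decompressor to the pool; the path and Configuration here are placeholders, not part of the original code.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class DecompressToStdout {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Hypothetical input path; the codec is chosen from the file extension
        Path file = new Path("/tmp/input.txt.gz");
        FileSystem fs = file.getFileSystem(conf);

        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        if (codec == null) {
            throw new IOException("No compression codec found for " + file);
        }
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try (InputStream in = codec.createInputStream(fs.open(file), decompressor)) {
            byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer)) > 0) {
                // Copy decompressed bytes to stdout
                System.out.write(buffer, 0, n);
            }
            System.out.flush();
        } finally {
            // Return the pooled decompressor so it can be reused
            CodecPool.returnDecompressor(decompressor);
        }
    }
}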

Example 2 with CompressionInputStream

Use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.

From the class TestConcatenatedCompressedInput, method testBuiltInGzipDecompressor.

/**
   * Test using the new BuiltInGzipDecompressor codec for reading gzip files.
   */
// NOTE:  This fails on RHEL4 with "java.io.IOException: header crc mismatch"
//        due to buggy version of zlib (1.2.1.2) included.
@Test
public void testBuiltInGzipDecompressor() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);
    // Don't use native libs for this test
    ZlibFactory.setNativeZlibLoaded(false);
    assertEquals("[non-native (Java) codec]", org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class, gzip.getDecompressorType());
    System.out.println(COLOR_BR_YELLOW + "testBuiltInGzipDecompressor() using" + " non-native (Java Inflater) Decompressor (" + gzip.getDecompressorType() + ")" + COLOR_NORMAL);
    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.setInputPaths(jobConf, workDir);
    // here's first pair of DecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2734, in1.available());
    // w/hdr CRC
    assertEquals("concat bytes available", 3413, in2.available());
    CompressionInputStream cin2 = gzip.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();
    int numBytes, totalBytes = 0, lineNum = 0;
    while ((numBytes = in.readLine(out)) > 0) {
        ++lineNum;
        totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file", 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file", 84, lineNum);
    ZlibFactory.loadNativeZLib();
    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME?  could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    doMultipleGzipBufferSizes(jobConf, true);
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), Text (org.apache.hadoop.io.Text), FileInputStream (java.io.FileInputStream), LineReader (org.apache.hadoop.util.LineReader), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), Test (org.junit.Test)
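
The read path this test exercises, wrapping a raw stream in the codec's CompressionInputStream and iterating with LineReader, also works outside a MapReduce job. Here is a small standalone sketch, with a placeholder file name instead of the test data:

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipLineCount {
    public static void main(String[] args) throws IOException {
        GzipCodec gzip = new GzipCodec();
        // The codec needs a Configuration (buffer sizes, native vs. built-in zlib)
        ReflectionUtils.setConf(gzip, new Configuration());

        // Placeholder path; a concatenation of gzip members is read as one stream
        try (FileInputStream raw = new FileInputStream("/tmp/sample.txt.gz");
             CompressionInputStream in = gzip.createInputStream(raw)) {
            LineReader reader = new LineReader(in);
            Text line = new Text();
            int lines = 0;
            int bytes = 0;
            int n;
            while ((n = reader.readLine(line)) > 0) {
                lines++;
                bytes += n;
            }
            System.out.println(lines + " lines, " + bytes + " uncompressed bytes");
        }
    }
}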

Example 3 with CompressionInputStream

Use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.

From the class FixedLengthRecordReader, method initialize.

// This is also called from the old FixedLengthRecordReader API implementation
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;
    long partialRecordLength = start % recordLength;
    long numBytesToSkip = 0;
    if (partialRecordLength != 0) {
        numBytesToSkip = recordLength - partialRecordLength;
    }
    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        numRecordsRemainingInSplit = Long.MAX_VALUE;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        long splitSize = end - start - numBytesToSkip;
        numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
        if (numRecordsRemainingInSplit < 0) {
            numRecordsRemainingInSplit = 0;
        }
        LOG.info("Expecting " + numRecordsRemainingInSplit + " records each with a length of " + recordLength + " bytes in the split with an effective size of " + splitSize + " bytes");
    }
    if (numBytesToSkip != 0) {
        start += inputStream.skip(numBytesToSkip);
    }
    this.pos = start;
}
Also used: CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), FileSystem (org.apache.hadoop.fs.FileSystem), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
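
The initialize() method aligns the split to a record boundary and then counts whole records with a ceiling division. A tiny illustration of that arithmetic follows, using made-up numbers that are purely illustrative and not taken from the original code:

public class SplitAlignmentDemo {
    public static void main(String[] args) {
        long recordLength = 100;   // hypothetical fixed record size in bytes
        long start = 250;          // hypothetical split start offset
        long end = 980;            // hypothetical split end offset

        // Skip the tail of a record that began in the previous split
        long partialRecordLength = start % recordLength;                      // 50
        long numBytesToSkip = (partialRecordLength != 0)
                ? recordLength - partialRecordLength : 0;                     // 50

        // Ceiling division: a record that starts in this split is counted even
        // if it extends past the split's end
        long splitSize = end - start - numBytesToSkip;                        // 680
        long numRecords = (splitSize + recordLength - 1) / recordLength;      // 7
        System.out.println("skip " + numBytesToSkip + " bytes, expect " + numRecords + " records");
    }
}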

Example 4 with CompressionInputStream

Use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.

From the class TestTextOutputFormat, method testCompress.

/**
   * Test that TextOutputFormat writes a compressed file when output compression
   * is enabled, and that the file can be read back correctly.
   * @throws IOException
   */
@Test
public void testCompress() throws IOException {
    JobConf job = new JobConf();
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    job.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS, "true");
    FileOutputFormat.setOutputPath(job, workDir.getParent().getParent());
    FileOutputFormat.setWorkOutputPath(job, workDir);
    FileSystem fs = workDir.getFileSystem(job);
    if (!fs.mkdirs(workDir)) {
        fail("Failed to create output directory");
    }
    String file = "test_compress.txt";
    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;
    TextOutputFormat<Object, Object> theOutputFormat = new TextOutputFormat<Object, Object>();
    RecordWriter<Object, Object> theRecordWriter = theOutputFormat.getRecordWriter(localFs, job, file, reporter);
    Text key1 = new Text("key1");
    Text key2 = new Text("key2");
    Text val1 = new Text("val1");
    Text val2 = new Text("val2");
    NullWritable nullWritable = NullWritable.get();
    try {
        theRecordWriter.write(key1, val1);
        theRecordWriter.write(null, nullWritable);
        theRecordWriter.write(null, val1);
        theRecordWriter.write(nullWritable, val2);
        theRecordWriter.write(key2, nullWritable);
        theRecordWriter.write(key1, null);
        theRecordWriter.write(null, null);
        theRecordWriter.write(key2, val2);
    } finally {
        theRecordWriter.close(reporter);
    }
    StringBuffer expectedOutput = new StringBuffer();
    expectedOutput.append(key1).append("\t").append(val1).append("\n");
    expectedOutput.append(val1).append("\n");
    expectedOutput.append(val2).append("\n");
    expectedOutput.append(key2).append("\n");
    expectedOutput.append(key1).append("\n");
    expectedOutput.append(key2).append("\t").append(val2).append("\n");
    DefaultCodec codec = new DefaultCodec();
    codec.setConf(job);
    Path expectedFile = new Path(workDir, file + codec.getDefaultExtension());
    final FileInputStream istream = new FileInputStream(expectedFile.toString());
    CompressionInputStream cistream = codec.createInputStream(istream);
    LineReader reader = new LineReader(cistream);
    String output = "";
    Text out = new Text();
    while (reader.readLine(out) > 0) {
        output += out;
        output += "\n";
    }
    reader.close();
    assertEquals(expectedOutput.toString(), output);
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec), Text (org.apache.hadoop.io.Text), NullWritable (org.apache.hadoop.io.NullWritable), FileInputStream (java.io.FileInputStream), FileSystem (org.apache.hadoop.fs.FileSystem), LineReader (org.apache.hadoop.util.LineReader), Test (org.junit.Test)
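
The test enables compression by setting the FileOutputFormat.COMPRESS property directly; in job-setup code the same effect is usually achieved through the old-API FileOutputFormat helpers. A short sketch of that configuration (the codec choice is just an example):

import org.apache.hadoop.io.compress.DefaultCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

public class CompressedOutputSetup {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Equivalent to setting the FileOutputFormat.COMPRESS property to "true"
        FileOutputFormat.setCompressOutput(job, true);

        // Codec used for the job's output files (DefaultCodec writes zlib/deflate data)
        FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);

        System.out.println("compress output? " + FileOutputFormat.getCompressOutput(job));
    }
}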

Example 5 with CompressionInputStream

Use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.

From the class TestLz4CompressorDecompressor, method testCompressorDecopressorLogicWithCompressionStreams.

// Test the compress/decompress round trip through the CompressionOutputStream/CompressionInputStream API.
@Test
public void testCompressorDecopressorLogicWithCompressionStreams() {
    DataOutputStream deflateOut = null;
    DataInputStream inflateIn = null;
    int BYTE_SIZE = 1024 * 100;
    byte[] bytes = generate(BYTE_SIZE);
    int bufferSize = 262144;
    int compressionOverhead = (bufferSize / 6) + 32;
    try {
        DataOutputBuffer compressedDataBuffer = new DataOutputBuffer();
        CompressionOutputStream deflateFilter = new BlockCompressorStream(compressedDataBuffer, new Lz4Compressor(bufferSize), bufferSize, compressionOverhead);
        deflateOut = new DataOutputStream(new BufferedOutputStream(deflateFilter));
        deflateOut.write(bytes, 0, bytes.length);
        deflateOut.flush();
        deflateFilter.finish();
        DataInputBuffer deCompressedDataBuffer = new DataInputBuffer();
        deCompressedDataBuffer.reset(compressedDataBuffer.getData(), 0, compressedDataBuffer.getLength());
        CompressionInputStream inflateFilter = new BlockDecompressorStream(deCompressedDataBuffer, new Lz4Decompressor(bufferSize), bufferSize);
        inflateIn = new DataInputStream(new BufferedInputStream(inflateFilter));
        byte[] result = new byte[BYTE_SIZE];
        // readFully guarantees the whole buffer is populated before comparing (read() may return fewer bytes)
        inflateIn.readFully(result);
        assertArrayEquals("original array does not equal compressed/decompressed array", result, bytes);
    } catch (IOException e) {
        fail("testLz4CompressorDecopressorLogicWithCompressionStreams ex error !!!");
    } finally {
        try {
            if (deflateOut != null)
                deflateOut.close();
            if (inflateIn != null)
                inflateIn.close();
        } catch (Exception e) {
        }
    }
}
Also used: CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), Lz4Compressor (org.apache.hadoop.io.compress.lz4.Lz4Compressor), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), DataOutputStream (java.io.DataOutputStream), BlockDecompressorStream (org.apache.hadoop.io.compress.BlockDecompressorStream), IOException (java.io.IOException), DataInputStream (java.io.DataInputStream), DataInputBuffer (org.apache.hadoop.io.DataInputBuffer), Lz4Decompressor (org.apache.hadoop.io.compress.lz4.Lz4Decompressor), BlockCompressorStream (org.apache.hadoop.io.compress.BlockCompressorStream), BufferedInputStream (java.io.BufferedInputStream), DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer), BufferedOutputStream (java.io.BufferedOutputStream), Test (org.junit.Test)
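
Wiring BlockCompressorStream and BlockDecompressorStream by hand, as this test does, mirrors what a codec's factory methods do internally. For ordinary application code the codec API is simpler; below is a minimal round-trip sketch using DefaultCodec rather than the Lz4 classes from the test, chosen here only because it needs no native libraries:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CodecRoundTrip {
    public static void main(String[] args) throws IOException {
        DefaultCodec codec = new DefaultCodec();
        codec.setConf(new Configuration());

        byte[] original = new byte[100 * 1024];
        new Random(0).nextBytes(original);

        // Compress into an in-memory buffer
        ByteArrayOutputStream compressed = new ByteArrayOutputStream();
        try (CompressionOutputStream out = codec.createOutputStream(compressed)) {
            out.write(original);
            out.finish();
        }

        // Decompress and verify the round trip
        byte[] restored = new byte[original.length];
        try (CompressionInputStream in =
                 codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()))) {
            int off = 0;
            int n;
            while (off < restored.length && (n = in.read(restored, off, restored.length - off)) > 0) {
                off += n;
            }
        }
        System.out.println("round trip ok? " + Arrays.equals(original, restored));
    }
}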

Aggregations

CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 20, Test (org.junit.Test): 13, CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream): 9, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 8, ByteArrayInputStream (java.io.ByteArrayInputStream): 7, Configuration (org.apache.hadoop.conf.Configuration): 7, Decompressor (org.apache.hadoop.io.compress.Decompressor): 7, CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 6, FileInputStream (java.io.FileInputStream): 5, IOException (java.io.IOException): 5, BufferedOutputStream (java.io.BufferedOutputStream): 4, ZStandardCodec (org.apache.hadoop.io.compress.ZStandardCodec): 4, BufferedInputStream (java.io.BufferedInputStream): 3, DataInputStream (java.io.DataInputStream): 3, DataOutputStream (java.io.DataOutputStream): 3, InputStream (java.io.InputStream): 3, FileSystem (org.apache.hadoop.fs.FileSystem): 3, DataInputBuffer (org.apache.hadoop.io.DataInputBuffer): 3, DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer): 3, Compressor (org.apache.hadoop.io.compress.Compressor): 3