Example 1 with GzipCodec

Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache, from the class TestSequenceFileAppend, method testAppendRecordCompression.

@Test(timeout = 30000)
public void testAppendRecordCompression() throws Exception {
    GenericTestUtils.assumeInNativeProfile();
    Path file = new Path(ROOT_PATH, "testseqappendblockcompr.seq");
    fs.delete(file, true);
    Option compressOption = Writer.compression(CompressionType.RECORD, new GzipCodec());
    Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Long.class),
            SequenceFile.Writer.valueClass(String.class),
            compressOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    verify2Values(file);
    writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Long.class),
            SequenceFile.Writer.valueClass(String.class),
            SequenceFile.Writer.appendIfExists(true),
            compressOption);
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    verifyAll4Values(file);
    fs.deleteOnExit(file);
}
Also used: Path (org.apache.hadoop.fs.Path), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), Option (org.apache.hadoop.io.SequenceFile.Writer.Option), Writer (org.apache.hadoop.io.SequenceFile.Writer), Test (org.junit.Test)
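
For reference, reading the appended file back is short; the test's verify2Values and verifyAll4Values helpers presumably do something similar. A minimal sketch, assuming the test setup (not shown above) registers JavaSerialization so that plain Long/String keys and values round-trip:

SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
try {
    // With non-Writable serializations, the Object-based API returns the
    // deserialized key; null signals end of file.
    Object key = reader.next((Object) null);
    while (key != null) {
        Object value = reader.getCurrentValue((Object) null);
        System.out.println(key + " => " + value);
        key = reader.next(key);
    }
} finally {
    reader.close();
}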

Example 2 with GzipCodec

Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache, from the class TestSequenceFileAppend, method testAppendBlockCompression.

@Test(timeout = 30000)
public void testAppendBlockCompression() throws Exception {
    GenericTestUtils.assumeInNativeProfile();
    Path file = new Path(ROOT_PATH, "testseqappendblockcompr.seq");
    fs.delete(file, true);
    Option compressOption = Writer.compression(CompressionType.BLOCK, new GzipCodec());
    Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Long.class),
            SequenceFile.Writer.valueClass(String.class),
            compressOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    verify2Values(file);
    writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Long.class),
            SequenceFile.Writer.valueClass(String.class),
            SequenceFile.Writer.appendIfExists(true),
            compressOption);
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    verifyAll4Values(file);
    // Verify failure if the compression details are not provided
    try {
        writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(Long.class),
                SequenceFile.Writer.valueClass(String.class),
                SequenceFile.Writer.appendIfExists(true));
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException iae) {
        // Expected exception. Ignore it
    }
    // Verify failure if the compression details are different
    try {
        Option wrongCompressOption =
                Writer.compression(CompressionType.RECORD, new GzipCodec());
        writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(Long.class),
                SequenceFile.Writer.valueClass(String.class),
                SequenceFile.Writer.appendIfExists(true),
                wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException iae) {
        // Expected exception. Ignore it
    }
    try {
        Option wrongCompressOption =
                Writer.compression(CompressionType.BLOCK, new DefaultCodec());
        writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(file),
                SequenceFile.Writer.keyClass(Long.class),
                SequenceFile.Writer.valueClass(String.class),
                SequenceFile.Writer.appendIfExists(true),
                wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException iae) {
        // Expected exception. Ignore it
    }
    fs.deleteOnExit(file);
}
Also used: Path (org.apache.hadoop.fs.Path), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec), Option (org.apache.hadoop.io.SequenceFile.Writer.Option), Writer (org.apache.hadoop.io.SequenceFile.Writer), Test (org.junit.Test)
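
Because appendIfExists insists that the compression settings match the existing file, a caller that does not know them up front can recover both pieces from the file header before opening the writer. A minimal sketch, reusing conf and file from the test above:

SequenceFile.Reader reader =
        new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
CompressionType type = reader.getCompressionType();    // NONE, RECORD, or BLOCK
CompressionCodec codec = reader.getCompressionCodec(); // e.g. GzipCodec
reader.close();
// rebuild a matching option for the append-mode writer
Option matching = Writer.compression(type, codec);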

Example 3 with GzipCodec

Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache, from the class TestCombineTextInputFormat, method testGzip.

/**
 * Test using the gzip codec for reading.
 */
@Test(timeout = 10000)
public void testGzip() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, conf);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals("compressed splits == 1", 1, splits.size());
    List<Text> results = readSplit(format, splits.get(0), job);
    assertEquals("splits[0] length", 8, results.size());
    final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    final String[] secondList = { "this is a test", "of gzip" };
    String first = results.get(0).toString();
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), Text (org.apache.hadoop.io.Text), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
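
The writeFile helper belongs to the test class and is not shown above; it presumably looks something like this sketch, which routes the contents through the codec's compressed output stream when a codec is supplied:

private static void writeFile(FileSystem fs, Path name, CompressionCodec codec,
        String contents) throws IOException {
    // fs.create() gives the raw stream; the codec wraps it when compression is wanted
    OutputStream stm = (codec == null)
            ? fs.create(name)
            : codec.createOutputStream(fs.create(name));
    stm.write(contents.getBytes("UTF-8"));
    stm.close();
}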

Example 4 with GzipCodec

Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache, from the class TestConcatenatedCompressedInput, method testBuiltInGzipDecompressor.

/**
 * Test using the new BuiltInGzipDecompressor codec for reading gzip files.
 */
// NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
//       due to the buggy version of zlib (1.2.1.2) included there.
@Test
public void testBuiltInGzipDecompressor() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);
    // Don't use native libs for this test
    ZlibFactory.setNativeZlibLoaded(false);
    assertEquals("[non-native (Java) codec]", org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class, gzip.getDecompressorType());
    System.out.println(COLOR_BR_YELLOW + "testBuiltInGzipDecompressor() using" + " non-native (Java Inflater) Decompressor (" + gzip.getDecompressorType() + ")" + COLOR_NORMAL);
    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.setInputPaths(jobConf, workDir);
    // here's first pair of DecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2734, in1.available());
    // w/hdr CRC
    assertEquals("concat bytes available", 3413, in2.available());
    CompressionInputStream cin2 = gzip.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();
    int numBytes, totalBytes = 0, lineNum = 0;
    while ((numBytes = in.readLine(out)) > 0) {
        ++lineNum;
        totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file", 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file", 84, lineNum);
    ZlibFactory.loadNativeZLib();
    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME?  could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    doMultipleGzipBufferSizes(jobConf, true);
}
Also used: Path (org.apache.hadoop.fs.Path), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), Text (org.apache.hadoop.io.Text), FileInputStream (java.io.FileInputStream), LineReader (org.apache.hadoop.util.LineReader), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), Test (org.junit.Test)
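
The heart of this test is the native/Java switch: ZlibFactory.setNativeZlibLoaded(false) forces the pure-Java decompressor, and ZlibFactory.loadNativeZLib() restores the native one where available. A minimal sketch isolating just that switch, using only calls that appear in the test above:

Configuration conf = new Configuration();
GzipCodec gzip = new GzipCodec();
ReflectionUtils.setConf(gzip, conf);

ZlibFactory.setNativeZlibLoaded(false);
// pure-Java path: BuiltInGzipDecompressor
System.out.println(gzip.getDecompressorType());

ZlibFactory.loadNativeZLib();
// native path, when native zlib can be loaded
System.out.println(gzip.getDecompressorType());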

Example 5 with GzipCodec

Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache, from the class TestConcatenatedCompressedInput, method testPrototypeInflaterGzip.

/**
 * Test using the raw Inflater codec for reading gzip files.
 */
@Test
public void testPrototypeInflaterGzip() throws IOException {
    // used only for file extension
    CompressionCodec gzip = new GzipCodec();
    // localFs = FileSystem instance
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_BLUE + "testPrototypeInflaterGzip() using "
            + "non-native/Java Inflater and manual gzip header/trailer parsing"
            + COLOR_NORMAL);
    // copy prebuilt (correct!) version of concat.gz to HDFS
    final String fn = "concat" + gzip.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);
    final FileInputStream in = new FileInputStream(fnLocal.toString());
    assertEquals("concat bytes available", 148, in.available());
    // should wrap all of this header-reading stuff in a running-CRC wrapper
    // (did so in BuiltInGzipDecompressor; see below)
    byte[] compressedBuf = new byte[256];
    int numBytesRead = in.read(compressedBuf, 0, 10);
    assertEquals("header bytes read", 10, numBytesRead);
    assertEquals("1st byte", 0x1f, compressedBuf[0] & 0xff);
    assertEquals("2nd byte", 0x8b, compressedBuf[1] & 0xff);
    assertEquals("3rd byte (compression method)", 8, compressedBuf[2] & 0xff);
    byte flags = (byte) (compressedBuf[3] & 0xff);
    if ((flags & 0x04) != 0) {
        // FEXTRA
        numBytesRead = in.read(compressedBuf, 0, 2);
        assertEquals("XLEN bytes read", 2, numBytesRead);
        int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
        in.skip(xlen);
    }
    if ((flags & 0x08) != 0) {
        // FNAME
        while ((numBytesRead = in.read()) != 0) {
            assertFalse("unexpected end-of-file while reading filename", numBytesRead == -1);
        }
    }
    if ((flags & 0x10) != 0) {
        // FCOMMENT
        while ((numBytesRead = in.read()) != 0) {
            assertFalse("unexpected end-of-file while reading comment", numBytesRead == -1);
        }
    }
    if ((flags & 0xe0) != 0) {
        // reserved
        assertTrue("reserved bits are set??", (flags & 0xe0) == 0);
    }
    if ((flags & 0x02) != 0) {
        // FHCRC
        numBytesRead = in.read(compressedBuf, 0, 2);
        assertEquals("CRC16 bytes read", 2, numBytesRead);
        int crc16 = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
    }
    // ready to go!  next bytes should be start of deflated stream, suitable
    // for Inflater
    numBytesRead = in.read(compressedBuf);
    // Inflater docs refer to a "dummy byte":  no clue what that's about;
    // appears to work fine without one
    byte[] uncompressedBuf = new byte[256];
    Inflater inflater = new Inflater(true);
    inflater.setInput(compressedBuf, 0, numBytesRead);
    try {
        int numBytesUncompressed = inflater.inflate(uncompressedBuf);
        String outString = new String(uncompressedBuf, 0, numBytesUncompressed, "UTF-8");
        System.out.println("uncompressed data of first gzip member = [" + outString + "]");
    } catch (java.util.zip.DataFormatException ex) {
        // preserve the underlying cause instead of only its message
        throw new IOException(ex);
    }
    in.close();
}
Also used: Path (org.apache.hadoop.fs.Path), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), IOException (java.io.IOException), FileInputStream (java.io.FileInputStream), Inflater (java.util.zip.Inflater), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), Test (org.junit.Test)
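
For contrast, java.util.zip.GZIPInputStream in modern JDKs steps across concatenated gzip members on its own, so the manual header walk above is only needed when that level of control is wanted. A minimal sketch, reusing fnLocal from the test:

GZIPInputStream gzIn =
        new GZIPInputStream(new FileInputStream(fnLocal.toString()));
byte[] buf = new byte[4096];
int n, total = 0;
while ((n = gzIn.read(buf)) > 0) {
    // counts uncompressed bytes across all members
    total += n;
}
gzIn.close();
System.out.println("total uncompressed bytes = " + total);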

Aggregations

GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 15
Test (org.junit.Test): 13
Path (org.apache.hadoop.fs.Path): 12
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 7
Text (org.apache.hadoop.io.Text): 6
Writer (org.apache.hadoop.io.SequenceFile.Writer): 5
Option (org.apache.hadoop.io.SequenceFile.Writer.Option): 5
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 5
Configuration (org.apache.hadoop.conf.Configuration): 4
FileInputStream (java.io.FileInputStream): 3
FileSystem (org.apache.hadoop.fs.FileSystem): 2
LocalFileSystem (org.apache.hadoop.fs.LocalFileSystem): 2
KeyValueCodec (org.apache.hadoop.hbase.codec.KeyValueCodec): 2
File (java.io.File): 1
FileOutputStream (java.io.FileOutputStream): 1
IOException (java.io.IOException): 1
Inflater (java.util.zip.Inflater): 1
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 1
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1
Reader (org.apache.hadoop.io.SequenceFile.Reader): 1