Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache.
The class TestSequenceFileAppend, method testAppendRecordCompression.
@Test(timeout = 30000)
public void testAppendRecordCompression() throws Exception {
    GenericTestUtils.assumeInNativeProfile();
    Path file = new Path(ROOT_PATH, "testseqappendblockcompr.seq");
    fs.delete(file, true);
    Option compressOption = Writer.compression(CompressionType.RECORD, new GzipCodec());
    Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
        compressOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    verify2Values(file);

    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
        SequenceFile.Writer.appendIfExists(true), compressOption);
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    verifyAll4Values(file);
    fs.deleteOnExit(file);
}
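The helpers verify2Values and verifyAll4Values are defined elsewhere in the test class and are not shown in this excerpt. As a rough sketch of what such a read-back check could look like (an assumption, not the test's actual helper), the records can be read with SequenceFile.Reader; this relies on the test Configuration enabling org.apache.hadoop.io.serializer.JavaSerialization, which the Long/String key and value classes above require.

// Hypothetical helper, sketch only; not the body of verify2Values from the real test.
private void readBackFirstTwoRecords(Path file) throws IOException {
    // Reader.next(Object) / getCurrentValue(Object) go through the configured
    // serialization (assumed here to include JavaSerialization for Long and String).
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
    try {
        assertEquals(1L, reader.next((Object) null));
        assertEquals("one", reader.getCurrentValue((Object) null));
        assertEquals(2L, reader.next((Object) null));
        assertEquals("two", reader.getCurrentValue((Object) null));
        assertNull(reader.next((Object) null));   // end of file
    } finally {
        reader.close();
    }
}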
Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache.
The class TestSequenceFileAppend, method testAppendBlockCompression.
@Test(timeout = 30000)
public void testAppendBlockCompression() throws Exception {
    GenericTestUtils.assumeInNativeProfile();
    Path file = new Path(ROOT_PATH, "testseqappendblockcompr.seq");
    fs.delete(file, true);
    Option compressOption = Writer.compression(CompressionType.BLOCK, new GzipCodec());
    Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
        compressOption);
    writer.append(1L, "one");
    writer.append(2L, "two");
    writer.close();
    verify2Values(file);

    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
        SequenceFile.Writer.appendIfExists(true), compressOption);
    writer.append(3L, "three");
    writer.append(4L, "four");
    writer.close();
    verifyAll4Values(file);

    // Verify failure if the compression details are different or not provided
    try {
        writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
            SequenceFile.Writer.appendIfExists(true));
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException IAE) {
        // Expected exception. Ignore it
    }

    // Verify failure if the compression details are different
    try {
        Option wrongCompressOption = Writer.compression(CompressionType.RECORD, new GzipCodec());
        writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
            SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException IAE) {
        // Expected exception. Ignore it
    }

    try {
        Option wrongCompressOption = Writer.compression(CompressionType.BLOCK, new DefaultCodec());
        writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
            SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
            SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
        writer.close();
        fail("Expected IllegalArgumentException for compression options");
    } catch (IllegalArgumentException IAE) {
        // Expected exception. Ignore it
    }
    fs.deleteOnExit(file);
}
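The failure cases above show that appendIfExists(true) only succeeds when the compression options match the file being appended to. A minimal sketch, assuming a caller wanted to derive matching options from an existing file rather than hard-coding them (this is not something the test itself does), could query SequenceFile.Reader's metadata accessors:

// Hypothetical helper, not part of the test: derive a compression Option that
// matches an existing SequenceFile so a subsequent append is accepted.
private Option matchingCompression(Path existing) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(existing));
    try {
        CompressionType type = reader.isBlockCompressed() ? CompressionType.BLOCK
            : (reader.isCompressed() ? CompressionType.RECORD : CompressionType.NONE);
        CompressionCodec codec = reader.getCompressionCodec();   // null if uncompressed
        return (codec == null) ? Writer.compression(CompressionType.NONE)
                               : Writer.compression(type, codec);
    } finally {
        reader.close();
    }
}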
Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache.
The class TestCombineTextInputFormat, method testGzip.
/**
 * Test using the gzip codec for reading.
 */
@Test(timeout = 10000)
public void testGzip() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, conf);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
        "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
        "this is a test\nof gzip\n");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals("compressed splits == 1", 1, splits.size());
    List<Text> results = readSplit(format, splits.get(0), job);
    assertEquals("splits[0] length", 8, results.size());

    final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    final String[] secondList = { "this is a test", "of gzip" };
    String first = results.get(0).toString();
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
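The writeFile and readSplit helpers live elsewhere in the test class and are not shown here. A plausible sketch of a writeFile-style helper (an assumption about its shape, not the actual Hadoop code, with java.io.OutputStream and java.nio.charset.StandardCharsets assumed imported) wraps the target file's output stream with the codec so the text is stored gzip-compressed:

// Sketch only: write a string to a path, compressing with the given codec if one is supplied.
private static void writeCompressedFile(FileSystem fs, Path name, CompressionCodec codec,
    String contents) throws IOException {
    OutputStream stm = (codec == null)
        ? fs.create(name)
        : codec.createOutputStream(fs.create(name));
    try {
        stm.write(contents.getBytes(StandardCharsets.UTF_8));
    } finally {
        stm.close();
    }
}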
Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache.
The class TestConcatenatedCompressedInput, method testBuiltInGzipDecompressor.
/**
 * Test using the new BuiltInGzipDecompressor codec for reading gzip files.
 */
// NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
// due to the buggy version of zlib (1.2.1.2) included there.
@Test
public void testBuiltInGzipDecompressor() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);

    // Don't use native libs for this test
    ZlibFactory.setNativeZlibLoaded(false);
    assertEquals("[non-native (Java) codec]",
        org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class,
        gzip.getDecompressorType());
    System.out.println(COLOR_BR_YELLOW + "testBuiltInGzipDecompressor() using"
        + " non-native (Java Inflater) Decompressor (" + gzip.getDecompressorType() + ")"
        + COLOR_NORMAL);

    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.setInputPaths(jobConf, workDir);

    // here's the first pair of DecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2734, in1.available());
    // w/hdr CRC
    assertEquals("concat bytes available", 3413, in2.available());

    CompressionInputStream cin2 = gzip.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();
    int numBytes, totalBytes = 0, lineNum = 0;
    while ((numBytes = in.readLine(out)) > 0) {
        ++lineNum;
        totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file", 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file", 84, lineNum);

    ZlibFactory.loadNativeZLib();
    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME? could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    doMultipleGzipBufferSizes(jobConf, true);
}
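Outside of this test, input-format code usually resolves a gzip codec by file extension through CompressionCodecFactory rather than instantiating GzipCodec directly. A short sketch of that pattern follows; the path below is illustrative only and not taken from the test.

// Sketch: resolve the codec from the ".gz" extension and count uncompressed bytes.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
Path p = new Path("/tmp/testCompressThenConcat.txt.gz");                 // hypothetical location
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(p);  // GzipCodec for ".gz"
InputStream raw = fs.open(p);
LineReader lines = new LineReader(codec != null ? codec.createInputStream(raw) : raw);
Text line = new Text();
int numBytes, totalBytes = 0;
while ((numBytes = lines.readLine(line)) > 0) {
    totalBytes += numBytes;
}
lines.close();
System.out.println("uncompressed bytes read: " + totalBytes);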
Use of org.apache.hadoop.io.compress.GzipCodec in project hadoop by apache.
The class TestConcatenatedCompressedInput, method testPrototypeInflaterGzip.
/**
 * Test using the raw Inflater codec for reading gzip files.
 */
@Test
public void testPrototypeInflaterGzip() throws IOException {
    // used only for file extension
    CompressionCodec gzip = new GzipCodec();
    // localFs = FileSystem instance
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_BLUE + "testPrototypeInflaterGzip() using "
        + "non-native/Java Inflater and manual gzip header/trailer parsing" + COLOR_NORMAL);

    // copy prebuilt (correct!) version of concat.gz to HDFS
    final String fn = "concat" + gzip.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);

    final FileInputStream in = new FileInputStream(fnLocal.toString());
    assertEquals("concat bytes available", 148, in.available());

    // should wrap all of this header-reading stuff in a running-CRC wrapper
    // (did so in BuiltInGzipDecompressor; see below)
    byte[] compressedBuf = new byte[256];
    int numBytesRead = in.read(compressedBuf, 0, 10);
    assertEquals("header bytes read", 10, numBytesRead);
    assertEquals("1st byte", 0x1f, compressedBuf[0] & 0xff);
    assertEquals("2nd byte", 0x8b, compressedBuf[1] & 0xff);
    assertEquals("3rd byte (compression method)", 8, compressedBuf[2] & 0xff);

    byte flags = (byte) (compressedBuf[3] & 0xff);
    if ((flags & 0x04) != 0) {
        // FEXTRA
        numBytesRead = in.read(compressedBuf, 0, 2);
        assertEquals("XLEN bytes read", 2, numBytesRead);
        int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
        in.skip(xlen);
    }
    if ((flags & 0x08) != 0) {
        // FNAME
        while ((numBytesRead = in.read()) != 0) {
            assertFalse("unexpected end-of-file while reading filename", numBytesRead == -1);
        }
    }
    if ((flags & 0x10) != 0) {
        // FCOMMENT
        while ((numBytesRead = in.read()) != 0) {
            assertFalse("unexpected end-of-file while reading comment", numBytesRead == -1);
        }
    }
    if ((flags & 0xe0) != 0) {
        // reserved
        assertTrue("reserved bits are set??", (flags & 0xe0) == 0);
    }
    if ((flags & 0x02) != 0) {
        // FHCRC
        numBytesRead = in.read(compressedBuf, 0, 2);
        assertEquals("CRC16 bytes read", 2, numBytesRead);
        int crc16 = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
    }

    // ready to go! next bytes should be start of deflated stream, suitable
    // for Inflater
    numBytesRead = in.read(compressedBuf);
    // Inflater docs refer to a "dummy byte": no clue what that's about;
    // appears to work fine without one
    byte[] uncompressedBuf = new byte[256];
    Inflater inflater = new Inflater(true);
    inflater.setInput(compressedBuf, 0, numBytesRead);
    try {
        int numBytesUncompressed = inflater.inflate(uncompressedBuf);
        String outString = new String(uncompressedBuf, 0, numBytesUncompressed, "UTF-8");
        System.out.println("uncompressed data of first gzip member = [" + outString + "]");
    } catch (java.util.zip.DataFormatException ex) {
        throw new IOException(ex.getMessage());
    }
    in.close();
}
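For comparison with the manual header parsing above, java.util.zip.GZIPInputStream performs the same header and trailer handling internally (how it treats concatenated members has varied across JDK versions, which is part of why Hadoop ships BuiltInGzipDecompressor). A minimal sketch, with standard java.io, java.nio.charset, and java.util.zip imports assumed, and the file name assumed to follow the same convention as the test:

// Sketch: let GZIPInputStream do the gzip header/trailer parsing instead of doing it by hand.
String fn = "concat.gz";   // assumed name; "concat" plus GzipCodec's default extension
Path local = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
BufferedReader r = new BufferedReader(new InputStreamReader(
    new GZIPInputStream(new FileInputStream(local.toString())), StandardCharsets.UTF_8));
try {
    String line;
    while ((line = r.readLine()) != null) {
        System.out.println("line = [" + line + "]");
    }
} finally {
    r.close();
}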