Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
From the class CompressionEmulationUtil, the method getPossiblyCompressedOutputStream.
/**
 * Returns an {@link OutputStream} for a file that might need
* compression.
*/
static OutputStream getPossiblyCompressedOutputStream(Path file, Configuration conf)
    throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  JobConf jConf = new JobConf(conf);
  if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
    // get the codec class
    Class<? extends CompressionCodec> codecClass =
        org.apache.hadoop.mapred.FileOutputFormat.getOutputCompressorClass(jConf, GzipCodec.class);
    // get the codec implementation
    CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
    // add the appropriate extension
    file = file.suffix(codec.getDefaultExtension());
    if (isCompressionEmulationEnabled(conf)) {
      FSDataOutputStream fileOut = fs.create(file, false);
      return new DataOutputStream(codec.createOutputStream(fileOut));
    }
  }
  return fs.create(file, false);
}
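For context, a minimal caller sketch might look like the following. It is not part of the Hadoop source: it assumes code living inside the org.apache.hadoop.mapred.gridmix package (the helper is package-private) and uses a purely illustrative output path.

// Hypothetical caller, assuming package-private access to CompressionEmulationUtil;
// compressed output is requested via the old mapred FileOutputFormat settings
// that the helper consults.
JobConf conf = new JobConf();
org.apache.hadoop.mapred.FileOutputFormat.setCompressOutput(conf, true);
org.apache.hadoop.mapred.FileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);

Path target = new Path("/tmp/gridmix-data/part-0");  // illustrative; the helper appends ".gz" itself
try (OutputStream out =
         CompressionEmulationUtil.getPossiblyCompressedOutputStream(target, conf)) {
  out.write("synthetic gridmix record\n".getBytes(java.nio.charset.StandardCharsets.UTF_8));
}

Note that when compressed output is requested but compression emulation is disabled, the method still appends the codec extension yet returns a plain, uncompressed stream, so callers should not rely on the suffix alone.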
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
From the class CompressionEmulationUtil, the method publishCompressedDataStatistics.
/** Publishes compression-related data statistics. The following statistics are
 * published:
* <ul>
* <li>Total compressed input data size</li>
* <li>Number of compressed input data files</li>
* <li>Compression Ratio</li>
* <li>Text data dictionary size</li>
* <li>Random text word size</li>
* </ul>
*/
static DataStatistics publishCompressedDataStatistics(Path inputDir, Configuration conf,
    long uncompressedDataSize) throws IOException {
  FileSystem fs = inputDir.getFileSystem(conf);
  CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
  // iterate over compressed files and sum up the compressed file sizes
  long compressedDataSize = 0;
  int numCompressedFiles = 0;
  // obtain input data file statuses
  FileStatus[] outFileStatuses =
      fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
  for (FileStatus status : outFileStatuses) {
    // check if the input file is compressed
    if (compressionCodecs != null) {
      CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
      if (codec != null) {
        ++numCompressedFiles;
        compressedDataSize += status.getLen();
      }
    }
  }
  LOG.info("Gridmix is configured to use compressed input data.");
  // publish the input data size
  LOG.info("Total size of compressed input data : "
      + StringUtils.humanReadableInt(compressedDataSize));
  LOG.info("Total number of compressed input data files : " + numCompressedFiles);
  if (numCompressedFiles == 0) {
    throw new RuntimeException("No compressed file found in the input"
        + " directory : " + inputDir.toString() + ". To enable compression"
        + " emulation, run Gridmix either with "
        + " an input directory containing compressed input file(s) or"
        + " use the -generate option to (re)generate it. If compression"
        + " emulation is not desired, disable it by setting '"
        + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
  }
  // publish the compression ratio only if the data was generated in this gridmix run
  if (uncompressedDataSize > 0) {
    // compute the compression ratio
    double ratio = ((double) compressedDataSize) / uncompressedDataSize;
    // publish the compression ratio
    LOG.info("Input Data Compression Ratio : " + ratio);
  }
  return new DataStatistics(compressedDataSize, numCompressedFiles, true);
}
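The compressed-file detection above relies on CompressionCodecFactory's suffix-based lookup: getCodec() matches a registered codec against the file-name extension and returns null otherwise. A small standalone sketch (illustrative paths only) shows the behaviour:

// Illustrative only: suffix-based codec lookup, as used in the loop above.
Configuration conf = new Configuration();
CompressionCodecFactory factory = new CompressionCodecFactory(conf);

CompressionCodec gz = factory.getCodec(new Path("/data/part-00000.gz"));      // GzipCodec
CompressionCodec plain = factory.getCodec(new Path("/data/part-00001.txt"));  // null -> not compressed

System.out.println(gz == null ? "uncompressed" : gz.getClass().getSimpleName());
System.out.println(plain == null ? "uncompressed" : plain.getClass().getSimpleName());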
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
From the class TestShuffleScheduler, the method TestSucceedAndFailedCopyMap.
@SuppressWarnings("rawtypes")
@Test
public <K, V> void TestSucceedAndFailedCopyMap() throws Exception {
  JobConf job = new JobConf();
  job.setNumMapTasks(2);
  // mock creation
  TaskUmbilicalProtocol mockUmbilical = mock(TaskUmbilicalProtocol.class);
  Reporter mockReporter = mock(Reporter.class);
  FileSystem mockFileSystem = mock(FileSystem.class);
  Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass = job.getCombinerClass();
  // needed for mock with generic
  @SuppressWarnings("unchecked")
  CombineOutputCollector<K, V> mockCombineOutputCollector =
      (CombineOutputCollector<K, V>) mock(CombineOutputCollector.class);
  org.apache.hadoop.mapreduce.TaskAttemptID mockTaskAttemptID =
      mock(org.apache.hadoop.mapreduce.TaskAttemptID.class);
  LocalDirAllocator mockLocalDirAllocator = mock(LocalDirAllocator.class);
  CompressionCodec mockCompressionCodec = mock(CompressionCodec.class);
  Counter mockCounter = mock(Counter.class);
  TaskStatus mockTaskStatus = mock(TaskStatus.class);
  Progress mockProgress = mock(Progress.class);
  MapOutputFile mockMapOutputFile = mock(MapOutputFile.class);
  Task mockTask = mock(Task.class);
  @SuppressWarnings("unchecked")
  MapOutput<K, V> output = mock(MapOutput.class);
  ShuffleConsumerPlugin.Context<K, V> context =
      new ShuffleConsumerPlugin.Context<K, V>(mockTaskAttemptID, job, mockFileSystem,
          mockUmbilical, mockLocalDirAllocator, mockReporter, mockCompressionCodec,
          combinerClass, mockCombineOutputCollector, mockCounter, mockCounter,
          mockCounter, mockCounter, mockCounter, mockCounter, mockTaskStatus,
          mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
  TaskStatus status = new TaskStatus() {

    @Override
    public boolean getIsMap() {
      return false;
    }

    @Override
    public void addFetchFailedMap(TaskAttemptID mapTaskId) {
    }
  };
  Progress progress = new Progress();
  ShuffleSchedulerImpl<K, V> scheduler = new ShuffleSchedulerImpl<K, V>(job, status,
      null, null, progress, context.getShuffledMapsCounter(),
      context.getReduceShuffleBytes(), context.getFailedShuffleCounter());
  MapHost host1 = new MapHost("host1", null);
  TaskAttemptID failedAttemptID = new TaskAttemptID(
      new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 0), 0);
  TaskAttemptID succeedAttemptID = new TaskAttemptID(
      new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 1), 1);
  // handle output fetch failure for failedAttemptID, part I
  scheduler.hostFailed(host1.getHostName());
  // handle output fetch success for succeedAttemptID
  long bytes = (long) 500 * 1024 * 1024;
  scheduler.copySucceeded(succeedAttemptID, host1, bytes, 0, 500000, output);
  // handle output fetch failure for failedAttemptID, part II
  // for MAPREDUCE-6361: verify that no NPE is thrown
  scheduler.copyFailed(failedAttemptID, host1, true, false);
}
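The test passes a bare mock(CompressionCodec.class) into the shuffle context; the codec is only plumbing here, since the scheduler under test never decompresses anything. If a similar test did need the mock to answer codec calls, Mockito stubbing could be layered on top, as in this hypothetical fragment (static imports of org.mockito.Mockito.* and org.junit.Assert.* assumed):

// Hypothetical: give the mocked codec just enough behaviour for code paths
// that ask for the default extension, then verify the interaction.
CompressionCodec mockCodec = mock(CompressionCodec.class);
when(mockCodec.getDefaultExtension()).thenReturn(".gz");

assertEquals(".gz", mockCodec.getDefaultExtension());
verify(mockCodec).getDefaultExtension();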
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
From the class TestConcatenatedCompressedInput, the method testBuiltInGzipDecompressor.
/**
* Test using the new BuiltInGzipDecompressor codec for reading gzip files.
*/
// NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
// due to the buggy version of zlib (1.2.1.2) it includes.
@Test
public void testBuiltInGzipDecompressor() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, jobConf);
  localFs.delete(workDir, true);
  // Don't use native libs for this test
  ZlibFactory.setNativeZlibLoaded(false);
  assertEquals("[non-native (Java) codec]",
      org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class,
      gzip.getDecompressorType());
  System.out.println(COLOR_BR_YELLOW + "testBuiltInGzipDecompressor() using"
      + " non-native (Java Inflater) Decompressor (" + gzip.getDecompressorType()
      + ")" + COLOR_NORMAL);
  // copy single-member test file to HDFS
  String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
  Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
  Path fnHDFS1 = new Path(workDir, fn1);
  localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
  // copy multiple-member test file to HDFS
  // (actually in "seekable gzip" format, a la JIRA PIG-42)
  String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
  Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
  Path fnHDFS2 = new Path(workDir, fn2);
  localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
  FileInputFormat.setInputPaths(jobConf, workDir);
  // here's first pair of DecompressorStreams:
  final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
  final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
  assertEquals("concat bytes available", 2734, in1.available());
  // w/hdr CRC
  assertEquals("concat bytes available", 3413, in2.available());
  CompressionInputStream cin2 = gzip.createInputStream(in2);
  LineReader in = new LineReader(cin2);
  Text out = new Text();
  int numBytes, totalBytes = 0, lineNum = 0;
  while ((numBytes = in.readLine(out)) > 0) {
    ++lineNum;
    totalBytes += numBytes;
  }
  in.close();
  assertEquals("total uncompressed bytes in concatenated test file", 5346, totalBytes);
  assertEquals("total uncompressed lines in concatenated test file", 84, lineNum);
  ZlibFactory.loadNativeZLib();
  // test GzipZlibDecompressor (native), just to be sure
  // (FIXME? could move this call to testGzip(), but would need filename
  // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
  doMultipleGzipBufferSizes(jobConf, true);
}
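Outside the test harness, the same read path can be exercised in a few lines. This sketch is illustrative only: it assumes a local gzip file at a made-up path and that forcing the pure-Java decompressor is acceptable for the whole JVM.

// Minimal sketch: force BuiltInGzipDecompressor (pure Java), then read a gzip
// file line by line through the codec, as the test above does.
Configuration conf = new Configuration();
ZlibFactory.setNativeZlibLoaded(false);
CompressionCodec gzip = ReflectionUtils.newInstance(GzipCodec.class, conf);

try (InputStream raw = new FileInputStream("/tmp/sample.txt.gz");   // illustrative path
     CompressionInputStream cin = gzip.createInputStream(raw)) {
  LineReader reader = new LineReader(cin);
  Text line = new Text();
  int lines = 0;
  while (reader.readLine(line) > 0) {
    lines++;
  }
  System.out.println("lines read = " + lines);
}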
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
From the class TestConcatenatedCompressedInput, the method testPrototypeInflaterGzip.
/**
* Test using the raw Inflater codec for reading gzip files.
*/
@Test
public void testPrototypeInflaterGzip() throws IOException {
  // used only for file extension
  CompressionCodec gzip = new GzipCodec();
  // localFs = FileSystem instance
  localFs.delete(workDir, true);
  System.out.println(COLOR_BR_BLUE + "testPrototypeInflaterGzip() using "
      + "non-native/Java Inflater and manual gzip header/trailer parsing" + COLOR_NORMAL);
  // copy prebuilt (correct!) version of concat.gz to HDFS
  final String fn = "concat" + gzip.getDefaultExtension();
  Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
  Path fnHDFS = new Path(workDir, fn);
  localFs.copyFromLocalFile(fnLocal, fnHDFS);
  final FileInputStream in = new FileInputStream(fnLocal.toString());
  assertEquals("concat bytes available", 148, in.available());
  // should wrap all of this header-reading stuff in a running-CRC wrapper
  // (did so in BuiltInGzipDecompressor; see below)
  byte[] compressedBuf = new byte[256];
  int numBytesRead = in.read(compressedBuf, 0, 10);
  assertEquals("header bytes read", 10, numBytesRead);
  assertEquals("1st byte", 0x1f, compressedBuf[0] & 0xff);
  assertEquals("2nd byte", 0x8b, compressedBuf[1] & 0xff);
  assertEquals("3rd byte (compression method)", 8, compressedBuf[2] & 0xff);
  byte flags = (byte) (compressedBuf[3] & 0xff);
  if ((flags & 0x04) != 0) {
    // FEXTRA
    numBytesRead = in.read(compressedBuf, 0, 2);
    assertEquals("XLEN bytes read", 2, numBytesRead);
    int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
    in.skip(xlen);
  }
  if ((flags & 0x08) != 0) {
    // FNAME
    while ((numBytesRead = in.read()) != 0) {
      assertFalse("unexpected end-of-file while reading filename", numBytesRead == -1);
    }
  }
  if ((flags & 0x10) != 0) {
    // FCOMMENT
    while ((numBytesRead = in.read()) != 0) {
      assertFalse("unexpected end-of-file while reading comment", numBytesRead == -1);
    }
  }
  if ((flags & 0xe0) != 0) {
    // reserved
    assertTrue("reserved bits are set??", (flags & 0xe0) == 0);
  }
  if ((flags & 0x02) != 0) {
    // FHCRC
    numBytesRead = in.read(compressedBuf, 0, 2);
    assertEquals("CRC16 bytes read", 2, numBytesRead);
    int crc16 = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
  }
  // ready to go! next bytes should be start of deflated stream, suitable
  // for Inflater
  numBytesRead = in.read(compressedBuf);
  // Inflater docs refer to a "dummy byte": no clue what that's about;
  // appears to work fine without one
  byte[] uncompressedBuf = new byte[256];
  Inflater inflater = new Inflater(true);
  inflater.setInput(compressedBuf, 0, numBytesRead);
  try {
    int numBytesUncompressed = inflater.inflate(uncompressedBuf);
    String outString = new String(uncompressedBuf, 0, numBytesUncompressed, "UTF-8");
    System.out.println("uncompressed data of first gzip member = [" + outString + "]");
  } catch (java.util.zip.DataFormatException ex) {
    throw new IOException(ex.getMessage());
  }
  in.close();
}
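For comparison, the JDK's GZIPInputStream performs the header and trailer parsing internally that the test does by hand. A hypothetical read of a gzip file (illustrative path; imports of java.util.zip.GZIPInputStream and java.io.ByteArrayOutputStream assumed) might look like this:

// Hypothetical comparison: let GZIPInputStream handle the gzip header fields
// (FEXTRA, FNAME, FCOMMENT, FHCRC) that the test parses manually above.
try (InputStream fileIn = new FileInputStream("/tmp/concat.gz");   // illustrative path
     GZIPInputStream gzIn = new GZIPInputStream(fileIn)) {
  ByteArrayOutputStream bytes = new ByteArrayOutputStream();
  byte[] buf = new byte[256];
  int n;
  while ((n = gzIn.read(buf)) > 0) {
    bytes.write(buf, 0, n);
  }
  System.out.println("uncompressed data = ["
      + new String(bytes.toByteArray(), java.nio.charset.StandardCharsets.UTF_8) + "]");
}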