use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class SequenceFileOutputFormat method getSequenceWriter.
protected SequenceFile.Writer getSequenceWriter(TaskAttemptContext context, Class<?> keyClass, Class<?> valueClass) throws IOException {
Configuration conf = context.getConfiguration();
CompressionCodec codec = null;
CompressionType compressionType = CompressionType.NONE;
if (getCompressOutput(context)) {
// find the kind of compression to do
compressionType = getOutputCompressionType(context);
// find the right codec
Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
}
// get the path of the temporary output file
Path file = getDefaultWorkFile(context, "");
FileSystem fs = file.getFileSystem(conf);
return SequenceFile.createWriter(fs, conf, file, keyClass, valueClass, compressionType, codec, context);
}
use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class TestShufflePlugin method testConsumerApi.
@Test
public /**
* A testing method verifying availability and accessibility of API that is needed
* for sub-classes of ShuffleConsumerPlugin
*/
void testConsumerApi() {
JobConf jobConf = new JobConf();
ShuffleConsumerPlugin<K, V> shuffleConsumerPlugin = new TestShuffleConsumerPlugin<K, V>();
//mock creation
ReduceTask mockReduceTask = mock(ReduceTask.class);
TaskUmbilicalProtocol mockUmbilical = mock(TaskUmbilicalProtocol.class);
Reporter mockReporter = mock(Reporter.class);
FileSystem mockFileSystem = mock(FileSystem.class);
Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass = jobConf.getCombinerClass();
// needed for mock with generic
@SuppressWarnings("unchecked") CombineOutputCollector<K, V> mockCombineOutputCollector = (CombineOutputCollector<K, V>) mock(CombineOutputCollector.class);
org.apache.hadoop.mapreduce.TaskAttemptID mockTaskAttemptID = mock(org.apache.hadoop.mapreduce.TaskAttemptID.class);
LocalDirAllocator mockLocalDirAllocator = mock(LocalDirAllocator.class);
CompressionCodec mockCompressionCodec = mock(CompressionCodec.class);
Counter mockCounter = mock(Counter.class);
TaskStatus mockTaskStatus = mock(TaskStatus.class);
Progress mockProgress = mock(Progress.class);
MapOutputFile mockMapOutputFile = mock(MapOutputFile.class);
Task mockTask = mock(Task.class);
try {
String[] dirs = jobConf.getLocalDirs();
// verify that these APIs are available through super class handler
ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(mockTaskAttemptID, jobConf, mockFileSystem, mockUmbilical, mockLocalDirAllocator, mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
shuffleConsumerPlugin.init(context);
shuffleConsumerPlugin.run();
shuffleConsumerPlugin.close();
} catch (Exception e) {
assertTrue("Threw exception:" + e, false);
}
// verify that these APIs are available for 3rd party plugins
mockReduceTask.getTaskID();
mockReduceTask.getJobID();
mockReduceTask.getNumMaps();
mockReduceTask.getPartition();
mockReporter.progress();
}
use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class TestShuffleScheduler method TestAggregatedTransferRate.
@SuppressWarnings("rawtypes")
@Test
public <K, V> void TestAggregatedTransferRate() throws Exception {
JobConf job = new JobConf();
job.setNumMapTasks(10);
//mock creation
TaskUmbilicalProtocol mockUmbilical = mock(TaskUmbilicalProtocol.class);
Reporter mockReporter = mock(Reporter.class);
FileSystem mockFileSystem = mock(FileSystem.class);
Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass = job.getCombinerClass();
// needed for mock with generic
@SuppressWarnings("unchecked") CombineOutputCollector<K, V> mockCombineOutputCollector = (CombineOutputCollector<K, V>) mock(CombineOutputCollector.class);
org.apache.hadoop.mapreduce.TaskAttemptID mockTaskAttemptID = mock(org.apache.hadoop.mapreduce.TaskAttemptID.class);
LocalDirAllocator mockLocalDirAllocator = mock(LocalDirAllocator.class);
CompressionCodec mockCompressionCodec = mock(CompressionCodec.class);
Counter mockCounter = mock(Counter.class);
TaskStatus mockTaskStatus = mock(TaskStatus.class);
Progress mockProgress = mock(Progress.class);
MapOutputFile mockMapOutputFile = mock(MapOutputFile.class);
Task mockTask = mock(Task.class);
@SuppressWarnings("unchecked") MapOutput<K, V> output = mock(MapOutput.class);
ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(mockTaskAttemptID, job, mockFileSystem, mockUmbilical, mockLocalDirAllocator, mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
TaskStatus status = new TaskStatus() {
@Override
public boolean getIsMap() {
return false;
}
@Override
public void addFetchFailedMap(TaskAttemptID mapTaskId) {
}
};
Progress progress = new Progress();
ShuffleSchedulerImpl<K, V> scheduler = new ShuffleSchedulerImpl<K, V>(job, status, null, null, progress, context.getShuffledMapsCounter(), context.getReduceShuffleBytes(), context.getFailedShuffleCounter());
TaskAttemptID attemptID0 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 0), 0);
//adding the 1st interval, 40MB from 60s to 100s
long bytes = (long) 40 * 1024 * 1024;
scheduler.copySucceeded(attemptID0, new MapHost(null, null), bytes, 60000, 100000, output);
Assert.assertEquals(copyMessage(1, 1, 1), progress.toString());
TaskAttemptID attemptID1 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 1), 1);
//adding the 2nd interval before the 1st interval, 50MB from 0s to 50s
bytes = (long) 50 * 1024 * 1024;
scheduler.copySucceeded(attemptID1, new MapHost(null, null), bytes, 0, 50000, output);
Assert.assertEquals(copyMessage(2, 1, 1), progress.toString());
TaskAttemptID attemptID2 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 2), 2);
//adding the 3rd interval overlapping with the 1st and the 2nd interval
//110MB from 25s to 80s
bytes = (long) 110 * 1024 * 1024;
scheduler.copySucceeded(attemptID2, new MapHost(null, null), bytes, 25000, 80000, output);
Assert.assertEquals(copyMessage(3, 2, 2), progress.toString());
TaskAttemptID attemptID3 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 3), 3);
//adding the 4th interval just after the 2nd interval, 100MB from 100s to 300s
bytes = (long) 100 * 1024 * 1024;
scheduler.copySucceeded(attemptID3, new MapHost(null, null), bytes, 100000, 300000, output);
Assert.assertEquals(copyMessage(4, 0.5, 1), progress.toString());
TaskAttemptID attemptID4 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 4), 4);
//adding the 5th interval between after 4th, 50MB from 350s to 400s
bytes = (long) 50 * 1024 * 1024;
scheduler.copySucceeded(attemptID4, new MapHost(null, null), bytes, 350000, 400000, output);
Assert.assertEquals(copyMessage(5, 1, 1), progress.toString());
TaskAttemptID attemptID5 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 5), 5);
//adding the 6th interval between after 5th, 50MB from 450s to 500s
bytes = (long) 50 * 1024 * 1024;
scheduler.copySucceeded(attemptID5, new MapHost(null, null), bytes, 450000, 500000, output);
Assert.assertEquals(copyMessage(6, 1, 1), progress.toString());
TaskAttemptID attemptID6 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 6), 6);
//adding the 7th interval between after 5th and 6th interval, 20MB from 320s to 340s
bytes = (long) 20 * 1024 * 1024;
scheduler.copySucceeded(attemptID6, new MapHost(null, null), bytes, 320000, 340000, output);
Assert.assertEquals(copyMessage(7, 1, 1), progress.toString());
TaskAttemptID attemptID7 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 7), 7);
//adding the 8th interval overlapping with 4th, 5th, and 7th 30MB from 290s to 350s
bytes = (long) 30 * 1024 * 1024;
scheduler.copySucceeded(attemptID7, new MapHost(null, null), bytes, 290000, 350000, output);
Assert.assertEquals(copyMessage(8, 0.5, 1), progress.toString());
TaskAttemptID attemptID8 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 8), 8);
//adding the 9th interval overlapping with 5th and 6th, 50MB from 400s to 450s
bytes = (long) 50 * 1024 * 1024;
scheduler.copySucceeded(attemptID8, new MapHost(null, null), bytes, 400000, 450000, output);
Assert.assertEquals(copyMessage(9, 1, 1), progress.toString());
TaskAttemptID attemptID9 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 9), 9);
//adding the 10th interval overlapping with all intervals, 500MB from 0s to 500s
bytes = (long) 500 * 1024 * 1024;
scheduler.copySucceeded(attemptID9, new MapHost(null, null), bytes, 0, 500000, output);
Assert.assertEquals(copyMessage(10, 1, 2), progress.toString());
}
use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class TestCombineTextInputFormat method testGzip.
/**
* Test using the gzip codec for reading
*/
@Test(timeout = 10000)
public void testGzip() throws IOException {
JobConf job = new JobConf(defaultConf);
CompressionCodec gzip = new GzipCodec();
ReflectionUtils.setConf(gzip, job);
localFs.delete(workDir, true);
writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
FileInputFormat.setInputPaths(job, workDir);
CombineTextInputFormat format = new CombineTextInputFormat();
InputSplit[] splits = format.getSplits(job, 100);
assertEquals("compressed splits == 1", 1, splits.length);
List<Text> results = readSplit(format, splits[0], job);
assertEquals("splits[0] length", 8, results.size());
final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
final String[] secondList = { "this is a test", "of gzip" };
String first = results.get(0).toString();
if (first.equals(firstList[0])) {
testResults(results, firstList, secondList);
} else if (first.equals(secondList[0])) {
testResults(results, secondList, firstList);
} else {
fail("unexpected first token!");
}
}
use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.
the class TestConcatenatedCompressedInput method testBzip2.
/**
* Test using the bzip2 codec for reading
*/
@Test
public void testBzip2() throws IOException {
JobConf jobConf = new JobConf(defaultConf);
CompressionCodec bzip2 = new BZip2Codec();
ReflectionUtils.setConf(bzip2, jobConf);
localFs.delete(workDir, true);
System.out.println(COLOR_BR_CYAN + "testBzip2() using non-native CBZip2InputStream (presumably)" + COLOR_NORMAL);
// copy prebuilt (correct!) version of concat.bz2 to HDFS
final String fn = "concat" + bzip2.getDefaultExtension();
Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
Path fnHDFS = new Path(workDir, fn);
localFs.copyFromLocalFile(fnLocal, fnHDFS);
writeFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n");
FileInputFormat.setInputPaths(jobConf, workDir);
// extends FileInputFormat
TextInputFormat format = new TextInputFormat();
format.configure(jobConf);
// work around 2-byte splits issue
format.setMinSplitSize(256);
// [135 splits for a 208-byte file and a 62-byte file(!)]
InputSplit[] splits = format.getSplits(jobConf, 100);
assertEquals("compressed splits == 2", 2, splits.length);
FileSplit tmp = (FileSplit) splits[0];
if (tmp.getPath().getName().equals("part2.txt.bz2")) {
splits[0] = splits[1];
splits[1] = tmp;
}
List<Text> results = readSplit(format, splits[0], jobConf);
assertEquals("splits[0] num lines", 6, results.size());
assertEquals("splits[0][5]", "member #3", results.get(5).toString());
results = readSplit(format, splits[1], jobConf);
assertEquals("splits[1] num lines", 2, results.size());
assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
assertEquals("splits[1][1]", "of bzip2", results.get(1).toString());
}
Aggregations