
Example 71 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class SequenceFileOutputFormat, method getSequenceWriter.

protected SequenceFile.Writer getSequenceWriter(TaskAttemptContext context, Class<?> keyClass, Class<?> valueClass) throws IOException {
    Configuration conf = context.getConfiguration();
    CompressionCodec codec = null;
    CompressionType compressionType = CompressionType.NONE;
    if (getCompressOutput(context)) {
        // find the kind of compression to do
        compressionType = getOutputCompressionType(context);
        // find the right codec
        Class<?> codecClass = getOutputCompressorClass(context, DefaultCodec.class);
        codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
    }
    // get the path of the temporary output file 
    Path file = getDefaultWorkFile(context, "");
    FileSystem fs = file.getFileSystem(conf);
    return SequenceFile.createWriter(fs, conf, file, keyClass, valueClass, compressionType, codec, context);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) CompressionType(org.apache.hadoop.io.SequenceFile.CompressionType)
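
For context, the compression settings that getSequenceWriter reads off the configuration are normally set by the job driver. A minimal sketch, assuming the mapreduce (new) API; the method name, job name, and choice of GzipCodec with BLOCK compression are illustrative, not taken from the Hadoop source above:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

// Driver-side configuration that getSequenceWriter() later picks up from the task context.
public static Job newCompressedSequenceFileJob(Configuration conf) throws IOException {
    // job name is illustrative
    Job job = Job.getInstance(conf, "seqfile-compression-demo");
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // makes getCompressOutput(context) return true
    FileOutputFormat.setCompressOutput(job, true);
    // read back by getOutputCompressionType(context)
    SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    // read back by getOutputCompressorClass(context, DefaultCodec.class)
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    return job;
}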

Example 72 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class TestShufflePlugin, method testConsumerApi.

/**
 * A testing method verifying availability and accessibility of the API that is needed
 * for sub-classes of ShuffleConsumerPlugin.
 */
@Test
public void testConsumerApi() {
    JobConf jobConf = new JobConf();
    ShuffleConsumerPlugin<K, V> shuffleConsumerPlugin = new TestShuffleConsumerPlugin<K, V>();
    //mock creation
    ReduceTask mockReduceTask = mock(ReduceTask.class);
    TaskUmbilicalProtocol mockUmbilical = mock(TaskUmbilicalProtocol.class);
    Reporter mockReporter = mock(Reporter.class);
    FileSystem mockFileSystem = mock(FileSystem.class);
    Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass = jobConf.getCombinerClass();
    // needed for mock with generic
    @SuppressWarnings("unchecked") CombineOutputCollector<K, V> mockCombineOutputCollector = (CombineOutputCollector<K, V>) mock(CombineOutputCollector.class);
    org.apache.hadoop.mapreduce.TaskAttemptID mockTaskAttemptID = mock(org.apache.hadoop.mapreduce.TaskAttemptID.class);
    LocalDirAllocator mockLocalDirAllocator = mock(LocalDirAllocator.class);
    CompressionCodec mockCompressionCodec = mock(CompressionCodec.class);
    Counter mockCounter = mock(Counter.class);
    TaskStatus mockTaskStatus = mock(TaskStatus.class);
    Progress mockProgress = mock(Progress.class);
    MapOutputFile mockMapOutputFile = mock(MapOutputFile.class);
    Task mockTask = mock(Task.class);
    try {
        String[] dirs = jobConf.getLocalDirs();
        // verify that these APIs are available through super class handler
        ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(mockTaskAttemptID, jobConf, mockFileSystem, mockUmbilical, mockLocalDirAllocator, mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
        shuffleConsumerPlugin.init(context);
        shuffleConsumerPlugin.run();
        shuffleConsumerPlugin.close();
    } catch (Exception e) {
        assertTrue("Threw exception:" + e, false);
    }
    // verify that these APIs are available for 3rd party plugins
    mockReduceTask.getTaskID();
    mockReduceTask.getJobID();
    mockReduceTask.getNumMaps();
    mockReduceTask.getPartition();
    mockReporter.progress();
}
Also used : ReduceTask(org.apache.hadoop.mapred.ReduceTask) Task(org.apache.hadoop.mapred.Task) ShuffleConsumerPlugin(org.apache.hadoop.mapred.ShuffleConsumerPlugin) Counter(org.apache.hadoop.mapred.Counters.Counter) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf) MapOutputFile(org.apache.hadoop.mapred.MapOutputFile) Progress(org.apache.hadoop.util.Progress) Reporter(org.apache.hadoop.mapred.Reporter) TaskStatus(org.apache.hadoop.mapred.TaskStatus) CombineOutputCollector(org.apache.hadoop.mapred.Task.CombineOutputCollector) TaskUmbilicalProtocol(org.apache.hadoop.mapred.TaskUmbilicalProtocol) ReduceTask(org.apache.hadoop.mapred.ReduceTask) LocalDirAllocator(org.apache.hadoop.fs.LocalDirAllocator) Test(org.junit.Test)
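
The TestShuffleConsumerPlugin instantiated above is not shown on this page. A rough sketch of what such a no-op plugin could look like (the class name and the empty method bodies are assumptions for illustration; the real test class may differ):

import java.io.IOException;

import org.apache.hadoop.mapred.RawKeyValueIterator;
import org.apache.hadoop.mapred.ShuffleConsumerPlugin;

// A no-op ShuffleConsumerPlugin; a real plugin would fetch and merge map outputs in run().
class NoOpShuffleConsumerPlugin<K, V> implements ShuffleConsumerPlugin<K, V> {

    @Override
    public void init(ShuffleConsumerPlugin.Context<K, V> context) {
        // a real plugin typically keeps the jobConf, reporter, codec and counters from the context
    }

    @Override
    public RawKeyValueIterator run() throws IOException, InterruptedException {
        // a real implementation returns an iterator over the merged map outputs
        return null;
    }

    @Override
    public void close() {
        // release anything acquired in init()/run()
    }
}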

Example 73 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class TestShuffleScheduler, method TestAggregatedTransferRate.

@SuppressWarnings("rawtypes")
@Test
public <K, V> void TestAggregatedTransferRate() throws Exception {
    JobConf job = new JobConf();
    job.setNumMapTasks(10);
    //mock creation
    TaskUmbilicalProtocol mockUmbilical = mock(TaskUmbilicalProtocol.class);
    Reporter mockReporter = mock(Reporter.class);
    FileSystem mockFileSystem = mock(FileSystem.class);
    Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass = job.getCombinerClass();
    // needed for mock with generic
    @SuppressWarnings("unchecked") CombineOutputCollector<K, V> mockCombineOutputCollector = (CombineOutputCollector<K, V>) mock(CombineOutputCollector.class);
    org.apache.hadoop.mapreduce.TaskAttemptID mockTaskAttemptID = mock(org.apache.hadoop.mapreduce.TaskAttemptID.class);
    LocalDirAllocator mockLocalDirAllocator = mock(LocalDirAllocator.class);
    CompressionCodec mockCompressionCodec = mock(CompressionCodec.class);
    Counter mockCounter = mock(Counter.class);
    TaskStatus mockTaskStatus = mock(TaskStatus.class);
    Progress mockProgress = mock(Progress.class);
    MapOutputFile mockMapOutputFile = mock(MapOutputFile.class);
    Task mockTask = mock(Task.class);
    @SuppressWarnings("unchecked") MapOutput<K, V> output = mock(MapOutput.class);
    ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(mockTaskAttemptID, job, mockFileSystem, mockUmbilical, mockLocalDirAllocator, mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
    TaskStatus status = new TaskStatus() {

        @Override
        public boolean getIsMap() {
            return false;
        }

        @Override
        public void addFetchFailedMap(TaskAttemptID mapTaskId) {
        }
    };
    Progress progress = new Progress();
    ShuffleSchedulerImpl<K, V> scheduler = new ShuffleSchedulerImpl<K, V>(job, status, null, null, progress, context.getShuffledMapsCounter(), context.getReduceShuffleBytes(), context.getFailedShuffleCounter());
    TaskAttemptID attemptID0 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 0), 0);
    //adding the 1st interval, 40MB from 60s to 100s
    long bytes = (long) 40 * 1024 * 1024;
    scheduler.copySucceeded(attemptID0, new MapHost(null, null), bytes, 60000, 100000, output);
    Assert.assertEquals(copyMessage(1, 1, 1), progress.toString());
    TaskAttemptID attemptID1 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 1), 1);
    //adding the 2nd interval before the 1st interval, 50MB from 0s to 50s
    bytes = (long) 50 * 1024 * 1024;
    scheduler.copySucceeded(attemptID1, new MapHost(null, null), bytes, 0, 50000, output);
    Assert.assertEquals(copyMessage(2, 1, 1), progress.toString());
    TaskAttemptID attemptID2 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 2), 2);
    //adding the 3rd interval overlapping with the 1st and the 2nd interval
    //110MB from 25s to 80s
    bytes = (long) 110 * 1024 * 1024;
    scheduler.copySucceeded(attemptID2, new MapHost(null, null), bytes, 25000, 80000, output);
    Assert.assertEquals(copyMessage(3, 2, 2), progress.toString());
    TaskAttemptID attemptID3 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 3), 3);
    //adding the 4th interval just after the 1st interval, 100MB from 100s to 300s
    bytes = (long) 100 * 1024 * 1024;
    scheduler.copySucceeded(attemptID3, new MapHost(null, null), bytes, 100000, 300000, output);
    Assert.assertEquals(copyMessage(4, 0.5, 1), progress.toString());
    TaskAttemptID attemptID4 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 4), 4);
    //adding the 5th interval after the 4th, 50MB from 350s to 400s
    bytes = (long) 50 * 1024 * 1024;
    scheduler.copySucceeded(attemptID4, new MapHost(null, null), bytes, 350000, 400000, output);
    Assert.assertEquals(copyMessage(5, 1, 1), progress.toString());
    TaskAttemptID attemptID5 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 5), 5);
    //adding the 6th interval after the 5th, 50MB from 450s to 500s
    bytes = (long) 50 * 1024 * 1024;
    scheduler.copySucceeded(attemptID5, new MapHost(null, null), bytes, 450000, 500000, output);
    Assert.assertEquals(copyMessage(6, 1, 1), progress.toString());
    TaskAttemptID attemptID6 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 6), 6);
    //adding the 7th interval between the 4th and the 5th intervals, 20MB from 320s to 340s
    bytes = (long) 20 * 1024 * 1024;
    scheduler.copySucceeded(attemptID6, new MapHost(null, null), bytes, 320000, 340000, output);
    Assert.assertEquals(copyMessage(7, 1, 1), progress.toString());
    TaskAttemptID attemptID7 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 7), 7);
    //adding the 8th interval overlapping with the 4th, 5th, and 7th, 30MB from 290s to 350s
    bytes = (long) 30 * 1024 * 1024;
    scheduler.copySucceeded(attemptID7, new MapHost(null, null), bytes, 290000, 350000, output);
    Assert.assertEquals(copyMessage(8, 0.5, 1), progress.toString());
    TaskAttemptID attemptID8 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 8), 8);
    //adding the 9th interval overlapping with 5th and 6th, 50MB from 400s to 450s
    bytes = (long) 50 * 1024 * 1024;
    scheduler.copySucceeded(attemptID8, new MapHost(null, null), bytes, 400000, 450000, output);
    Assert.assertEquals(copyMessage(9, 1, 1), progress.toString());
    TaskAttemptID attemptID9 = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 9), 9);
    //adding the 10th interval overlapping with all intervals, 500MB from 0s to 500s
    bytes = (long) 500 * 1024 * 1024;
    scheduler.copySucceeded(attemptID9, new MapHost(null, null), bytes, 0, 500000, output);
    Assert.assertEquals(copyMessage(10, 1, 2), progress.toString());
}
Also used : Task(org.apache.hadoop.mapred.Task) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) ShuffleConsumerPlugin(org.apache.hadoop.mapred.ShuffleConsumerPlugin) Counter(org.apache.hadoop.mapred.Counters.Counter) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf) MapOutputFile(org.apache.hadoop.mapred.MapOutputFile) Progress(org.apache.hadoop.util.Progress) Reporter(org.apache.hadoop.mapred.Reporter) TaskStatus(org.apache.hadoop.mapred.TaskStatus) CombineOutputCollector(org.apache.hadoop.mapred.Task.CombineOutputCollector) TaskUmbilicalProtocol(org.apache.hadoop.mapred.TaskUmbilicalProtocol) LocalDirAllocator(org.apache.hadoop.fs.LocalDirAllocator) JobID(org.apache.hadoop.mapreduce.JobID) Test(org.junit.Test)
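
The expected copyMessage values carry two rates: the rate of the individual copy and an aggregated rate, which appears to be total bytes copied divided by the union of all copy intervals rather than the wall-clock span. For example, the first copy moves 40MB in 40s (1 MB/s individually and in aggregate, hence copyMessage(1, 1, 1)); after the third copy the running total is 200MB over the merged interval [0s, 100s], i.e. 2 MB/s, matching copyMessage(3, 2, 2); and the fourth copy itself runs at 100MB / 200s = 0.5 MB/s while the aggregate becomes 300MB over [0s, 300s] = 1 MB/s, matching copyMessage(4, 0.5, 1).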

Example 74 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class TestCombineTextInputFormat, method testGzip.

/**
   * Test using the gzip codec for reading
   */
@Test(timeout = 10000)
public void testGzip() throws IOException {
    JobConf job = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, job);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    InputSplit[] splits = format.getSplits(job, 100);
    assertEquals("compressed splits == 1", 1, splits.length);
    List<Text> results = readSplit(format, splits[0], job);
    assertEquals("splits[0] length", 8, results.size());
    final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    final String[] secondList = { "this is a test", "of gzip" };
    String first = results.get(0).toString();
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CombineTextInputFormat(org.apache.hadoop.mapred.lib.CombineTextInputFormat) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) Text(org.apache.hadoop.io.Text) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Test(org.junit.Test)
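
The writeFile helper used above is not reproduced on this page. A plausible sketch, assuming it simply wraps the FileSystem output stream with codec.createOutputStream when a codec is given (the real helper may differ):

import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;

// Writes the given contents to path, compressing with codec when one is supplied.
private static void writeFile(FileSystem fs, Path path, CompressionCodec codec,
                              String contents) throws IOException {
    OutputStream stm;
    if (codec == null) {
        stm = fs.create(path);
    } else {
        stm = codec.createOutputStream(fs.create(path));
    }
    stm.write(contents.getBytes("UTF-8"));
    stm.close();
}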

Example 75 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class TestConcatenatedCompressedInput, method testBzip2.

/**
   * Test using the bzip2 codec for reading
   */
@Test
public void testBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_CYAN + "testBzip2() using non-native CBZip2InputStream (presumably)" + COLOR_NORMAL);
    // copy prebuilt (correct!) version of concat.bz2 to HDFS
    final String fn = "concat" + bzip2.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);
    writeFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n");
    FileInputFormat.setInputPaths(jobConf, workDir);
    // extends FileInputFormat
    TextInputFormat format = new TextInputFormat();
    format.configure(jobConf);
    // work around 2-byte splits issue
    format.setMinSplitSize(256);
    // [135 splits for a 208-byte file and a 62-byte file(!)]
    InputSplit[] splits = format.getSplits(jobConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("part2.txt.bz2")) {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    List<Text> results = readSplit(format, splits[0], jobConf);
    assertEquals("splits[0] num lines", 6, results.size());
    assertEquals("splits[0][5]", "member #3", results.get(5).toString());
    results = readSplit(format, splits[1], jobConf);
    assertEquals("splits[1] num lines", 2, results.size());
    assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
    assertEquals("splits[1][1]", "of bzip2", results.get(1).toString());
}
Also used : Path(org.apache.hadoop.fs.Path) BZip2Codec(org.apache.hadoop.io.compress.BZip2Codec) Text(org.apache.hadoop.io.Text) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Test(org.junit.Test)
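
On the read side, the usual counterpart to these tests is resolving the codec from the file extension with CompressionCodecFactory and wrapping the raw input stream. A minimal sketch; the method name is illustrative:

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

// Opens a file, decompressing it when its extension maps to a registered codec (e.g. .gz, .bz2).
public static InputStream openPossiblyCompressed(Configuration conf, Path path) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    InputStream in = fs.open(path);
    return codec == null ? in : codec.createInputStream(in);
}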

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 111
Path (org.apache.hadoop.fs.Path): 54
FileSystem (org.apache.hadoop.fs.FileSystem): 41
Configuration (org.apache.hadoop.conf.Configuration): 38
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 37
InputStream (java.io.InputStream): 18
IOException (java.io.IOException): 17
Test (org.junit.Test): 17
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 15
Text (org.apache.hadoop.io.Text): 14
Configurable (org.apache.hadoop.conf.Configurable): 10
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10
JobConf (org.apache.hadoop.mapred.JobConf): 10
SequenceFile (org.apache.hadoop.io.SequenceFile): 9
OutputStream (java.io.OutputStream): 8
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8
FileInputStream (java.io.FileInputStream): 7
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6
ByteString (com.google.protobuf.ByteString): 5
DataInputStream (java.io.DataInputStream): 5