
Example 6 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in the Apache Hadoop project.

Class CompressionEmulationUtil, method getPossiblyCompressedOutputStream.

/**
   * Returns an {@link OutputStream} for a file that might need
   * compression.
   */
static OutputStream getPossiblyCompressedOutputStream(Path file, Configuration conf) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    JobConf jConf = new JobConf(conf);
    if (org.apache.hadoop.mapred.FileOutputFormat.getCompressOutput(jConf)) {
        // get the codec class
        Class<? extends CompressionCodec> codecClass = org.apache.hadoop.mapred.FileOutputFormat.getOutputCompressorClass(jConf, GzipCodec.class);
        // get the codec implementation
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, conf);
        // add the appropriate extension
        file = file.suffix(codec.getDefaultExtension());
        if (isCompressionEmulationEnabled(conf)) {
            FSDataOutputStream fileOut = fs.create(file, false);
            return new DataOutputStream(codec.createOutputStream(fileOut));
        }
    }
    return fs.create(file, false);
}
Also used : FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) DataOutputStream(java.io.DataOutputStream) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf)
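
For context, a minimal caller-side sketch of the same pattern: enable output compression on a JobConf, resolve the configured codec the way the method above does, and write through the codec-wrapped stream. The output path and payload below are placeholders chosen for illustration, not taken from Gridmix.

import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;

public class CompressedOutputSketch {
    public static void main(String[] args) throws Exception {
        JobConf jobConf = new JobConf(new Configuration());
        // ask FileOutputFormat to compress output and pick gzip explicitly
        FileOutputFormat.setCompressOutput(jobConf, true);
        FileOutputFormat.setOutputCompressorClass(jobConf, GzipCodec.class);
        // resolve the configured codec, defaulting to gzip, as in the example above
        Class<? extends CompressionCodec> codecClass =
            FileOutputFormat.getOutputCompressorClass(jobConf, GzipCodec.class);
        CompressionCodec codec = ReflectionUtils.newInstance(codecClass, jobConf);
        // append the codec's extension and write a small payload through it
        Path file = new Path("/tmp/demo-output").suffix(codec.getDefaultExtension());
        FileSystem fs = file.getFileSystem(jobConf);
        try (OutputStream out = codec.createOutputStream(fs.create(file, false))) {
            out.write("hello, compressed world\n".getBytes(StandardCharsets.UTF_8));
        }
    }
}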

Example 7 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in the Apache Hadoop project.

Class CompressionEmulationUtil, method publishCompressedDataStatistics.

/** Publishes compression-related data statistics. The following statistics
   * are published:
   * <ul>
   *   <li>Total compressed input data size</li>
   *   <li>Number of compressed input data files</li>
   *   <li>Compression Ratio</li>
   *   <li>Text data dictionary size</li>
   *   <li>Random text word size</li>
   * </ul>
   */
static DataStatistics publishCompressedDataStatistics(Path inputDir, Configuration conf, long uncompressedDataSize) throws IOException {
    FileSystem fs = inputDir.getFileSystem(conf);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    // iterate over compressed files and sum up the compressed file sizes
    long compressedDataSize = 0;
    int numCompressedFiles = 0;
    // obtain input data file statuses
    FileStatus[] outFileStatuses = fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
    for (FileStatus status : outFileStatuses) {
        // check if the input file is compressed
        if (compressionCodecs != null) {
            CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
            if (codec != null) {
                ++numCompressedFiles;
                compressedDataSize += status.getLen();
            }
        }
    }
    LOG.info("Gridmix is configured to use compressed input data.");
    // publish the input data size
    LOG.info("Total size of compressed input data : " + StringUtils.humanReadableInt(compressedDataSize));
    LOG.info("Total number of compressed input data files : " + numCompressedFiles);
    if (numCompressedFiles == 0) {
        throw new RuntimeException("No compressed file found in the input" + " directory : " + inputDir.toString() + ". To enable compression" + " emulation, run Gridmix either with " + " an input directory containing compressed input file(s) or" + " use the -generate option to (re)generate it. If compression" + " emulation is not desired, disable it by setting '" + COMPRESSION_EMULATION_ENABLE + "' to 'false'.");
    }
    // publish the compression ratio only if the data was generated in this gridmix run
    if (uncompressedDataSize > 0) {
        // compute the compression ratio
        double ratio = ((double) compressedDataSize) / uncompressedDataSize;
        // publish the compression ratio
        LOG.info("Input Data Compression Ratio : " + ratio);
    }
    return new DataStatistics(compressedDataSize, numCompressedFiles, true);
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) FileSystem(org.apache.hadoop.fs.FileSystem) DataStatistics(org.apache.hadoop.mapred.gridmix.GenerateData.DataStatistics) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
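
The statistics code above only sums compressed file sizes; a companion sketch (assumptions: a placeholder local path, plain line-counting instead of Gridmix's DataStatistics) shows how such an input file would typically be read back, letting CompressionCodecFactory pick the codec from the file extension.

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;

public class CompressedInputSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // placeholder path; not taken from the example above
        Path input = new Path("/tmp/gridmix-input/part-0.gz");
        FileSystem fs = input.getFileSystem(conf);
        // the factory maps the ".gz" extension to GzipCodec, or returns null if unknown
        CompressionCodecFactory factory = new CompressionCodecFactory(conf);
        CompressionCodec codec = factory.getCodec(input);
        // wrap the raw stream with the codec only when the file looks compressed
        InputStream raw = fs.open(input);
        InputStream in = (codec == null) ? raw : codec.createInputStream(raw);
        try (BufferedReader reader =
                 new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            long lines = 0;
            while (reader.readLine() != null) {
                lines++;
            }
            System.out.println("Uncompressed lines read: " + lines);
        }
    }
}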

Example 8 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in the Apache Hadoop project.

Class TestShuffleScheduler, method TestSucceedAndFailedCopyMap.

@SuppressWarnings("rawtypes")
@Test
public <K, V> void TestSucceedAndFailedCopyMap() throws Exception {
    JobConf job = new JobConf();
    job.setNumMapTasks(2);
    //mock creation
    TaskUmbilicalProtocol mockUmbilical = mock(TaskUmbilicalProtocol.class);
    Reporter mockReporter = mock(Reporter.class);
    FileSystem mockFileSystem = mock(FileSystem.class);
    Class<? extends org.apache.hadoop.mapred.Reducer> combinerClass = job.getCombinerClass();
    // needed for mock with generic
    @SuppressWarnings("unchecked") CombineOutputCollector<K, V> mockCombineOutputCollector = (CombineOutputCollector<K, V>) mock(CombineOutputCollector.class);
    org.apache.hadoop.mapreduce.TaskAttemptID mockTaskAttemptID = mock(org.apache.hadoop.mapreduce.TaskAttemptID.class);
    LocalDirAllocator mockLocalDirAllocator = mock(LocalDirAllocator.class);
    CompressionCodec mockCompressionCodec = mock(CompressionCodec.class);
    Counter mockCounter = mock(Counter.class);
    TaskStatus mockTaskStatus = mock(TaskStatus.class);
    Progress mockProgress = mock(Progress.class);
    MapOutputFile mockMapOutputFile = mock(MapOutputFile.class);
    Task mockTask = mock(Task.class);
    @SuppressWarnings("unchecked") MapOutput<K, V> output = mock(MapOutput.class);
    ShuffleConsumerPlugin.Context<K, V> context = new ShuffleConsumerPlugin.Context<K, V>(mockTaskAttemptID, job, mockFileSystem, mockUmbilical, mockLocalDirAllocator, mockReporter, mockCompressionCodec, combinerClass, mockCombineOutputCollector, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockCounter, mockTaskStatus, mockProgress, mockProgress, mockTask, mockMapOutputFile, null);
    TaskStatus status = new TaskStatus() {

        @Override
        public boolean getIsMap() {
            return false;
        }

        @Override
        public void addFetchFailedMap(TaskAttemptID mapTaskId) {
        }
    };
    Progress progress = new Progress();
    ShuffleSchedulerImpl<K, V> scheduler = new ShuffleSchedulerImpl<K, V>(job, status, null, null, progress, context.getShuffledMapsCounter(), context.getReduceShuffleBytes(), context.getFailedShuffleCounter());
    MapHost host1 = new MapHost("host1", null);
    TaskAttemptID failedAttemptID = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 0), 0);
    TaskAttemptID succeedAttemptID = new TaskAttemptID(new org.apache.hadoop.mapred.TaskID(new JobID("test", 0), TaskType.MAP, 1), 1);
    // handle output fetch failure for failedAttemptID, part I
    scheduler.hostFailed(host1.getHostName());
    // handle output fetch succeed for succeedAttemptID
    long bytes = (long) 500 * 1024 * 1024;
    scheduler.copySucceeded(succeedAttemptID, host1, bytes, 0, 500000, output);
    // handle output fetch failure for failedAttemptID, part II
    // for MAPREDUCE-6361: verify that no NPE is thrown
    scheduler.copyFailed(failedAttemptID, host1, true, false);
}
Also used : Task(org.apache.hadoop.mapred.Task) TaskAttemptID(org.apache.hadoop.mapred.TaskAttemptID) ShuffleConsumerPlugin(org.apache.hadoop.mapred.ShuffleConsumerPlugin) Counter(org.apache.hadoop.mapred.Counters.Counter) FileSystem(org.apache.hadoop.fs.FileSystem) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) JobConf(org.apache.hadoop.mapred.JobConf) MapOutputFile(org.apache.hadoop.mapred.MapOutputFile) Progress(org.apache.hadoop.util.Progress) Reporter(org.apache.hadoop.mapred.Reporter) TaskStatus(org.apache.hadoop.mapred.TaskStatus) CombineOutputCollector(org.apache.hadoop.mapred.Task.CombineOutputCollector) TaskUmbilicalProtocol(org.apache.hadoop.mapred.TaskUmbilicalProtocol) LocalDirAllocator(org.apache.hadoop.fs.LocalDirAllocator) JobID(org.apache.hadoop.mapreduce.JobID) Test(org.junit.Test)
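
In the test above the codec is a bare Mockito mock that is only passed into the ShuffleConsumerPlugin.Context; nothing is ever decompressed. If a test did need the mock to answer calls, it could be stubbed explicitly. A tiny sketch of that idea (a generic Mockito pattern, not taken from the Hadoop test):

import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import org.apache.hadoop.io.compress.CompressionCodec;

public class CodecMockSketch {
    public static void main(String[] args) {
        // a bare mock, as in the test above; unstubbed methods return null/0/false
        CompressionCodec codec = mock(CompressionCodec.class);
        // stub the one method the code under test would read
        when(codec.getDefaultExtension()).thenReturn(".mock");
        System.out.println(codec.getDefaultExtension()); // prints ".mock"
        verify(codec).getDefaultExtension();
    }
}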

Example 9 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in the Apache Hadoop project.

Class TestConcatenatedCompressedInput, method testBuiltInGzipDecompressor.

/**
   * Test using the new BuiltInGzipDecompressor codec for reading gzip files.
   */
// NOTE:  This fails on RHEL4 with "java.io.IOException: header crc mismatch"
//        due to the buggy version of zlib (1.2.1.2) that it includes.
@Test
public void testBuiltInGzipDecompressor() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);
    // Don't use native libs for this test
    ZlibFactory.setNativeZlibLoaded(false);
    assertEquals("[non-native (Java) codec]", org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class, gzip.getDecompressorType());
    System.out.println(COLOR_BR_YELLOW + "testBuiltInGzipDecompressor() using" + " non-native (Java Inflater) Decompressor (" + gzip.getDecompressorType() + ")" + COLOR_NORMAL);
    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
    // copy multiple-member test file to HDFS
    // (actually in "seekable gzip" format, a la JIRA PIG-42)
    String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.setInputPaths(jobConf, workDir);
    // here's first pair of DecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2734, in1.available());
    // w/hdr CRC
    assertEquals("concat bytes available", 3413, in2.available());
    CompressionInputStream cin2 = gzip.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();
    int numBytes, totalBytes = 0, lineNum = 0;
    while ((numBytes = in.readLine(out)) > 0) {
        ++lineNum;
        totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file", 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file", 84, lineNum);
    ZlibFactory.loadNativeZLib();
    // test GzipZlibDecompressor (native), just to be sure
    // (FIXME?  could move this call to testGzip(), but would need filename
    // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
    doMultipleGzipBufferSizes(jobConf, true);
}
Also used : Path(org.apache.hadoop.fs.Path) CompressionInputStream(org.apache.hadoop.io.compress.CompressionInputStream) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) Text(org.apache.hadoop.io.Text) FileInputStream(java.io.FileInputStream) LineReader(org.apache.hadoop.util.LineReader) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Test(org.junit.Test)
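
The essential behavior this test verifies, reading across concatenated gzip members with the pure-Java decompressor, can be reproduced in memory without the HDFS test files. A self-contained sketch under that assumption (the member contents are made up for illustration):

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;

public class ConcatenatedGzipSketch {
    // compress one string into a complete gzip member
    static byte[] gzipMember(String text) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (GZIPOutputStream gz = new GZIPOutputStream(bytes)) {
            gz.write(text.getBytes(StandardCharsets.UTF_8));
        }
        return bytes.toByteArray();
    }

    public static void main(String[] args) throws Exception {
        // two gzip members concatenated back to back, as in testCompressThenConcat.txt.gz
        ByteArrayOutputStream concat = new ByteArrayOutputStream();
        concat.write(gzipMember("first member\n"));
        concat.write(gzipMember("second member\n"));

        Configuration conf = new Configuration();
        // force the pure-Java BuiltInGzipDecompressor, as the test above does
        ZlibFactory.setNativeZlibLoaded(false);
        GzipCodec gzip = new GzipCodec();
        gzip.setConf(conf);

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                gzip.createInputStream(new ByteArrayInputStream(concat.toByteArray())),
                StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // expected: lines from both members are decoded
            }
        }
    }
}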

Example 10 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in the Apache Hadoop project.

Class TestConcatenatedCompressedInput, method testPrototypeInflaterGzip.

/**
   * Test using the raw Inflater codec for reading gzip files.
   */
@Test
public void testPrototypeInflaterGzip() throws IOException {
    // used only for file extension
    CompressionCodec gzip = new GzipCodec();
    // localFs = FileSystem instance
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_BLUE + "testPrototypeInflaterGzip() using " + "non-native/Java Inflater and manual gzip header/trailer parsing" + COLOR_NORMAL);
    // copy prebuilt (correct!) version of concat.gz to HDFS
    final String fn = "concat" + gzip.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);
    final FileInputStream in = new FileInputStream(fnLocal.toString());
    assertEquals("concat bytes available", 148, in.available());
    // should wrap all of this header-reading stuff in a running-CRC wrapper
    // (did so in BuiltInGzipDecompressor; see below)
    byte[] compressedBuf = new byte[256];
    int numBytesRead = in.read(compressedBuf, 0, 10);
    assertEquals("header bytes read", 10, numBytesRead);
    assertEquals("1st byte", 0x1f, compressedBuf[0] & 0xff);
    assertEquals("2nd byte", 0x8b, compressedBuf[1] & 0xff);
    assertEquals("3rd byte (compression method)", 8, compressedBuf[2] & 0xff);
    byte flags = (byte) (compressedBuf[3] & 0xff);
    if ((flags & 0x04) != 0) {
        // FEXTRA
        numBytesRead = in.read(compressedBuf, 0, 2);
        assertEquals("XLEN bytes read", 2, numBytesRead);
        int xlen = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
        in.skip(xlen);
    }
    if ((flags & 0x08) != 0) {
        // FNAME
        while ((numBytesRead = in.read()) != 0) {
            assertFalse("unexpected end-of-file while reading filename", numBytesRead == -1);
        }
    }
    if ((flags & 0x10) != 0) {
        // FCOMMENT
        while ((numBytesRead = in.read()) != 0) {
            assertFalse("unexpected end-of-file while reading comment", numBytesRead == -1);
        }
    }
    if ((flags & 0xe0) != 0) {
        // reserved
        assertTrue("reserved bits are set??", (flags & 0xe0) == 0);
    }
    if ((flags & 0x02) != 0) {
        // FHCRC
        numBytesRead = in.read(compressedBuf, 0, 2);
        assertEquals("CRC16 bytes read", 2, numBytesRead);
        int crc16 = ((compressedBuf[1] << 8) | compressedBuf[0]) & 0xffff;
    }
    // ready to go!  next bytes should be start of deflated stream, suitable
    // for Inflater
    numBytesRead = in.read(compressedBuf);
    // Inflater docs refer to a "dummy byte":  no clue what that's about;
    // appears to work fine without one
    byte[] uncompressedBuf = new byte[256];
    Inflater inflater = new Inflater(true);
    inflater.setInput(compressedBuf, 0, numBytesRead);
    try {
        int numBytesUncompressed = inflater.inflate(uncompressedBuf);
        String outString = new String(uncompressedBuf, 0, numBytesUncompressed, "UTF-8");
        System.out.println("uncompressed data of first gzip member = [" + outString + "]");
    } catch (java.util.zip.DataFormatException ex) {
        throw new IOException(ex.getMessage());
    }
    in.close();
}
Also used : Path(org.apache.hadoop.fs.Path) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) Inflater(java.util.zip.Inflater) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Test(org.junit.Test)
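
The magic numbers and flag masks checked by hand above come from the gzip member header defined in RFC 1952. For reference, the same values as named constants (a purely illustrative helper class, not part of Hadoop):

// gzip member header layout per RFC 1952; these are the literals tested above
public final class GzipHeaderFlags {
    public static final int MAGIC_BYTE_1 = 0x1f;  // first header byte (ID1)
    public static final int MAGIC_BYTE_2 = 0x8b;  // second header byte (ID2)
    public static final int CM_DEFLATE   = 8;     // compression method (CM)

    // FLG bit masks (fourth header byte)
    public static final int FTEXT    = 0x01; // hint: payload is probably text
    public static final int FHCRC    = 0x02; // a CRC16 of the header follows
    public static final int FEXTRA   = 0x04; // extra field (2-byte XLEN + data) follows
    public static final int FNAME    = 0x08; // zero-terminated original file name follows
    public static final int FCOMMENT = 0x10; // zero-terminated comment follows
    public static final int RESERVED = 0xe0; // reserved bits, must be zero

    private GzipHeaderFlags() {}
}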

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 110 usages
Path (org.apache.hadoop.fs.Path): 53 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 41 usages
Configuration (org.apache.hadoop.conf.Configuration): 37 usages
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 36 usages
InputStream (java.io.InputStream): 17 usages
Test (org.junit.Test): 17 usages
IOException (java.io.IOException): 16 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 14 usages
Text (org.apache.hadoop.io.Text): 14 usages
Configurable (org.apache.hadoop.conf.Configurable): 10 usages
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10 usages
JobConf (org.apache.hadoop.mapred.JobConf): 10 usages
SequenceFile (org.apache.hadoop.io.SequenceFile): 9 usages
OutputStream (java.io.OutputStream): 8 usages
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8 usages
FileInputStream (java.io.FileInputStream): 7 usages
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6 usages
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 6 usages
ByteString (com.google.protobuf.ByteString): 5 usages