
Example 11 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class TestConcatenatedCompressedInput, method testMoreBzip2.

/**
   * Extended bzip2 test, similar to BuiltInGzipDecompressor test above.
   */
@Test
public void testMoreBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_MAGENTA + "testMoreBzip2() using non-native CBZip2InputStream (presumably)" + COLOR_NORMAL);
    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
    // copy multiple-member test file to HDFS
    String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.setInputPaths(jobConf, workDir);
    // here's first pair of BlockDecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2567, in1.available());
    assertEquals("concat bytes available", 3056, in2.available());
    /*
    // FIXME
    // The while-loop below dies at the beginning of the 2nd concatenated
    // member (after 17 lines successfully read) with:
    //
    //   java.io.IOException: bad block header
    //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
    //   CBZip2InputStream.java:527)
    //
    // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
    // simply commented out for now (and HADOOP-6852 filed).  If and when the
    // latter issue is resolved--perhaps by fixing an error here--this code
    // should be reenabled.  Note that the doMultipleBzip2BufferSizes() test
    // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

    CompressionInputStream cin2 = bzip2.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();

    int numBytes, totalBytes=0, lineNum=0;
    while ((numBytes = in.readLine(out)) > 0) {
      ++lineNum;
      totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file",
                 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file",
                 84, lineNum);
 */
    // test CBZip2InputStream with lots of different input-buffer sizes
    doMultipleBzip2BufferSizes(jobConf);
}
Also used: Path (org.apache.hadoop.fs.Path), BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), FileInputStream (java.io.FileInputStream), Test (org.junit.Test)

Example 12 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class TestConcatenatedCompressedInput, method testGzip.

/**
   * Test using Hadoop's original, native-zlib gzip codec for reading.
   */
@Test
public void testGzip() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, jobConf);
    localFs.delete(workDir, true);
    // alternative:
    if (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class == gzip.getDecompressorType()) {
        System.out.println(COLOR_BR_RED + "testGzip() using native-zlib Decompressor (" + gzip.getDecompressorType() + ")" + COLOR_NORMAL);
    } else {
        LOG.warn("testGzip() skipped:  native (C/C++) libs not loaded");
        return;
    }
    /*
 *      // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
 *      //                see https://issues.apache.org/jira/browse/HADOOP-6799
 *  Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
 *  //OutputStream out = localFs.create(fnHDFS);
 *  //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
 *      // can just combine those two lines, probably
 *  //GzipCodec.GzipOutputStream gzOStm =
 *  //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
 *      // oops, no:  this is a protected helper class; need to access
 *      //   it via createOutputStream() instead:
 *  OutputStream out = localFs.create(fnHDFS);
 *  Compressor gzCmp = gzip.createCompressor();
 *  CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
 *      // this SHOULD be going to HDFS:  got out from localFs == HDFS
 *      //   ...yup, works
 *  gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("2nd gzip concat member\n".getBytes());
 *  gzOStm.finish();
 *  gzOStm.resetState();
 *  gzOStm.write("gzip concat\nmember #3\n".getBytes());
 *  gzOStm.close();
 *      //
 *  String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
 *  Path fnLocal = new Path(System.getProperty("test.concat.data","/tmp"), fn);
 *  localFs.copyToLocalFile(fnHDFS, fnLocal);
 */
    // copy prebuilt (correct!) version of concat.gz to HDFS
    final String fn = "concat" + gzip.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    FileInputFormat.setInputPaths(jobConf, workDir);
    TextInputFormat format = new TextInputFormat();
    format.configure(jobConf);
    InputSplit[] splits = format.getSplits(jobConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("part2.txt.gz")) {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    List<Text> results = readSplit(format, splits[0], jobConf);
    assertEquals("splits[0] num lines", 6, results.size());
    assertEquals("splits[0][5]", "member #3", results.get(5).toString());
    results = readSplit(format, splits[1], jobConf);
    assertEquals("splits[1] num lines", 2, results.size());
    assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
    assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
Also used: Path (org.apache.hadoop.fs.Path), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), Text (org.apache.hadoop.io.Text), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), Test (org.junit.Test)
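
The writeFile() call in the test above is a small helper defined elsewhere in the same test class and not shown on this page. A minimal sketch of what such a helper can look like, assuming only the signature implied by the call site (FileSystem, Path, CompressionCodec, String); the body is an illustration, not the verbatim helper:

// Hedged sketch of a writeFile()-style helper matching the call in testGzip() above.
private static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents) throws IOException {
    OutputStream stm;
    if (codec == null) {
        // no codec: write the bytes uncompressed
        stm = fs.create(name);
    } else {
        // wrap the raw output stream in the codec's compressing stream
        stm = codec.createOutputStream(fs.create(name));
    }
    stm.write(contents.getBytes(StandardCharsets.UTF_8));
    stm.close();
}

Assumed imports for the sketch: java.io.OutputStream, java.nio.charset.StandardCharsets, org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path, org.apache.hadoop.io.compress.CompressionCodec.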

Example 13 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class FixedLengthRecordReader, method initialize.

// This is also called from the old FixedLengthRecordReader API implementation
public void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;
    long partialRecordLength = start % recordLength;
    long numBytesToSkip = 0;
    if (partialRecordLength != 0) {
        numBytesToSkip = recordLength - partialRecordLength;
    }
    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        numRecordsRemainingInSplit = Long.MAX_VALUE;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        long splitSize = end - start - numBytesToSkip;
        numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
        if (numRecordsRemainingInSplit < 0) {
            numRecordsRemainingInSplit = 0;
        }
        LOG.info("Expecting " + numRecordsRemainingInSplit + " records each with a length of " + recordLength + " bytes in the split with an effective size of " + splitSize + " bytes");
    }
    if (numBytesToSkip != 0) {
        start += inputStream.skip(numBytesToSkip);
    }
    this.pos = start;
}
Also used: CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream), FileSystem (org.apache.hadoop.fs.FileSystem), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
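
For context, the recordLength used by this reader comes from the job configuration via FixedLengthInputFormat. A minimal sketch of the job-side setup, assuming the new (mapreduce) API; the class name, job name, and 100-byte record length are placeholders, only FixedLengthInputFormat and the standard MapReduce classes are real:

// Hedged sketch: configure a job so that FixedLengthRecordReader.initialize()
// above receives a record length. Compressed inputs fall into the codec branch
// shown above and are read sequentially rather than split-counted.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;

public class FixedLengthJobSetup {
    public static Job createJob(Configuration conf, Path input) throws Exception {
        // every record is exactly 100 bytes in this illustrative setup
        FixedLengthInputFormat.setRecordLength(conf, 100);
        Job job = Job.getInstance(conf, "fixed-length-read");
        job.setInputFormatClass(FixedLengthInputFormat.class);
        FileInputFormat.addInputPath(job, input);
        return job;
    }
}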

Example 14 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class LineRecordReader, method initialize.

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (start != 0) {
                // a Compression codec that cannot be split.
                throw new IOException("Cannot seek in " + codec.getClass().getSimpleName() + " compressed stream");
            }
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new UncompressedSplitLineReader(fileIn, job, this.recordDelimiterBytes, split.getLength());
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
Also used: Path (org.apache.hadoop.fs.Path), SplittableCompressionCodec (org.apache.hadoop.io.compress.SplittableCompressionCodec), Configuration (org.apache.hadoop.conf.Configuration), SplitCompressionInputStream (org.apache.hadoop.io.compress.SplitCompressionInputStream), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), FileSystem (org.apache.hadoop.fs.FileSystem), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
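
The SplittableCompressionCodec branch above is only reached when the input format actually produces mid-file splits for compressed data. A rough sketch of the kind of check an input format performs to decide that, modeled on (but not quoted from) TextInputFormat.isSplitable():

// Hedged sketch of the split decision feeding the reader above: files with no
// codec are splittable anywhere, files with a splittable codec (e.g. bzip2) are
// splittable by block, anything else must be read from offset 0 as one split.
protected boolean isSplitable(JobContext context, Path file) {
    CompressionCodec codec =
        new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (codec == null) {
        // plain text: any byte offset is a valid split start
        return true;
    }
    return codec instanceof SplittableCompressionCodec;
}

Assumed imports for the sketch: org.apache.hadoop.mapreduce.JobContext plus the compression classes already listed above.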

Example 15 with CompressionCodec

Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache.

The class DefaultOutputter, method init.

@Override
public void init(Path path, Configuration conf) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
    OutputStream output;
    if (codec != null) {
        compressor = CodecPool.getCompressor(codec);
        output = codec.createOutputStream(fs.create(path), compressor);
    } else {
        output = fs.create(path);
    }
    writer = new JsonObjectMapperWriter<T>(output, conf.getBoolean("rumen.output.pretty.print", true));
}
Also used: CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory), FileSystem (org.apache.hadoop.fs.FileSystem), OutputStream (java.io.OutputStream), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
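
The compressor borrowed from CodecPool in init() should eventually be handed back to the pool. The class's close() method is not shown on this page; a matching cleanup would look roughly like the following sketch, which assumes the writer and compressor fields from init() above:

// Hedged sketch of the cleanup paired with init() above: close the JSON writer
// and return the pooled compressor so it can be reused by other streams.
@Override
public void close() throws IOException {
    writer.close();
    if (compressor != null) {
        CodecPool.returnCompressor(compressor);
    }
}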

Aggregations

CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 110
Path (org.apache.hadoop.fs.Path): 53
FileSystem (org.apache.hadoop.fs.FileSystem): 41
Configuration (org.apache.hadoop.conf.Configuration): 37
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory): 36
InputStream (java.io.InputStream): 17
Test (org.junit.Test): 17
IOException (java.io.IOException): 16
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 14
Text (org.apache.hadoop.io.Text): 14
Configurable (org.apache.hadoop.conf.Configurable): 10
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 10
JobConf (org.apache.hadoop.mapred.JobConf): 10
SequenceFile (org.apache.hadoop.io.SequenceFile): 9
OutputStream (java.io.OutputStream): 8
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 8
FileInputStream (java.io.FileInputStream): 7
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 6
CompressionInputStream (org.apache.hadoop.io.compress.CompressionInputStream): 6
ByteString (com.google.protobuf.ByteString): 5
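
Taken together, the aggregations above reflect one recurring pattern across Examples 13-15: resolve a codec from the file name with CompressionCodecFactory, borrow a decompressor from CodecPool, wrap the raw stream with codec.createInputStream(), and return the decompressor when done. A minimal standalone sketch of that pattern; the class name, method name, and the line-printing loop are illustrative, not taken from any of the examples above:

// Hedged sketch of the common read pattern: codec lookup by file extension,
// pooled decompressor, and cleanup in a finally block.
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class CompressedLinePrinter {
    public static void readUtf8Lines(Configuration conf, Path file) throws IOException {
        FileSystem fs = file.getFileSystem(conf);
        // getCodec() matches on the file extension (.gz, .bz2, ...) and returns
        // null for an uncompressed file
        CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
        Decompressor decompressor = null;
        InputStream in = fs.open(file);
        try {
            if (codec != null) {
                decompressor = CodecPool.getDecompressor(codec);
                in = codec.createInputStream(in, decompressor);
            }
            BufferedReader reader =
                new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            in.close();
            if (decompressor != null) {
                // pooled decompressors must be returned, as in the examples above
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }
}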