Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache: class TestConcatenatedCompressedInput, method testMoreBzip2.
/**
 * Extended bzip2 test, similar to BuiltInGzipDecompressor test above.
 */
@Test
public void testMoreBzip2() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, jobConf);
  localFs.delete(workDir, true);

  System.out.println(COLOR_BR_MAGENTA +
      "testMoreBzip2() using non-native CBZip2InputStream (presumably)" +
      COLOR_NORMAL);

  // copy single-member test file to HDFS
  String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
  Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
  Path fnHDFS1 = new Path(workDir, fn1);
  localFs.copyFromLocalFile(fnLocal1, fnHDFS1);

  // copy multiple-member test file to HDFS
  String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
  Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
  Path fnHDFS2 = new Path(workDir, fn2);
  localFs.copyFromLocalFile(fnLocal2, fnHDFS2);

  FileInputFormat.setInputPaths(jobConf, workDir);

  // here's first pair of BlockDecompressorStreams:
  final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
  final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
  assertEquals("concat bytes available", 2567, in1.available());
  assertEquals("concat bytes available", 3056, in2.available());

  /*
  // FIXME
  // The while-loop below dies at the beginning of the 2nd concatenated
  // member (after 17 lines successfully read) with:
  //
  //   java.io.IOException: bad block header
  //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
  //     CBZip2InputStream.java:527)
  //
  // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
  // simply commented out for now (and HADOOP-6852 filed). If and when the
  // latter issue is resolved--perhaps by fixing an error here--this code
  // should be reenabled. Note that the doMultipleBzip2BufferSizes() test
  // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

  CompressionInputStream cin2 = bzip2.createInputStream(in2);
  LineReader in = new LineReader(cin2);
  Text out = new Text();

  int numBytes, totalBytes = 0, lineNum = 0;
  while ((numBytes = in.readLine(out)) > 0) {
    ++lineNum;
    totalBytes += numBytes;
  }
  in.close();

  assertEquals("total uncompressed bytes in concatenated test file",
      5346, totalBytes);
  assertEquals("total uncompressed lines in concatenated test file",
      84, lineNum);
  */

  // test CBZip2InputStream with lots of different input-buffer sizes
  doMultipleBzip2BufferSizes(jobConf);
}
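For context, the pattern the commented-out block exercises (feed a codec-decoded stream into a LineReader and count lines and bytes) can be sketched on its own. This is a minimal sketch, not part of the test: the input path "input.txt.bz2" and the local Configuration are assumptions.

// Minimal sketch: decode a bzip2 file through BZip2Codec and count its lines.
// The input path is a placeholder, not one of the test files above.
import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ReflectionUtils;

public class Bzip2LineCount {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    CompressionCodec bzip2 = ReflectionUtils.newInstance(BZip2Codec.class, conf);
    try (InputStream raw = new FileInputStream("input.txt.bz2");
         CompressionInputStream cin = bzip2.createInputStream(raw)) {
      LineReader reader = new LineReader(cin);
      Text line = new Text();
      int numBytes, totalBytes = 0, lineNum = 0;
      while ((numBytes = reader.readLine(line)) > 0) {
        ++lineNum;
        totalBytes += numBytes;
      }
      System.out.println(lineNum + " lines, " + totalBytes + " uncompressed bytes");
    }
  }
}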
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache: class TestConcatenatedCompressedInput, method testGzip.
/**
 * Test using Hadoop's original, native-zlib gzip codec for reading.
 */
@Test
public void testGzip() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, jobConf);
  localFs.delete(workDir, true);

  // alternative:
  if (org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class ==
      gzip.getDecompressorType()) {
    System.out.println(COLOR_BR_RED + "testGzip() using native-zlib Decompressor (" +
        gzip.getDecompressorType() + ")" + COLOR_NORMAL);
  } else {
    LOG.warn("testGzip() skipped: native (C/C++) libs not loaded");
    return;
  }

  /*
   * // THIS IS BUGGY: omits 2nd/3rd gzip headers; screws up 2nd/3rd CRCs--
   * // see https://issues.apache.org/jira/browse/HADOOP-6799
   * Path fnHDFS = new Path(workDir, "concat" + gzip.getDefaultExtension());
   * //OutputStream out = localFs.create(fnHDFS);
   * //GzipCodec.GzipOutputStream gzOStm = new GzipCodec.GzipOutputStream(out);
   * // can just combine those two lines, probably
   * //GzipCodec.GzipOutputStream gzOStm =
   * //  new GzipCodec.GzipOutputStream(localFs.create(fnHDFS));
   * // oops, no: this is a protected helper class; need to access
   * // it via createOutputStream() instead:
   * OutputStream out = localFs.create(fnHDFS);
   * Compressor gzCmp = gzip.createCompressor();
   * CompressionOutputStream gzOStm = gzip.createOutputStream(out, gzCmp);
   * // this SHOULD be going to HDFS: got out from localFs == HDFS
   * // ...yup, works
   * gzOStm.write("first gzip concat\n member\nwith three lines\n".getBytes());
   * gzOStm.finish();
   * gzOStm.resetState();
   * gzOStm.write("2nd gzip concat member\n".getBytes());
   * gzOStm.finish();
   * gzOStm.resetState();
   * gzOStm.write("gzip concat\nmember #3\n".getBytes());
   * gzOStm.close();
   * //
   * String fn = "hdfs-to-local-concat" + gzip.getDefaultExtension();
   * Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
   * localFs.copyToLocalFile(fnHDFS, fnLocal);
   */

  // copy prebuilt (correct!) version of concat.gz to HDFS
  final String fn = "concat" + gzip.getDefaultExtension();
  Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
  Path fnHDFS = new Path(workDir, fn);
  localFs.copyFromLocalFile(fnLocal, fnHDFS);

  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "this is a test\nof gzip\n");
  FileInputFormat.setInputPaths(jobConf, workDir);

  TextInputFormat format = new TextInputFormat();
  format.configure(jobConf);
  InputSplit[] splits = format.getSplits(jobConf, 100);
  assertEquals("compressed splits == 2", 2, splits.length);

  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }

  List<Text> results = readSplit(format, splits[0], jobConf);
  assertEquals("splits[0] num lines", 6, results.size());
  assertEquals("splits[0][5]", "member #3", results.get(5).toString());

  results = readSplit(format, splits[1], jobConf);
  assertEquals("splits[1] num lines", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of gzip", results.get(1).toString());
}
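Separately from the test itself, the codec-level write/read pairing that the commented-out block above is reaching for (createOutputStream with a pooled Compressor, createInputStream with a pooled Decompressor) can be sketched as a standalone round trip. The local path and the sample text are placeholders, and this is an illustration rather than code from the test class.

// Sketch of a gzip round trip via the codec API; path and text are placeholders.
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipRoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path p = new Path("/tmp/example.txt.gz");
    CompressionCodec gzip = ReflectionUtils.newInstance(GzipCodec.class, conf);

    // write one gzip member through the codec, using a pooled Compressor
    Compressor cmp = CodecPool.getCompressor(gzip);
    try (CompressionOutputStream out = gzip.createOutputStream(fs.create(p), cmp)) {
      out.write("this is a test\nof gzip\n".getBytes(StandardCharsets.UTF_8));
      out.finish();
    } finally {
      CodecPool.returnCompressor(cmp);
    }

    // read it back through the codec, using a pooled Decompressor
    Decompressor dec = CodecPool.getDecompressor(gzip);
    try (BufferedReader r = new BufferedReader(new InputStreamReader(
             gzip.createInputStream(fs.open(p), dec), StandardCharsets.UTF_8))) {
      String line;
      while ((line = r.readLine()) != null) {
        System.out.println(line);
      }
    } finally {
      CodecPool.returnDecompressor(dec);
    }
  }
}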
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache: class FixedLengthRecordReader, method initialize.
// This is also called from the old FixedLengthRecordReader API implementation
public void initialize(Configuration job, long splitStart, long splitLength,
    Path file) throws IOException {
  start = splitStart;
  end = start + splitLength;
  long partialRecordLength = start % recordLength;
  long numBytesToSkip = 0;
  if (partialRecordLength != 0) {
    numBytesToSkip = recordLength - partialRecordLength;
  }

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
    filePosition = cIn;
    inputStream = cIn;
    numRecordsRemainingInSplit = Long.MAX_VALUE;
    LOG.info("Compressed input; cannot compute number of records in the split");
  } else {
    fileIn.seek(start);
    filePosition = fileIn;
    inputStream = fileIn;
    long splitSize = end - start - numBytesToSkip;
    numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
    if (numRecordsRemainingInSplit < 0) {
      numRecordsRemainingInSplit = 0;
    }
    LOG.info("Expecting " + numRecordsRemainingInSplit
        + " records each with a length of " + recordLength
        + " bytes in the split with an effective size of "
        + splitSize + " bytes");
  }
  if (numBytesToSkip != 0) {
    start += inputStream.skip(numBytesToSkip);
  }
  this.pos = start;
}
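The uncompressed branch above does one small piece of arithmetic: skip forward to the next record boundary, then count how many fixed-length records remain in the split, rounding up so a trailing partial record is still read. A hypothetical helper, recordsInSplit(), restates just that calculation; the method name and example values are illustrative only.

// Hypothetical helper mirroring the alignment arithmetic in initialize().
static long recordsInSplit(long splitStart, long splitLength, int recordLength) {
  long partialRecordLength = splitStart % recordLength;
  long numBytesToSkip =
      (partialRecordLength == 0) ? 0 : recordLength - partialRecordLength;
  long splitSize = splitLength - numBytesToSkip;   // effective bytes in this split
  if (splitSize <= 0) {
    return 0;
  }
  // round up: a trailing partial record still belongs to this split
  return (splitSize + recordLength - 1) / recordLength;
}

// e.g. recordsInSplit(10, 50, 20) == 2  (skip 10 bytes, then two 20-byte records)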
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache: class LineRecordReader, method initialize.
public void initialize(InputSplit genericSplit, TaskAttemptContext context)
    throws IOException {
  FileSplit split = (FileSplit) genericSplit;
  Configuration job = context.getConfiguration();
  this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
  start = split.getStart();
  end = start + split.getLength();
  final Path file = split.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);

  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream cIn =
          ((SplittableCompressionCodec) codec).createInputStream(
              fileIn, decompressor, start, end,
              SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
      start = cIn.getAdjustedStart();
      end = cIn.getAdjustedEnd();
      filePosition = cIn;
    } else {
      if (start != 0) {
        // a compression codec that cannot be split
        throw new IOException("Cannot seek in " +
            codec.getClass().getSimpleName() + " compressed stream");
      }
      in = new SplitLineReader(codec.createInputStream(fileIn, decompressor),
          job, this.recordDelimiterBytes);
      filePosition = fileIn;
    }
  } else {
    fileIn.seek(start);
    in = new UncompressedSplitLineReader(fileIn, job,
        this.recordDelimiterBytes, split.getLength());
    filePosition = fileIn;
  }
  // If this is not the first split, throw away the first (partial) record,
  // because the previous split reads one extra line in its next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
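For the splittable branch in isolation: BZip2Codec implements SplittableCompressionCodec, so a reader can hand the codec raw split offsets and let it snap them to compression-block boundaries. A minimal sketch follows, assuming a local .bz2 path and arbitrary split offsets (both placeholders).

// Sketch of opening a splittable codec at mid-file split boundaries.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.*;

public class SplittableOpen {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/big-input.bz2");
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    if (!(codec instanceof SplittableCompressionCodec)) {
      throw new IOException("codec for " + file + " is not splittable");
    }
    Decompressor decompressor = CodecPool.getDecompressor(codec);

    long start = 1000000L;   // raw (compressed) split start
    long end = 2000000L;     // raw (compressed) split end
    SplitCompressionInputStream cIn =
        ((SplittableCompressionCodec) codec).createInputStream(
            fileIn, decompressor, start, end,
            SplittableCompressionCodec.READ_MODE.BYBLOCK);

    // the codec moves both boundaries onto compression-block edges
    System.out.println("adjusted start = " + cIn.getAdjustedStart()
        + ", adjusted end = " + cIn.getAdjustedEnd());

    cIn.close();
    CodecPool.returnDecompressor(decompressor);
  }
}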
Use of org.apache.hadoop.io.compress.CompressionCodec in project hadoop by apache: class DefaultOutputter, method init.
@Override
public void init(Path path, Configuration conf) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
  OutputStream output;
  if (codec != null) {
    compressor = CodecPool.getCompressor(codec);
    output = codec.createOutputStream(fs.create(path), compressor);
  } else {
    output = fs.create(path);
  }
  writer = new JsonObjectMapperWriter<T>(output,
      conf.getBoolean("rumen.output.pretty.print", true));
}
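init() borrows a Compressor from CodecPool when the path maps to a codec, so the matching cleanup would be expected to return it when the outputter is closed. The class's actual close() is not quoted above; the following is only a sketch of that pattern, assuming the writer and compressor fields shown in init().

// Sketch only: illustrates returning the pooled Compressor on close, not the
// real DefaultOutputter.close() implementation.
@Override
public void close() throws IOException {
  try {
    writer.close();
  } finally {
    if (compressor != null) {
      CodecPool.returnCompressor(compressor);
      compressor = null;
    }
  }
}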