use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.
the class CompressionEmulationUtil method getPossiblyDecompressedInputStream.
/**
 * Returns an {@link InputStream} for a file that might be compressed.
 */
static InputStream getPossiblyDecompressedInputStream(Path file, Configuration conf, long offset)
    throws IOException {
  FileSystem fs = file.getFileSystem(conf);
  if (isCompressionEmulationEnabled(conf) && isInputCompressionEmulationEnabled(conf)) {
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec != null) {
      Decompressor decompressor = CodecPool.getDecompressor(codec);
      if (decompressor != null) {
        CompressionInputStream in = codec.createInputStream(fs.open(file), decompressor);
        // Use SplittableCompressionCodec?
        return (InputStream) in;
      }
    }
  }
  FSDataInputStream in = fs.open(file);
  in.seek(offset);
  return (InputStream) in;
}
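The same codec-detection pattern works outside Gridmix. The sketch below is a minimal, self-contained example (the class name and command-line path are assumptions, not part of the Hadoop source): it resolves a codec from the file extension with CompressionCodecFactory, borrows a Decompressor from CodecPool, copies the decompressed bytes to standard output, and falls back to a raw copy when no codec matches.

import java.io.InputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;

public class PossiblyCompressedCat {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path(args[0]);
    FileSystem fs = file.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    if (codec == null) {
      // No codec matched the file extension: stream the file as-is.
      try (InputStream raw = fs.open(file)) {
        IOUtils.copyBytes(raw, System.out, 4096, false);
      }
      return;
    }
    Decompressor decompressor = CodecPool.getDecompressor(codec);
    try (InputStream in = codec.createInputStream(fs.open(file), decompressor)) {
      IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
      // Always return the pooled decompressor so it can be reused.
      CodecPool.returnDecompressor(decompressor);
    }
  }
}

Unlike the Gridmix method above, this sketch ignores the offset argument entirely; seeking only makes sense on the uncompressed path.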
use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.
the class TestConcatenatedCompressedInput method testBuiltInGzipDecompressor.
/**
 * Test using the new BuiltInGzipDecompressor codec for reading gzip files.
 */
// NOTE: This fails on RHEL4 with "java.io.IOException: header crc mismatch"
// due to buggy version of zlib (1.2.1.2) included.
@Test
public void testBuiltInGzipDecompressor() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);
  CompressionCodec gzip = new GzipCodec();
  ReflectionUtils.setConf(gzip, jobConf);
  localFs.delete(workDir, true);
  // Don't use native libs for this test
  ZlibFactory.setNativeZlibLoaded(false);
  assertEquals("[non-native (Java) codec]",
      org.apache.hadoop.io.compress.zlib.BuiltInGzipDecompressor.class,
      gzip.getDecompressorType());
  System.out.println(COLOR_BR_YELLOW + "testBuiltInGzipDecompressor() using"
      + " non-native (Java Inflater) Decompressor (" + gzip.getDecompressorType() + ")"
      + COLOR_NORMAL);
  // copy single-member test file to HDFS
  String fn1 = "testConcatThenCompress.txt" + gzip.getDefaultExtension();
  Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
  Path fnHDFS1 = new Path(workDir, fn1);
  localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
  // copy multiple-member test file to HDFS
  // (actually in "seekable gzip" format, a la JIRA PIG-42)
  String fn2 = "testCompressThenConcat.txt" + gzip.getDefaultExtension();
  Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
  Path fnHDFS2 = new Path(workDir, fn2);
  localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
  FileInputFormat.setInputPaths(jobConf, workDir);
  // here's first pair of DecompressorStreams:
  final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
  final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
  assertEquals("concat bytes available", 2734, in1.available());
  // w/hdr CRC
  assertEquals("concat bytes available", 3413, in2.available());
  CompressionInputStream cin2 = gzip.createInputStream(in2);
  LineReader in = new LineReader(cin2);
  Text out = new Text();
  int numBytes, totalBytes = 0, lineNum = 0;
  while ((numBytes = in.readLine(out)) > 0) {
    ++lineNum;
    totalBytes += numBytes;
  }
  in.close();
  assertEquals("total uncompressed bytes in concatenated test file", 5346, totalBytes);
  assertEquals("total uncompressed lines in concatenated test file", 84, lineNum);
  ZlibFactory.loadNativeZLib();
  // test GzipZlibDecompressor (native), just to be sure
  // (FIXME? could move this call to testGzip(), but would need filename
  // setup above) (alternatively, maybe just nuke testGzip() and extend this?)
  doMultipleGzipBufferSizes(jobConf, true);
}
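Outside the test harness, the reading path shown here boils down to wrapping a raw stream in GzipCodec.createInputStream() and feeding it to LineReader. A minimal sketch (the class name and the command-line path to a local .gz file are assumptions) that counts lines and uncompressed bytes the same way the test does:

import java.io.FileInputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.LineReader;
import org.apache.hadoop.util.ReflectionUtils;

public class GzipLineCount {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // newInstance() both constructs the codec and injects the Configuration.
    CompressionCodec gzip = ReflectionUtils.newInstance(GzipCodec.class, conf);
    try (CompressionInputStream cin =
             gzip.createInputStream(new FileInputStream(args[0]))) {
      LineReader reader = new LineReader(cin);
      Text line = new Text();
      long lines = 0, bytes = 0;
      int n;
      while ((n = reader.readLine(line)) > 0) {
        lines++;
        bytes += n;
      }
      System.out.println(lines + " lines, " + bytes + " uncompressed bytes");
    }
  }
}

Whether the built-in Java decompressor or the native zlib one is used depends on library availability, which is exactly what the test toggles with ZlibFactory.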
use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.
the class FixedLengthRecordReader method initialize.
// This is also called from the old FixedLengthRecordReader API implementation
public void initialize(Configuration job, long splitStart, long splitLength, Path file)
    throws IOException {
  start = splitStart;
  end = start + splitLength;
  long partialRecordLength = start % recordLength;
  long numBytesToSkip = 0;
  if (partialRecordLength != 0) {
    numBytesToSkip = recordLength - partialRecordLength;
  }
  // open the file and seek to the start of the split
  final FileSystem fs = file.getFileSystem(job);
  fileIn = fs.open(file);
  CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
  if (null != codec) {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
    filePosition = cIn;
    inputStream = cIn;
    numRecordsRemainingInSplit = Long.MAX_VALUE;
    LOG.info("Compressed input; cannot compute number of records in the split");
  } else {
    fileIn.seek(start);
    filePosition = fileIn;
    inputStream = fileIn;
    long splitSize = end - start - numBytesToSkip;
    numRecordsRemainingInSplit = (splitSize + recordLength - 1) / recordLength;
    if (numRecordsRemainingInSplit < 0) {
      numRecordsRemainingInSplit = 0;
    }
    LOG.info("Expecting " + numRecordsRemainingInSplit + " records each with a length of "
        + recordLength + " bytes in the split with an effective size of "
        + splitSize + " bytes");
  }
  if (numBytesToSkip != 0) {
    start += inputStream.skip(numBytesToSkip);
  }
  this.pos = start;
}
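The recordLength used above comes from the job configuration. A minimal driver sketch (the class name, job name, and input path are placeholders) showing how FixedLengthInputFormat is wired up with the new mapreduce API so that this record reader is used:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;

public class FixedLengthJobSetup {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Every record is exactly 100 bytes; splits that start mid-record are
    // realigned by the numBytesToSkip logic in initialize() above.
    FixedLengthInputFormat.setRecordLength(conf, 100);
    Job job = Job.getInstance(conf, "fixed-length-read");
    job.setJarByClass(FixedLengthJobSetup.class);
    job.setInputFormatClass(FixedLengthInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    // Mapper/reducer setup omitted; this sketch only covers the input side.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

As the compressed branch of initialize() shows, a file handled by a CompressionCodec is read as one stream and the number of records in the split cannot be computed up front.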
use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.
the class TestTextOutputFormat method testCompress.
/**
 * Test writing a compressed output file and reading it back.
 * @throws IOException
 */
@Test
public void testCompress() throws IOException {
  JobConf job = new JobConf();
  job.set(JobContext.TASK_ATTEMPT_ID, attempt);
  job.set(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.COMPRESS, "true");
  FileOutputFormat.setOutputPath(job, workDir.getParent().getParent());
  FileOutputFormat.setWorkOutputPath(job, workDir);
  FileSystem fs = workDir.getFileSystem(job);
  if (!fs.mkdirs(workDir)) {
    fail("Failed to create output directory");
  }
  String file = "test_compress.txt";
  // A reporter that does nothing
  Reporter reporter = Reporter.NULL;
  TextOutputFormat<Object, Object> theOutputFormat = new TextOutputFormat<Object, Object>();
  RecordWriter<Object, Object> theRecordWriter =
      theOutputFormat.getRecordWriter(localFs, job, file, reporter);
  Text key1 = new Text("key1");
  Text key2 = new Text("key2");
  Text val1 = new Text("val1");
  Text val2 = new Text("val2");
  NullWritable nullWritable = NullWritable.get();
  try {
    theRecordWriter.write(key1, val1);
    theRecordWriter.write(null, nullWritable);
    theRecordWriter.write(null, val1);
    theRecordWriter.write(nullWritable, val2);
    theRecordWriter.write(key2, nullWritable);
    theRecordWriter.write(key1, null);
    theRecordWriter.write(null, null);
    theRecordWriter.write(key2, val2);
  } finally {
    theRecordWriter.close(reporter);
  }
  StringBuffer expectedOutput = new StringBuffer();
  expectedOutput.append(key1).append("\t").append(val1).append("\n");
  expectedOutput.append(val1).append("\n");
  expectedOutput.append(val2).append("\n");
  expectedOutput.append(key2).append("\n");
  expectedOutput.append(key1).append("\n");
  expectedOutput.append(key2).append("\t").append(val2).append("\n");
  DefaultCodec codec = new DefaultCodec();
  codec.setConf(job);
  Path expectedFile = new Path(workDir, file + codec.getDefaultExtension());
  final FileInputStream istream = new FileInputStream(expectedFile.toString());
  CompressionInputStream cistream = codec.createInputStream(istream);
  LineReader reader = new LineReader(cistream);
  String output = "";
  Text out = new Text();
  while (reader.readLine(out) > 0) {
    output += out;
    output += "\n";
  }
  reader.close();
  assertEquals(expectedOutput.toString(), output);
}
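In a regular job driver, the same output compression is usually enabled through FileOutputFormat helpers rather than by setting the COMPRESS property directly. A minimal sketch with the new mapreduce API (the class name, job name, output path, and the choice of GzipCodec are assumptions):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class CompressedTextOutputSetup {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "compressed-text-output");
    job.setJarByClass(CompressedTextOutputSetup.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // Write part files with the codec's extension (.gz here); readers must
    // wrap them with codec.createInputStream(), as the test above does
    // with DefaultCodec.
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
    FileOutputFormat.setOutputPath(job, new Path(args[0]));
    // Mapper/reducer setup omitted; this sketch only covers the output side.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}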
use of org.apache.hadoop.io.compress.CompressionInputStream in project hadoop by apache.
the class TestLz4CompressorDecompressor method testCompressorDecopressorLogicWithCompressionStreams.
// test compress/decompress process through CompressionOutputStream/CompressionInputStream api
@Test
public void testCompressorDecopressorLogicWithCompressionStreams() {
  DataOutputStream deflateOut = null;
  DataInputStream inflateIn = null;
  int BYTE_SIZE = 1024 * 100;
  byte[] bytes = generate(BYTE_SIZE);
  int bufferSize = 262144;
  int compressionOverhead = (bufferSize / 6) + 32;
  try {
    DataOutputBuffer compressedDataBuffer = new DataOutputBuffer();
    CompressionOutputStream deflateFilter = new BlockCompressorStream(compressedDataBuffer,
        new Lz4Compressor(bufferSize), bufferSize, compressionOverhead);
    deflateOut = new DataOutputStream(new BufferedOutputStream(deflateFilter));
    deflateOut.write(bytes, 0, bytes.length);
    deflateOut.flush();
    deflateFilter.finish();
    DataInputBuffer deCompressedDataBuffer = new DataInputBuffer();
    deCompressedDataBuffer.reset(compressedDataBuffer.getData(), 0, compressedDataBuffer.getLength());
    CompressionInputStream inflateFilter = new BlockDecompressorStream(deCompressedDataBuffer,
        new Lz4Decompressor(bufferSize), bufferSize);
    inflateIn = new DataInputStream(new BufferedInputStream(inflateFilter));
    byte[] result = new byte[BYTE_SIZE];
    inflateIn.read(result);
    assertArrayEquals("original array not equals compress/decompressed array", result, bytes);
  } catch (IOException e) {
    fail("testLz4CompressorDecopressorLogicWithCompressionStreams ex error !!!");
  } finally {
    try {
      if (deflateOut != null)
        deflateOut.close();
      if (inflateIn != null)
        inflateIn.close();
    } catch (Exception e) {
    }
  }
}
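The block compressor/decompressor streams above are normally obtained through a codec rather than constructed by hand. Below is a minimal in-memory round-trip sketch using the CompressionCodec interface; the class name is a placeholder, and it assumes an LZ4-capable codec is available in the running Hadoop build (older releases require the native Hadoop library for Lz4Codec, and any other CompressionCodec such as DefaultCodec can be substituted).

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.Arrays;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Lz4Codec;
import org.apache.hadoop.util.ReflectionUtils;

public class Lz4RoundTrip {
  public static void main(String[] args) throws Exception {
    byte[] original = new byte[100 * 1024];
    new Random(0).nextBytes(original);
    CompressionCodec codec = ReflectionUtils.newInstance(Lz4Codec.class, new Configuration());
    // Compress into an in-memory buffer.
    ByteArrayOutputStream compressed = new ByteArrayOutputStream();
    CompressionOutputStream out = codec.createOutputStream(compressed);
    out.write(original);
    out.close();  // close() finishes the stream and flushes the last block
    // Decompress and verify the round trip.
    byte[] restored = new byte[original.length];
    try (CompressionInputStream in =
             codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()))) {
      IOUtils.readFully(in, restored, 0, restored.length);
    }
    System.out.println("round trip ok: " + Arrays.equals(original, restored));
  }
}

Using IOUtils.readFully avoids the pitfall in the test above, where a single read(result) call is not guaranteed to fill the buffer.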