Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache.
Class TestLineRecordReader, method testMultipleClose (new mapreduce API).
@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader()
      .getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH,
      Integer.MAX_VALUE);
  TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  // read through the whole file, then close the reader twice
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
  LineRecordReader reader = new LineRecordReader();
  reader.initialize(split, context);
  //noinspection StatementWithEmptyBody
  while (reader.nextKeyValue()) ;
  reader.close();
  reader.close();
  // the double close() must not pollute the decompressor pool:
  // ten fetches should still yield ten distinct instances
  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}
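What the final assertion protects: CodecPool hands out a distinct Decompressor per getDecompressor() call until instances are returned to the pool. Below is a minimal sketch (not from the Hadoop test; variable names are ours) of the intended get/return pairing that a double close() must not break:

// Minimal sketch of correct CodecPool usage; a close() that returns the
// same decompressor to the pool twice is exactly what the test above
// guards against.
BZip2Codec codec = new BZip2Codec();
codec.setConf(new Configuration());
Decompressor decompressor = CodecPool.getDecompressor(codec);
try {
  // ... decompress with it ...
} finally {
  CodecPool.returnDecompressor(decompressor); // return exactly once
}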
Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache.
Class TestLineRecordReader, method testMultipleClose (old mapred API).
@Test
public void testMultipleClose() throws IOException {
  URL testFileUrl = getClass().getClassLoader()
      .getResource("recordSpanningMultipleSplits.txt.bz2");
  assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
  File testFile = new File(testFileUrl.getFile());
  Path testFilePath = new Path(testFile.getAbsolutePath());
  long testFileSize = testFile.length();
  Configuration conf = new Configuration();
  conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH,
      Integer.MAX_VALUE);
  FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null);
  LineRecordReader reader = new LineRecordReader(conf, split);
  LongWritable key = new LongWritable();
  Text value = new Text();
  //noinspection StatementWithEmptyBody
  while (reader.next(key, value)) ;
  reader.close();
  reader.close();
  BZip2Codec codec = new BZip2Codec();
  codec.setConf(conf);
  Set<Decompressor> decompressors = new HashSet<Decompressor>();
  for (int i = 0; i < 10; ++i) {
    decompressors.add(CodecPool.getDecompressor(codec));
  }
  assertEquals(10, decompressors.size());
}
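The old mapred API follows the same pattern. A minimal sketch (assumed local file path) of the idiomatic read loop, using the reader's own key/value factory methods and a finally block so an explicit double close() like the one above stays harmless:

JobConf conf = new JobConf();
Path file = new Path("/tmp/sample.txt.bz2"); // assumed path
FileSplit split = new FileSplit(file, 0,
    new File("/tmp/sample.txt.bz2").length(), (String[]) null);
LineRecordReader reader = new LineRecordReader(conf, split);
try {
  LongWritable key = reader.createKey();
  Text value = reader.createValue();
  while (reader.next(key, value)) {
    // process one line held in 'value'
  }
} finally {
  reader.close(); // a second close() must be a no-op
}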
Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache.
Class TestConcatenatedCompressedInput, method testMoreBzip2.
/**
 * Extended bzip2 test, similar to the BuiltInGzipDecompressor test above.
 */
@Test
public void testMoreBzip2() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, jobConf);
  localFs.delete(workDir, true);
  System.out.println(COLOR_BR_MAGENTA +
      "testMoreBzip2() using non-native CBZip2InputStream (presumably)" +
      COLOR_NORMAL);
  // copy single-member test file to HDFS
  String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
  Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
  Path fnHDFS1 = new Path(workDir, fn1);
  localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
  // copy multiple-member test file to HDFS
  String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
  Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
  Path fnHDFS2 = new Path(workDir, fn2);
  localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
  FileInputFormat.setInputPaths(jobConf, workDir);
  // here's first pair of BlockDecompressorStreams:
  final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
  final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
  assertEquals("concat bytes available", 2567, in1.available());
  assertEquals("concat bytes available", 3056, in2.available());
  /*
  // FIXME
  // The while-loop below dies at the beginning of the 2nd concatenated
  // member (after 17 lines successfully read) with:
  //
  //   java.io.IOException: bad block header
  //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
  //   CBZip2InputStream.java:527)
  //
  // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
  // simply commented out for now (and HADOOP-6852 filed). If and when the
  // latter issue is resolved--perhaps by fixing an error here--this code
  // should be reenabled. Note that the doMultipleBzip2BufferSizes() test
  // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

  CompressionInputStream cin2 = bzip2.createInputStream(in2);
  LineReader in = new LineReader(cin2);
  Text out = new Text();
  int numBytes, totalBytes = 0, lineNum = 0;
  while ((numBytes = in.readLine(out)) > 0) {
    ++lineNum;
    totalBytes += numBytes;
  }
  in.close();
  assertEquals("total uncompressed bytes in concatenated test file",
      5346, totalBytes);
  assertEquals("total uncompressed lines in concatenated test file",
      84, lineNum);
  */
  // test CBZip2InputStream with lots of different input-buffer sizes
  doMultipleBzip2BufferSizes(jobConf);
}
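For reference, the commented-out read can be expressed as a standalone sketch (assumed fixture path; the byte and line counts depend on the fixture file, so none are asserted here):

BZip2Codec bzip2codec = new BZip2Codec();
bzip2codec.setConf(new Configuration());
InputStream raw = new FileInputStream("/tmp/testCompressThenConcat.txt.bz2");
CompressionInputStream cin = bzip2codec.createInputStream(raw);
LineReader lines = new LineReader(cin);
Text line = new Text();
int numBytes, totalBytes = 0, lineNum = 0;
while ((numBytes = lines.readLine(line)) > 0) {
  ++lineNum;
  totalBytes += numBytes;
}
lines.close(); // also closes the wrapped streams
System.out.println(lineNum + " lines, " + totalBytes + " uncompressed bytes");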
Use of org.apache.hadoop.io.compress.BZip2Codec in project carbondata by apache.
Class CSVInputFormatTest, method generateCompressFiles.
/**
 * Generates the compressed test-data files. The generated outputs are kept
 * under src/test/resources/csv, so there is normally no need to call this
 * method.
 * @throws Exception
 */
public void generateCompressFiles() throws Exception {
  String pwd = new File("src/test/resources/csv").getCanonicalPath();
  String inputFile = pwd + "/data.csv";
  FileInputStream input = new FileInputStream(inputFile);
  Configuration conf = new Configuration();
  // .gz
  String outputFile = pwd + "/data.csv.gz";
  FileOutputStream output = new FileOutputStream(outputFile);
  GzipCodec gzip = new GzipCodec();
  gzip.setConf(conf);
  CompressionOutputStream outputStream = gzip.createOutputStream(output);
  int i = -1;
  while ((i = input.read()) != -1) {
    outputStream.write(i);
  }
  outputStream.close();
  input.close();
  // .bz2
  input = new FileInputStream(inputFile);
  outputFile = pwd + "/data.csv.bz2";
  output = new FileOutputStream(outputFile);
  BZip2Codec bzip2 = new BZip2Codec();
  bzip2.setConf(conf);
  outputStream = bzip2.createOutputStream(output);
  while ((i = input.read()) != -1) {
    outputStream.write(i);
  }
  outputStream.close();
  input.close();
  // .snappy
  input = new FileInputStream(inputFile);
  outputFile = pwd + "/data.csv.snappy";
  output = new FileOutputStream(outputFile);
  SnappyCodec snappy = new SnappyCodec();
  snappy.setConf(conf);
  outputStream = snappy.createOutputStream(output);
  while ((i = input.read()) != -1) {
    outputStream.write(i);
  }
  outputStream.close();
  input.close();
  // .lz4
  input = new FileInputStream(inputFile);
  outputFile = pwd + "/data.csv.lz4";
  output = new FileOutputStream(outputFile);
  Lz4Codec lz4 = new Lz4Codec();
  lz4.setConf(conf);
  outputStream = lz4.createOutputStream(output);
  while ((i = input.read()) != -1) {
    outputStream.write(i);
  }
  outputStream.close();
  input.close();
}
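The byte-at-a-time copy works but is slow and repetitive. A sketch of how the four loops could be factored into one helper (the name compressFile is ours, not CarbonData's) using Hadoop's org.apache.hadoop.io.IOUtils to stream in 4 KB buffers:

private static void compressFile(String src, String dest,
    CompressionCodec codec) throws IOException {
  try (InputStream in = new FileInputStream(src);
       CompressionOutputStream out =
           codec.createOutputStream(new FileOutputStream(dest))) {
    // copyBytes streams in 4 KB chunks; 'false' leaves closing to
    // try-with-resources
    IOUtils.copyBytes(in, out, 4096, false);
  }
}

// e.g. for the gzip case above:
//   GzipCodec gzip = new GzipCodec();
//   gzip.setConf(conf);
//   compressFile(pwd + "/data.csv", pwd + "/data.csv.gz", gzip);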
Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache.
Class TestConcatenatedCompressedInput, method testBzip2.
/**
 * Test using the bzip2 codec for reading.
 */
@Test
public void testBzip2() throws IOException {
  JobConf jobConf = new JobConf(defaultConf);
  CompressionCodec bzip2 = new BZip2Codec();
  ReflectionUtils.setConf(bzip2, jobConf);
  localFs.delete(workDir, true);
  System.out.println(COLOR_BR_CYAN +
      "testBzip2() using non-native CBZip2InputStream (presumably)" +
      COLOR_NORMAL);
  // copy prebuilt (correct!) version of concat.bz2 to HDFS
  final String fn = "concat" + bzip2.getDefaultExtension();
  Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
  Path fnHDFS = new Path(workDir, fn);
  localFs.copyFromLocalFile(fnLocal, fnHDFS);
  writeFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2,
      "this is a test\nof bzip2\n");
  FileInputFormat.setInputPaths(jobConf, workDir);
  // TextInputFormat extends FileInputFormat
  TextInputFormat format = new TextInputFormat();
  format.configure(jobConf);
  // work around the 2-byte-splits issue
  format.setMinSplitSize(256);
  // [135 splits for a 208-byte file and a 62-byte file(!)]
  InputSplit[] splits = format.getSplits(jobConf, 100);
  assertEquals("compressed splits == 2", 2, splits.length);
  // ensure the concat.bz2 split is first
  FileSplit tmp = (FileSplit) splits[0];
  if (tmp.getPath().getName().equals("part2.txt.bz2")) {
    splits[0] = splits[1];
    splits[1] = tmp;
  }
  List<Text> results = readSplit(format, splits[0], jobConf);
  assertEquals("splits[0] num lines", 6, results.size());
  assertEquals("splits[0][5]", "member #3", results.get(5).toString());
  results = readSplit(format, splits[1], jobConf);
  assertEquals("splits[1] num lines", 2, results.size());
  assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
  assertEquals("splits[1][1]", "of bzip2", results.get(1).toString());
}
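Outside the InputFormat machinery, the same codec resolution can be done directly. A minimal sketch (assumed path) using CompressionCodecFactory, which is how TextInputFormat maps the .bz2 extension back to BZip2Codec:

Configuration conf = new Configuration();
Path file = new Path("/tmp/part2.txt.bz2"); // assumed path
CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
// with the default codec list, codec is a BZip2Codec instance here
try (InputStream in = codec.createInputStream(
         file.getFileSystem(conf).open(file));
     BufferedReader reader = new BufferedReader(
         new InputStreamReader(in, StandardCharsets.UTF_8))) {
  String line;
  while ((line = reader.readLine()) != null) {
    System.out.println(line);
  }
}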