Example 1 with BZip2Codec

Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache: the class TestLineRecordReader, method testMultipleClose (new MapReduce API; the reader is driven via initialize() and nextKeyValue()).

@Test
public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader().getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    // read all records, then verify that close() may safely be called twice
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(split, context);
    //noinspection StatementWithEmptyBody
    while (reader.nextKeyValue()) ;
    reader.close();
    reader.close();
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i) {
        decompressors.add(CodecPool.getDecompressor(codec));
    }
    assertEquals(10, decompressors.size());
}
Also used: Path (org.apache.hadoop.fs.Path), Decompressor (org.apache.hadoop.io.compress.Decompressor), Configuration (org.apache.hadoop.conf.Configuration), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec), URL (java.net.URL), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), File (java.io.File), HashSet (java.util.HashSet), Test (org.junit.Test)
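
The final assertion depends on CodecPool semantics: none of the ten decompressors is returned to the pool, so each CodecPool.getDecompressor() call hands out a distinct instance. Below is a minimal sketch of the reuse path; it is a hypothetical illustration, not part of the Hadoop test, and uses the same classes listed above.

// Hypothetical illustration: returning a decompressor to the CodecPool
// makes that instance available to the next caller.
BZip2Codec codec = new BZip2Codec();
codec.setConf(new Configuration());
Decompressor first = CodecPool.getDecompressor(codec);
CodecPool.returnDecompressor(first);
Decompressor second = CodecPool.getDecompressor(codec);
// with pooling in effect, "second" is typically the same object as "first"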

Example 2 with BZip2Codec

Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache: the class TestLineRecordReader, method testMultipleClose (legacy mapred API; the reader is driven via next(key, value)).

@Test
public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader().getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null);
    LineRecordReader reader = new LineRecordReader(conf, split);
    LongWritable key = new LongWritable();
    Text value = new Text();
    //noinspection StatementWithEmptyBody
    while (reader.next(key, value)) ;
    reader.close();
    reader.close();
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> decompressors = new HashSet<Decompressor>();
    for (int i = 0; i < 10; ++i) {
        decompressors.add(CodecPool.getDecompressor(codec));
    }
    assertEquals(10, decompressors.size());
}
Also used: Path (org.apache.hadoop.fs.Path), Decompressor (org.apache.hadoop.io.compress.Decompressor), Configuration (org.apache.hadoop.conf.Configuration), Text (org.apache.hadoop.io.Text), BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec), URL (java.net.URL), LongWritable (org.apache.hadoop.io.LongWritable), File (java.io.File), HashSet (java.util.HashSet), Test (org.junit.Test)
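
Outside a record reader, the codec can also decompress a stream directly through createInputStream(). A minimal sketch follows; the file name input.txt.bz2 is illustrative, and java.io.InputStream, java.io.BufferedReader, java.io.InputStreamReader, and java.nio.charset.StandardCharsets are needed in addition to the classes above.

// Hypothetical sketch: line-by-line read of a bzip2 file through the codec.
BZip2Codec codec = new BZip2Codec();
codec.setConf(new Configuration());
InputStream raw = new FileInputStream("input.txt.bz2");
BufferedReader reader = new BufferedReader(
        new InputStreamReader(codec.createInputStream(raw), StandardCharsets.UTF_8));
String line;
while ((line = reader.readLine()) != null) {
    System.out.println(line);
}
// closing the reader also closes the underlying compression stream
reader.close();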

Example 3 with BZip2Codec

Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache: the class TestConcatenatedCompressedInput, method testMoreBzip2.

/**
   * Extended bzip2 test, similar to BuiltInGzipDecompressor test above.
   */
@Test
public void testMoreBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_MAGENTA + "testMoreBzip2() using non-native CBZip2InputStream (presumably)" + COLOR_NORMAL);
    // copy single-member test file to HDFS
    String fn1 = "testConcatThenCompress.txt" + bzip2.getDefaultExtension();
    Path fnLocal1 = new Path(System.getProperty("test.concat.data", "/tmp"), fn1);
    Path fnHDFS1 = new Path(workDir, fn1);
    localFs.copyFromLocalFile(fnLocal1, fnHDFS1);
    // copy multiple-member test file to HDFS
    String fn2 = "testCompressThenConcat.txt" + bzip2.getDefaultExtension();
    Path fnLocal2 = new Path(System.getProperty("test.concat.data", "/tmp"), fn2);
    Path fnHDFS2 = new Path(workDir, fn2);
    localFs.copyFromLocalFile(fnLocal2, fnHDFS2);
    FileInputFormat.setInputPaths(jobConf, workDir);
    // here's first pair of BlockDecompressorStreams:
    final FileInputStream in1 = new FileInputStream(fnLocal1.toString());
    final FileInputStream in2 = new FileInputStream(fnLocal2.toString());
    assertEquals("concat bytes available", 2567, in1.available());
    assertEquals("concat bytes available", 3056, in2.available());
    /*
    // FIXME
    // The while-loop below dies at the beginning of the 2nd concatenated
    // member (after 17 lines successfully read) with:
    //
    //   java.io.IOException: bad block header
    //   at org.apache.hadoop.io.compress.bzip2.CBZip2InputStream.initBlock(
    //   CBZip2InputStream.java:527)
    //
    // It is not critical to concatenated-gzip support, HADOOP-6835, so it's
    // simply commented out for now (and HADOOP-6852 filed).  If and when the
    // latter issue is resolved--perhaps by fixing an error here--this code
    // should be reenabled.  Note that the doMultipleBzip2BufferSizes() test
    // below uses the same testCompressThenConcat.txt.bz2 file but works fine.

    CompressionInputStream cin2 = bzip2.createInputStream(in2);
    LineReader in = new LineReader(cin2);
    Text out = new Text();

    int numBytes, totalBytes=0, lineNum=0;
    while ((numBytes = in.readLine(out)) > 0) {
      ++lineNum;
      totalBytes += numBytes;
    }
    in.close();
    assertEquals("total uncompressed bytes in concatenated test file",
                 5346, totalBytes);
    assertEquals("total uncompressed lines in concatenated test file",
                 84, lineNum);
 */
    // test CBZip2InputStream with lots of different input-buffer sizes
    doMultipleBzip2BufferSizes(jobConf);
}
Also used: Path (org.apache.hadoop.fs.Path), BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), FileInputStream (java.io.FileInputStream), Test (org.junit.Test)
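
When an input format reads files like these, it does not instantiate BZip2Codec directly; it resolves the codec from the file extension. A short sketch of that lookup (hypothetical, not part of the original test), using org.apache.hadoop.io.compress.CompressionCodecFactory:

// Hypothetical sketch: CompressionCodecFactory maps the ".bz2" extension
// to BZip2Codec, which is how TextInputFormat finds the right decompressor.
Configuration conf = new Configuration();
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
CompressionCodec codec =
        factory.getCodec(new Path("testCompressThenConcat.txt.bz2"));
// codec is a BZip2Codec instance here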

Example 4 with BZip2Codec

Use of org.apache.hadoop.io.compress.BZip2Codec in project carbondata by apache: the class CSVInputFormatTest, method generateCompressFiles.

/**
   * Generate the compressed test files; there is no need to call this method during normal test runs.
   * @throws Exception
   */
public void generateCompressFiles() throws Exception {
    String pwd = new File("src/test/resources/csv").getCanonicalPath();
    String inputFile = pwd + "/data.csv";
    FileInputStream input = new FileInputStream(inputFile);
    Configuration conf = new Configuration();
    // .gz
    String outputFile = pwd + "/data.csv.gz";
    FileOutputStream output = new FileOutputStream(outputFile);
    GzipCodec gzip = new GzipCodec();
    gzip.setConf(conf);
    CompressionOutputStream outputStream = gzip.createOutputStream(output);
    int i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();
    // .bz2
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.bz2";
    output = new FileOutputStream(outputFile);
    BZip2Codec bzip2 = new BZip2Codec();
    bzip2.setConf(conf);
    outputStream = bzip2.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();
    // .snappy
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.snappy";
    output = new FileOutputStream(outputFile);
    SnappyCodec snappy = new SnappyCodec();
    snappy.setConf(conf);
    outputStream = snappy.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();
    // .lz4
    input = new FileInputStream(inputFile);
    outputFile = pwd + "/data.csv.lz4";
    output = new FileOutputStream(outputFile);
    Lz4Codec lz4 = new Lz4Codec();
    lz4.setConf(conf);
    outputStream = lz4.createOutputStream(output);
    i = -1;
    while ((i = input.read()) != -1) {
        outputStream.write(i);
    }
    outputStream.close();
    input.close();
}
Also used: Lz4Codec (org.apache.hadoop.io.compress.Lz4Codec), CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream), Configuration (org.apache.hadoop.conf.Configuration), FileOutputStream (java.io.FileOutputStream), GzipCodec (org.apache.hadoop.io.compress.GzipCodec), BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec), File (java.io.File), SnappyCodec (org.apache.hadoop.io.compress.SnappyCodec), FileInputStream (java.io.FileInputStream)
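
A companion sketch that reads one of the generated files back through the codec. It is hypothetical, mirroring the write loop above, with pwd taken from the method; it additionally uses org.apache.hadoop.io.compress.CompressionInputStream.

// Hypothetical round-trip check: decompress data.csv.bz2 and stream the
// recovered bytes to stdout.
Configuration conf = new Configuration();
BZip2Codec bzip2 = new BZip2Codec();
bzip2.setConf(conf);
FileInputStream compressed = new FileInputStream(pwd + "/data.csv.bz2");
CompressionInputStream decompressed = bzip2.createInputStream(compressed);
int b;
while ((b = decompressed.read()) != -1) {
    System.out.write(b);
}
decompressed.close();
System.out.flush();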

Example 5 with BZip2Codec

Use of org.apache.hadoop.io.compress.BZip2Codec in project hadoop by apache: the class TestConcatenatedCompressedInput, method testBzip2.

/**
   * Test using the bzip2 codec for reading
   */
@Test
public void testBzip2() throws IOException {
    JobConf jobConf = new JobConf(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, jobConf);
    localFs.delete(workDir, true);
    System.out.println(COLOR_BR_CYAN + "testBzip2() using non-native CBZip2InputStream (presumably)" + COLOR_NORMAL);
    // copy prebuilt (correct!) version of concat.bz2 to HDFS
    final String fn = "concat" + bzip2.getDefaultExtension();
    Path fnLocal = new Path(System.getProperty("test.concat.data", "/tmp"), fn);
    Path fnHDFS = new Path(workDir, fn);
    localFs.copyFromLocalFile(fnLocal, fnHDFS);
    writeFile(localFs, new Path(workDir, "part2.txt.bz2"), bzip2, "this is a test\nof bzip2\n");
    FileInputFormat.setInputPaths(jobConf, workDir);
    // TextInputFormat extends FileInputFormat
    TextInputFormat format = new TextInputFormat();
    format.configure(jobConf);
    // work around 2-byte splits issue
    format.setMinSplitSize(256);
    // [135 splits for a 208-byte file and a 62-byte file(!)]
    InputSplit[] splits = format.getSplits(jobConf, 100);
    assertEquals("compressed splits == 2", 2, splits.length);
    FileSplit tmp = (FileSplit) splits[0];
    if (tmp.getPath().getName().equals("part2.txt.bz2")) {
        splits[0] = splits[1];
        splits[1] = tmp;
    }
    List<Text> results = readSplit(format, splits[0], jobConf);
    assertEquals("splits[0] num lines", 6, results.size());
    assertEquals("splits[0][5]", "member #3", results.get(5).toString());
    results = readSplit(format, splits[1], jobConf);
    assertEquals("splits[1] num lines", 2, results.size());
    assertEquals("splits[1][0]", "this is a test", results.get(0).toString());
    assertEquals("splits[1][1]", "of bzip2", results.get(1).toString());
}
Also used: Path (org.apache.hadoop.fs.Path), BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec), Text (org.apache.hadoop.io.Text), CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec), Test (org.junit.Test)
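
The two-split result above is possible because BZip2Codec implements SplittableCompressionCodec, so a reader can start mid-file at a compressed block boundary. A minimal sketch of that API; fs, splitStart, and splitEnd are illustrative stand-ins for a FileSystem and the boundaries of the FileSplit being processed, and org.apache.hadoop.fs.FSDataInputStream, org.apache.hadoop.io.compress.SplitCompressionInputStream, SplittableCompressionCodec, CodecPool, and Decompressor are assumed.

// Hypothetical sketch of the splittable-read API behind bzip2 splitting.
FSDataInputStream raw = fs.open(fnHDFS);
Decompressor decompressor = CodecPool.getDecompressor(bzip2);
SplitCompressionInputStream in =
        ((SplittableCompressionCodec) bzip2).createInputStream(
                raw, decompressor, splitStart, splitEnd,
                SplittableCompressionCodec.READ_MODE.BYBLOCK);
// the stream reports the block-aligned boundaries it will actually honor
long adjustedStart = in.getAdjustedStart();
long adjustedEnd = in.getAdjustedEnd();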

Aggregations

BZip2Codec (org.apache.hadoop.io.compress.BZip2Codec): 5 uses
Path (org.apache.hadoop.fs.Path): 4 uses
Test (org.junit.Test): 4 uses
File (java.io.File): 3 uses
Configuration (org.apache.hadoop.conf.Configuration): 3 uses
FileInputStream (java.io.FileInputStream): 2 uses
URL (java.net.URL): 2 uses
HashSet (java.util.HashSet): 2 uses
Text (org.apache.hadoop.io.Text): 2 uses
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec): 2 uses
Decompressor (org.apache.hadoop.io.compress.Decompressor): 2 uses
FileOutputStream (java.io.FileOutputStream): 1 use
LongWritable (org.apache.hadoop.io.LongWritable): 1 use
CompressionOutputStream (org.apache.hadoop.io.compress.CompressionOutputStream): 1 use
GzipCodec (org.apache.hadoop.io.compress.GzipCodec): 1 use
Lz4Codec (org.apache.hadoop.io.compress.Lz4Codec): 1 use
SnappyCodec (org.apache.hadoop.io.compress.SnappyCodec): 1 use
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 1 use
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 1 use
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 1 use