
Example 1 with SplitCompressionInputStream

Use of org.apache.hadoop.io.compress.SplitCompressionInputStream in project hadoop by apache.

From the class LineRecordReader, method initialize:

public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (start != 0) {
                // So we have a split that is only part of a file stored using
                // a Compression codec that cannot be split.
                throw new IOException("Cannot seek in " + codec.getClass().getSimpleName() + " compressed stream");
            }
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new UncompressedSplitLineReader(fileIn, job, this.recordDelimiterBytes, split.getLength());
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
Also used :
Path (org.apache.hadoop.fs.Path)
SplittableCompressionCodec (org.apache.hadoop.io.compress.SplittableCompressionCodec)
Configuration (org.apache.hadoop.conf.Configuration)
SplitCompressionInputStream (org.apache.hadoop.io.compress.SplitCompressionInputStream)
Text (org.apache.hadoop.io.Text)
IOException (java.io.IOException)
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory)
FileSystem (org.apache.hadoop.fs.FileSystem)
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
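
For comparison, here is a minimal standalone sketch of the same boundary-adjustment call outside a record reader. The class name SplitBoundaryDemo, the file path, and the split range are made up for illustration; BZip2Codec is used because it is Hadoop's stock SplittableCompressionCodec. Only the CodecPool and createInputStream usage mirrors the example above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.util.ReflectionUtils;

public class SplitBoundaryDemo {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input; any bzip2 file reachable from this configuration works.
        Path file = new Path("/data/input.bz2");
        FileSystem fs = file.getFileSystem(conf);
        long fileLen = fs.getFileStatus(file).getLen();
        // Pretend the framework handed us the second half of the file as a split.
        long start = fileLen / 2;
        long end = fileLen;
        FSDataInputStream rawIn = fs.open(file);
        SplittableCompressionCodec codec = ReflectionUtils.newInstance(BZip2Codec.class, conf);
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        try {
            // BYBLOCK mode snaps the requested byte range to the nearest
            // compressed-block boundaries, which is why LineRecordReader
            // re-reads start and end from the returned stream.
            SplitCompressionInputStream cIn = codec.createInputStream(
                    rawIn, decompressor, start, end,
                    SplittableCompressionCodec.READ_MODE.BYBLOCK);
            System.out.println("requested start = " + start + ", adjusted = " + cIn.getAdjustedStart());
            System.out.println("requested end   = " + end + ", adjusted = " + cIn.getAdjustedEnd());
            cIn.close();
        } finally {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}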

Example 2 with SplitCompressionInputStream

Use of org.apache.hadoop.io.compress.SplitCompressionInputStream in project shifu by ShifuML.

From the class CombineRecordReader, method initializeOne:

public void initializeOne(FileSplit split, TaskAttemptContext context) throws IOException {
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);
    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, job, this.recordDelimiterBytes);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
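            // Note: unlike the Hadoop LineRecordReader above, this branch does not
            // reject a non-zero start for a codec that cannot be split.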
            in = new SplitLineReader(codec.createInputStream(fileIn, decompressor), job, this.recordDelimiterBytes);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
Also used :
Path (org.apache.hadoop.fs.Path)
SplitLineReader (org.apache.hadoop.mapreduce.lib.input.SplitLineReader)
CompressedSplitLineReader (org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader)
SplittableCompressionCodec (org.apache.hadoop.io.compress.SplittableCompressionCodec)
Configuration (org.apache.hadoop.conf.Configuration)
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory)
SplitCompressionInputStream (org.apache.hadoop.io.compress.SplitCompressionInputStream)
FileSystem (org.apache.hadoop.fs.FileSystem)
Text (org.apache.hadoop.io.Text)
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec)
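
Both readers end with the same idiom: a split that does not start at byte 0 discards everything up to the first delimiter, since the reader for the previous split consumes one extra line past its own end (see the restored comment above). A self-contained sketch of that idiom using org.apache.hadoop.util.LineReader; the class name, sample data, and split offset are made up for illustration.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class SkipFirstPartialLineDemo {

    public static void main(String[] args) throws Exception {
        byte[] data = "alpha\nbravo\ncharlie\n".getBytes(StandardCharsets.UTF_8);
        // Made-up split boundary that lands in the middle of "alpha".
        int start = 3;
        LineReader in = new LineReader(
                new ByteArrayInputStream(data, start, data.length - start));
        if (start != 0) {
            // Throw away the tail of the record owned by the previous split
            // ("ha\n" here); readLine returns the number of bytes consumed.
            start += in.readLine(new Text());
        }
        Text line = new Text();
        while (in.readLine(line) > 0) {
            // Prints "bravo" and "charlie"; "alpha" belongs to the previous split.
            System.out.println(line);
        }
        in.close();
    }
}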

Aggregations

Configuration (org.apache.hadoop.conf.Configuration) 2
FileSystem (org.apache.hadoop.fs.FileSystem) 2
Path (org.apache.hadoop.fs.Path) 2
Text (org.apache.hadoop.io.Text) 2
CompressionCodec (org.apache.hadoop.io.compress.CompressionCodec) 2
CompressionCodecFactory (org.apache.hadoop.io.compress.CompressionCodecFactory) 2
SplitCompressionInputStream (org.apache.hadoop.io.compress.SplitCompressionInputStream) 2
SplittableCompressionCodec (org.apache.hadoop.io.compress.SplittableCompressionCodec) 2
IOException (java.io.IOException) 1
CompressedSplitLineReader (org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader) 1
SplitLineReader (org.apache.hadoop.mapreduce.lib.input.SplitLineReader) 1