use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.
the class AbstractBlockBasedNodeTupleReader method initialize.
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;
    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    // Figure out what portion of the file to read
    start = split.getStart();
    long end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    boolean readToEnd = end == totalLength;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", new Object[] { start, split.getLength(), totalLength }));
    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (start > 0)
        fileIn.seek(start);
    if (this.compressionCodecs != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and re-verify readToEnd so we can abort correctly if ever
        // given a partial split of a compressed file
        end++;
        readToEnd = end == totalLength;
        if (start > 0 || !readToEnd)
            throw new IOException("This record reader can only be used with compressed input where the split is a whole file");
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        if (readToEnd) {
            input = new TrackedInputStream(fileIn);
        } else {
            // Need to limit the portion of the file we are reading
            input = new BlockInputStream(fileIn, split.getLength());
        }
    }
    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.
the class AbstractLineBasedNodeTupleReader method initialize.
@Override
public final void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;
    // Intermediate: RDFParser, but need to make an Iterator<Quad/Triple>
    LabelToNode labelToNode = RdfIOUtils.createLabelToNode(context, split.getPath());
    maker = new ParserProfileStd(RiotLib.factoryRDF(labelToNode), ErrorHandlerFactory.errorHandlerStd, IRIResolver.create(), PrefixMapFactory.createForInput(), null, true, false);
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and the bad line skipped but no errors will be thrown. Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    // Figure out what portion of the file to read
    this.maxLineLength = config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    compressionCodecs = new CompressionCodecFactory(config);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", new Object[] { start, split.getLength(), totalLength }));
    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        // Add 1 and verify we got a complete split
        if (totalLength > split.getLength() + 1)
            throw new IOException("This record reader can only be used with compressed input where the split covers the whole file");
        in = new LineReader(codec.createInputStream(fileIn), config);
        estLength = end;
        end = Long.MAX_VALUE;
    } else {
        // Uncompressed input
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, config);
    }
    // NLineInputFormat will provide the split information to use
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
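The uncompressed branch above relies on a standard Hadoop idiom: a split may begin in the middle of a line, so a reader whose start is not 0 backs up one byte, throws away the first (partial) line, and keeps reading until it has also consumed the line that straddles its own end. A standalone sketch of that idiom using only Hadoop's LineReader (the class and method names outside the Hadoop API are illustrative, not taken from Jena):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class SplitLineBoundaries {

    /** Counts the lines belonging to the split [start, end) of an already opened file. */
    public static long countLines(FSDataInputStream in, Configuration conf, long start, long end)
            throws IOException {
        boolean skipFirstLine = start != 0;
        if (skipFirstLine) {
            // Back up one byte so a line ending exactly at 'start' is not lost
            start--;
            in.seek(start);
        }
        LineReader reader = new LineReader(in, conf);
        Text line = new Text();
        long pos = start;
        if (skipFirstLine) {
            // Discard the partial first line; the previous split's reader emits it in full
            pos += reader.readLine(line);
        }
        long count = 0;
        // Keep reading while the line *started* before 'end'; each reader therefore also
        // consumes the one line that straddles its end boundary
        while (pos < end) {
            int bytesRead = reader.readLine(line);
            if (bytesRead == 0)
                break;
            pos += bytesRead;
            count++;
        }
        return count;
    }
}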
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hive by apache.
the class RCFileMapReduceRecordReader method initialize.
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fSplit = (FileSplit) split;
    Path path = fSplit.getPath();
    Configuration conf = context.getConfiguration();
    this.in = new RCFile.Reader(path.getFileSystem(conf), path, conf);
    this.end = fSplit.getStart() + fSplit.getLength();
    if (fSplit.getStart() > in.getPosition()) {
        in.sync(fSplit.getStart());
    }
    this.start = in.getPosition();
    more = start < end;
    key = new LongWritable();
    value = new BytesRefArrayWritable();
}
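initialize() above only positions the RCFile reader at the first sync point at or after the split start; the per-record work happens when records are pulled. A minimal sketch of how the matching nextKeyValue() typically looks, assuming the field names from the snippet (in, end, more, key, value) and the standard RCFile.Reader calls:

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!more)
        return false;
    // Advance to the next row; 'key' receives the number of rows read so far
    more = in.next(key);
    if (!more)
        return false;
    // Stop once the last seen sync marker falls in the next split
    if (in.lastSeenSyncPos() >= end) {
        more = false;
        return false;
    }
    // Materialise the current row's columns into the reusable value
    in.getCurrentRow(value);
    return true;
}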
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hive by apache.
the class OrcNewInputFormat method createRecordReader.
@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) inputSplit;
    Path path = fileSplit.getPath();
    Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
    return new OrcRecordReader(OrcFile.createReader(path, OrcFile.readerOptions(conf)),
            ShimLoader.getHadoopShims().getConfiguration(context),
            fileSplit.getStart(), fileSplit.getLength());
}
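Unlike the readers above, this factory method does no splitting logic itself; it hands the split boundaries straight to OrcRecordReader. Wiring it into a job is plain new-API configuration; a hedged driver sketch (the package path, input path, and mapper are placeholders, not taken from the snippet):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
// Package path assumed from Hive
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;

public class OrcReadDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "orc-read");
        // OrcNewInputFormat yields <NullWritable, OrcStruct> pairs to the mapper
        job.setInputFormatClass(OrcNewInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // ... set an appropriate Mapper<NullWritable, OrcStruct, ?, ?>, output format, etc. ...
    }
}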
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project crunch by cloudera.
the class AvroRecordReader method initialize.
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    SeekableInput in = new FsInput(split.getPath(), conf);
    DatumReader<T> datumReader = null;
    if (context.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, true)) {
        ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
        datumReader = factory.getReader(schema);
    } else {
        datumReader = new SpecificDatumReader<T>(schema);
    }
    this.reader = DataFileReader.openReader(in, datumReader);
    // sync to start
    reader.sync(split.getStart());
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
}
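As with the RCFile reader, the split boundary here is enforced when records are pulled rather than in initialize(). A minimal sketch of the matching nextKeyValue(), assuming a reusable current field for the record (the field name is an assumption) and the standard DataFileReader hasNext()/pastSync() calls:

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    // Stop when the file is exhausted or the reader has moved past this split's end sync point
    if (!reader.hasNext() || reader.pastSync(end))
        return false;
    // Reuse the previous record instance where the datum reader allows it
    current = reader.next(current);
    return true;
}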