use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.
the class AbstractBlockBasedNodeTupleReader method initialize.
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;
    // Configuration
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and further parsing aborted but no user visible errors will be thrown. Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    // Figure out what portion of the file to read
    start = split.getStart();
    long end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    boolean readToEnd = end == totalLength;
    CompressionCodecFactory factory = new CompressionCodecFactory(config);
    this.compressionCodecs = factory.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", new Object[] { start, split.getLength(), totalLength }));
    // Open the file and prepare the input stream
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    this.length = split.getLength();
    if (start > 0)
        fileIn.seek(start);
    if (this.compressionCodecs != null) {
        // Compressed input
        // For compressed input NLineInputFormat will have failed to find
        // any line breaks and will give us a split from 0 -> (length - 1)
        // Add 1 and re-verify readToEnd so we can abort correctly if ever
        // given a partial split of a compressed file
        end++;
        readToEnd = end == totalLength;
        if (start > 0 || !readToEnd)
            throw new IOException("This record reader can only be used with compressed input where the split is a whole file");
        input = new TrackedInputStream(this.compressionCodecs.createInputStream(fileIn));
    } else {
        if (readToEnd) {
            input = new TrackedInputStream(fileIn);
        } else {
            // Need to limit the portion of the file we are reading
            input = new BlockInputStream(fileIn, split.getLength());
        }
    }
    // Set up background thread for parser
    iter = this.getPipedIterator();
    this.stream = this.getPipedStream(iter, this.input);
    RDFParserBuilder builder = RdfIOUtils.createRDFParserBuilder(context, file);
    Runnable parserRunnable = this.createRunnable(this, this.input, stream, this.getRdfLanguage(), builder);
    this.parserThread = new Thread(parserRunnable);
    this.parserThread.setDaemon(true);
    this.parserThread.start();
}
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project jena by apache.
the class AbstractLineBasedNodeTupleReader method initialize.
@Override
public final void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    LOG.debug("initialize({}, {})", genericSplit, context);
    // Assuming file split
    if (!(genericSplit instanceof FileSplit))
        throw new IOException("This record reader only supports FileSplit inputs");
    FileSplit split = (FileSplit) genericSplit;
    // Intermediate: RDFParser, but need to make an Iterator<Quad/Triple>
    LabelToNode labelToNode = RdfIOUtils.createLabelToNode(context, split.getPath());
    maker = new ParserProfileStd(RiotLib.factoryRDF(labelToNode), ErrorHandlerFactory.errorHandlerStd, IRIResolver.create(), PrefixMapFactory.createForInput(), null, true, false);
    Configuration config = context.getConfiguration();
    this.ignoreBadTuples = config.getBoolean(RdfIOConstants.INPUT_IGNORE_BAD_TUPLES, true);
    if (this.ignoreBadTuples)
        LOG.warn("Configured to ignore bad tuples, parsing errors will be logged and the bad line skipped but no errors will be thrown. Consider setting {} to false to disable this behaviour", RdfIOConstants.INPUT_IGNORE_BAD_TUPLES);
    // Figure out what portion of the file to read
    this.maxLineLength = config.getInt(HadoopIOConstants.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    long totalLength = file.getFileSystem(context.getConfiguration()).getFileStatus(file).getLen();
    compressionCodecs = new CompressionCodecFactory(config);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    LOG.info(String.format("Got split with start %d and length %d for file with total length of %d", new Object[] { start, split.getLength(), totalLength }));
    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(config);
    FSDataInputStream fileIn = fs.open(file);
    boolean skipFirstLine = false;
    if (codec != null) {
        // Add 1 and verify we got a complete split
        if (totalLength > split.getLength() + 1)
            throw new IOException("This record reader can only be used with compressed input where the split covers the whole file");
        in = new LineReader(codec.createInputStream(fileIn), config);
        estLength = end;
        end = Long.MAX_VALUE;
    } else {
        // Uncompressed input
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LineReader(fileIn, config);
    }
    // NLineInputFormat will provide the split information to use
    if (skipFirstLine) {
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
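The uncompressed branch above relies on a standard Hadoop idiom: a split may begin in the middle of a line, so a reader whose start is not 0 backs up one byte, throws away the first (partial) line, and keeps reading until it has also consumed the line that straddles its own end. A standalone sketch of that idiom using only Hadoop's LineReader (the class and method names outside the Hadoop API are illustrative, not taken from Jena):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class SplitLineBoundaries {

    /** Counts the lines belonging to the split [start, end) of an already opened file. */
    public static long countLines(FSDataInputStream in, Configuration conf, long start, long end)
            throws IOException {
        boolean skipFirstLine = start != 0;
        if (skipFirstLine) {
            // Back up one byte so a line ending exactly at 'start' is not lost
            start--;
            in.seek(start);
        }
        LineReader reader = new LineReader(in, conf);
        Text line = new Text();
        long pos = start;
        if (skipFirstLine) {
            // Discard the partial first line; the previous split's reader emits it in full
            pos += reader.readLine(line);
        }
        long count = 0;
        // Keep reading while the line *started* before 'end'; each reader therefore also
        // consumes the one line that straddles its end boundary
        while (pos < end) {
            int bytesRead = reader.readLine(line);
            if (bytesRead == 0)
                break;
            pos += bytesRead;
            count++;
        }
        return count;
    }
}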
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hive by apache.
the class RCFileMapReduceRecordReader method initialize.
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fSplit = (FileSplit) split;
    Path path = fSplit.getPath();
    Configuration conf = context.getConfiguration();
    this.in = new RCFile.Reader(path.getFileSystem(conf), path, conf);
    this.end = fSplit.getStart() + fSplit.getLength();
    if (fSplit.getStart() > in.getPosition()) {
        in.sync(fSplit.getStart());
    }
    this.start = in.getPosition();
    more = start < end;
    key = new LongWritable();
    value = new BytesRefArrayWritable();
}
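initialize() above only positions the RCFile reader at the first sync point at or after the split start; the per-record work happens when records are pulled. A minimal sketch of how the matching nextKeyValue() typically looks, assuming the field names from the snippet (in, end, more, key, value) and the standard RCFile.Reader calls:

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!more)
        return false;
    // Advance to the next row; 'key' receives the number of rows read so far
    more = in.next(key);
    if (!more)
        return false;
    // Stop once the last seen sync marker falls in the next split
    if (in.lastSeenSyncPos() >= end) {
        more = false;
        return false;
    }
    // Materialise the current row's columns into the reusable value
    in.getCurrentRow(value);
    return true;
}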
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project hive by apache.
the class OrcNewInputFormat method createRecordReader.
@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(InputSplit inputSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) inputSplit;
    Path path = fileSplit.getPath();
    Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
    return new OrcRecordReader(OrcFile.createReader(path, OrcFile.readerOptions(conf)),
            ShimLoader.getHadoopShims().getConfiguration(context),
            fileSplit.getStart(), fileSplit.getLength());
}
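Unlike the readers above, this factory method does no splitting logic itself; it hands the split boundaries straight to OrcRecordReader. Wiring it into a job is plain new-API configuration; a hedged driver sketch (the package path, input path, and mapper are placeholders, not taken from the snippet):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
// Package path assumed from Hive
import org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat;

public class OrcReadDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "orc-read");
        // OrcNewInputFormat yields <NullWritable, OrcStruct> pairs to the mapper
        job.setInputFormatClass(OrcNewInputFormat.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // ... set an appropriate Mapper<NullWritable, OrcStruct, ?, ?>, output format, etc. ...
    }
}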
use of org.apache.hadoop.mapreduce.lib.input.FileSplit in project crunch by cloudera.
the class AvroRecordReader method initialize.
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration conf = context.getConfiguration();
    SeekableInput in = new FsInput(split.getPath(), conf);
    DatumReader<T> datumReader = null;
    if (context.getConfiguration().getBoolean(AvroJob.INPUT_IS_REFLECT, true)) {
        ReflectDataFactory factory = Avros.getReflectDataFactory(conf);
        datumReader = factory.getReader(schema);
    } else {
        datumReader = new SpecificDatumReader<T>(schema);
    }
    this.reader = DataFileReader.openReader(in, datumReader);
    // sync to start
    reader.sync(split.getStart());
    this.start = reader.tell();
    this.end = split.getStart() + split.getLength();
}
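As with the RCFile reader, the split boundary here is enforced when records are pulled rather than in initialize(). A minimal sketch of the matching nextKeyValue(), assuming a reusable current field for the record (the field name is an assumption) and the standard DataFileReader hasNext()/pastSync() calls:

@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    // Stop when the file is exhausted or the reader has moved past this split's end sync point
    if (!reader.hasNext() || reader.pastSync(end))
        return false;
    // Reuse the previous record instance where the datum reader allows it
    current = reader.next(current);
    return true;
}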