Use of org.apache.hadoop.mapred.RecordReader in project hive by apache.
The class VectorizedOrcInputFormat, method getRecordReader.
@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(
    InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException {
  FileSplit fSplit = (FileSplit) inputSplit;
  reporter.setStatus(fSplit.toString());
  Path path = fSplit.getPath();
  OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
  if (fSplit instanceof OrcSplit) {
    OrcSplit orcSplit = (OrcSplit) fSplit;
    if (orcSplit.hasFooter()) {
      opts.orcTail(orcSplit.getOrcTail());
    }
    opts.maxLength(orcSplit.getFileLength());
  }
  Reader reader = OrcFile.createReader(path, opts);
  return new VectorizedOrcRecordReader(reader, conf, fSplit);
}
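For context, a minimal sketch of how a caller might drain the vectorized reader returned above. This is not part of the Hive code: the VectorizedOrcRowCounter class and countRows method are illustrative names, and the split/conf are assumed to come from the surrounding job setup.
import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.orc.VectorizedOrcInputFormat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class VectorizedOrcRowCounter {
  // Counts rows by iterating the vectorized reader until it is exhausted.
  static long countRows(VectorizedOrcInputFormat inputFormat, InputSplit split, JobConf conf) throws IOException {
    RecordReader<NullWritable, VectorizedRowBatch> rows = inputFormat.getRecordReader(split, conf, Reporter.NULL);
    NullWritable key = rows.createKey();
    VectorizedRowBatch batch = rows.createValue();
    long total = 0;
    try {
      while (rows.next(key, batch)) {
        total += batch.size; // batch.size holds the number of rows filled into this batch
      }
    } finally {
      rows.close();
    }
    return total;
  }
}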
Use of org.apache.hadoop.mapred.RecordReader in project hive by apache.
The class SerDeEncodedDataReader, method startReadSplitFromFile.
public void startReadSplitFromFile(FileSplit split, boolean[] splitIncludes, StripeData slice) throws IOException {
  boolean maySplitTheSplit = slice == null;
  ReaderWithOffsets offsetReader = null;
  @SuppressWarnings("rawtypes")
  RecordReader sourceReader = sourceInputFormat.getRecordReader(split, jobConf, reporter);
  try {
    offsetReader = createOffsetReader(sourceReader);
    sourceReader = null;
  } finally {
    if (sourceReader != null) {
      try {
        sourceReader.close();
      } catch (Exception ex) {
        LlapIoImpl.LOG.error("Failed to close source reader", ex);
      }
    }
  }
  maySplitTheSplit = maySplitTheSplit && offsetReader.hasOffsets();
  try {
    StructObjectInspector originalOi = (StructObjectInspector) getOiFromSerDe();
    List<Integer> splitColumnIds = OrcInputFormat.genIncludedColumnsReverse(schema, splitIncludes, false);
    // The file read writes to the writer, which writes to orcWriter, which writes to cacheWriter.
    EncodingWriter writer = VectorDeserializeOrcWriter.create(sourceInputFormat, sourceSerDe, parts,
        daemonConf, jobConf, split.getPath(), originalOi, splitColumnIds, splitIncludes, allocSize);
    // TODO: move this into ctor? EW would need to create CacheWriter then
    List<Integer> cwColIds = writer.isOnlyWritingIncludedColumns() ? splitColumnIds : columnIds;
    writer.init(new CacheWriter(bufferManager, cwColIds, splitIncludes,
        writer.isOnlyWritingIncludedColumns()), daemonConf, split.getPath());
    if (writer instanceof VectorDeserializeOrcWriter) {
      VectorDeserializeOrcWriter asyncWriter = (VectorDeserializeOrcWriter) writer;
      asyncWriter.startAsync(new AsyncCacheDataCallback());
      this.asyncWriters.add(asyncWriter);
    }
    currentFileRead = new FileReaderYieldReturn(offsetReader, split, writer, maySplitTheSplit, targetSliceRowCount);
  } finally {
    // The assignment is the last thing in the try, so if it happened we assume success.
    if (currentFileRead != null) return;
    if (offsetReader == null) return;
    try {
      offsetReader.close();
    } catch (Exception ex) {
      LlapIoImpl.LOG.error("Failed to close source reader", ex);
    }
  }
}
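The first try/finally above uses a hand-off idiom: the local reference is nulled once the wrapper takes ownership, and only closed directly if the hand-off never happened. A self-contained sketch of the same idiom with plain java.io types follows; the HandOffExample class and openBuffered method are purely illustrative.
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

public class HandOffExample {
  // Opens a raw stream and hands ownership to a wrapper; closes the raw stream only if wrapping fails.
  static InputStream openBuffered(String path) throws IOException {
    InputStream raw = new FileInputStream(path);
    try {
      InputStream wrapped = new BufferedInputStream(raw); // the wrapper now owns raw
      raw = null;                                         // signal a successful hand-off
      return wrapped;
    } finally {
      if (raw != null) {
        try {
          raw.close(); // hand-off failed, so clean up here
        } catch (IOException ex) {
          // log and swallow, mirroring the LLAP code above
        }
      }
    }
  }
}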
Use of org.apache.hadoop.mapred.RecordReader in project hadoop by apache.
The class TestPipeApplication, method testApplication.
/**
 * Test org.apache.hadoop.mapred.pipes.Application.
 * Exercises internal message handling: MessageType.REGISTER_COUNTER, INCREMENT_COUNTER, STATUS, PROGRESS, ...
 *
 * @throws Throwable
 */
@Test
public void testApplication() throws Throwable {
  JobConf conf = new JobConf();
  RecordReader<FloatWritable, NullWritable> rReader = new Reader();
  // client for test
  File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationStub");
  TestTaskReporter reporter = new TestTaskReporter();
  File[] psw = cleanTokenPasswordFile();
  try {
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
    conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
    // token for authorization
    Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>(
        "user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service"));
    TokenCache.setJobToken(token, conf.getCredentials());
    FakeCollector output = new FakeCollector(new Counters.Counter(), new Progress());
    FileSystem fs = new RawLocalFileSystem();
    fs.initialize(FsConstants.LOCAL_FS_URI, conf);
    Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf,
        fs.create(new Path(workSpace.getAbsolutePath() + File.separator + "outfile")),
        IntWritable.class, Text.class, null, null, true);
    output.setWriter(wr);
    conf.set(Submitter.PRESERVE_COMMANDFILE, "true");
    initStdOut(conf);
    Application<WritableComparable<IntWritable>, Writable, IntWritable, Text> application =
        new Application<WritableComparable<IntWritable>, Writable, IntWritable, Text>(
            conf, rReader, output, reporter, IntWritable.class, Text.class);
    application.getDownlink().flush();
    application.getDownlink().mapItem(new IntWritable(3), new Text("txt"));
    application.getDownlink().flush();
    application.waitForFinish();
    wr.close();
    // test getDownlink().mapItem();
    String stdOut = readStdOut(conf);
    assertTrue(stdOut.contains("key:3"));
    assertTrue(stdOut.contains("value:txt"));
    // the reporter's counter and status should have been sent
    // test MessageType.REGISTER_COUNTER and INCREMENT_COUNTER
    assertEquals(1.0, reporter.getProgress(), 0.01);
    assertNotNull(reporter.getCounter("group", "name"));
    // test status MessageType.STATUS
    assertEquals(reporter.getStatus(), "PROGRESS");
    stdOut = readFile(new File(workSpace.getAbsolutePath() + File.separator + "outfile"));
    // check MessageType.PROGRESS
    assertEquals(0.55f, rReader.getProgress(), 0.001);
    application.getDownlink().close();
    // test MessageType.OUTPUT
    Entry<IntWritable, Text> entry = output.getCollect().entrySet().iterator().next();
    assertEquals(123, entry.getKey().get());
    assertEquals("value", entry.getValue().toString());
    try {
      // try to abort
      application.abort(new Throwable());
      fail();
    } catch (IOException e) {
      // abort works?
      assertEquals("pipe child exception", e.getMessage());
    }
  } finally {
    if (psw != null) {
      // remove password files
      for (File file : psw) {
        file.deleteOnExit();
      }
    }
  }
}
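The test feeds the Application a stub reader (the test-local Reader class created above). For readers unfamiliar with the old mapred interface being stubbed, a minimal illustrative implementation of RecordReader<FloatWritable, NullWritable> is sketched below; it is not the test's own Reader class, and the single constant value is an arbitrary choice.
import java.io.IOException;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

// Minimal stand-in for a mapred RecordReader: emits one float key, then reports completion.
public class SingleFloatRecordReader implements RecordReader<FloatWritable, NullWritable> {
  private boolean emitted = false;

  @Override
  public boolean next(FloatWritable key, NullWritable value) throws IOException {
    if (emitted) {
      return false;
    }
    key.set(1.0f); // arbitrary payload, for illustration only
    emitted = true;
    return true;
  }

  @Override
  public FloatWritable createKey() {
    return new FloatWritable();
  }

  @Override
  public NullWritable createValue() {
    return NullWritable.get();
  }

  @Override
  public long getPos() throws IOException {
    return emitted ? 1 : 0;
  }

  @Override
  public float getProgress() throws IOException {
    return emitted ? 1.0f : 0.0f;
  }

  @Override
  public void close() throws IOException {
    // nothing to release
  }
}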
Use of org.apache.hadoop.mapred.RecordReader in project hadoop by apache.
The class AutoInputFormat, method getRecordReader.
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) split;
  FileSystem fs = FileSystem.get(fileSplit.getPath().toUri(), job);
  FSDataInputStream is = fs.open(fileSplit.getPath());
  byte[] header = new byte[3];
  RecordReader reader = null;
  try {
    // Peek at the first three bytes to choose between SequenceFile and text input.
    is.readFully(header);
    if (header[0] == 'S' && header[1] == 'E' && header[2] == 'Q') {
      reader = seqFileInputFormat.getRecordReader(split, job, reporter);
    } else {
      reader = textInputFormat.getRecordReader(split, job, reporter);
    }
  } catch (EOFException eof) {
    // The file is shorter than the magic header; fall back to the text reader.
    reader = textInputFormat.getRecordReader(split, job, reporter);
  } finally {
    is.close();
  }
  return reader;
}
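The three-byte sniff above is easy to reproduce outside Hadoop. Below is a self-contained sketch of the same check against a local file; the SeqSniffer class and looksLikeSequenceFile method are hypothetical names, not part of AutoInputFormat.
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;

public class SeqSniffer {
  // Returns true if the file starts with the SequenceFile magic bytes 'S', 'E', 'Q'.
  static boolean looksLikeSequenceFile(String path) throws IOException {
    byte[] header = new byte[3];
    try (DataInputStream in = new DataInputStream(new FileInputStream(path))) {
      in.readFully(header);
    } catch (EOFException eof) {
      return false; // shorter than the magic header, so it cannot be a SequenceFile
    }
    return header[0] == 'S' && header[1] == 'E' && header[2] == 'Q';
  }
}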
Use of org.apache.hadoop.mapred.RecordReader in project presto by prestodb.
The class HiveUtil, method createRecordReader.
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path, long start,
    long length, Properties schema, List<HiveColumnHandle> columns) {
  // determine which hive columns we will read
  List<HiveColumnHandle> readColumns =
      ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
  List<Integer> readHiveColumnIndexes =
      ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
  // Tell hive which columns we would like to read; this lets hive optimize reading column-oriented files.
  setReadColumns(configuration, readHiveColumnIndexes);
  InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
  JobConf jobConf = new JobConf(configuration);
  FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
  // propagate serialization configuration to getRecordReader
  schema.stringPropertyNames().stream()
      .filter(name -> name.startsWith("serialization."))
      .forEach(name -> jobConf.set(name, schema.getProperty(name)));
  try {
    return retry()
        .stopOnIllegalExceptions()
        .run("createRecordReader", () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
  } catch (Exception e) {
    throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT,
        format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
            path, start, length, getInputFormatName(schema), e.getMessage()),
        e);
  }
}
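Because the key and value types here depend on the underlying InputFormat, callers typically work with the reader through raw types. A hedged caller-side sketch follows; the RecordDrainer class and drain method are hypothetical and not part of Presto.
import java.io.IOException;
import org.apache.hadoop.mapred.RecordReader;

public class RecordDrainer {
  // Drains an untyped reader, such as the one createRecordReader returns, and counts its records.
  @SuppressWarnings({"unchecked", "rawtypes"})
  static long drain(RecordReader<?, ?> untyped) throws IOException {
    RecordReader reader = untyped; // raw view, since the key/value types are format-dependent
    Object key = reader.createKey();
    Object value = reader.createValue();
    long records = 0;
    try {
      while (reader.next(key, value)) {
        records++; // a real caller would hand key/value to a deserializer here
      }
    } finally {
      reader.close();
    }
    return records;
  }
}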