use of org.apache.hadoop.mapred.RecordReader in project hive by apache.
the class HiveInputFormat method getRecordReader.
public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    HiveInputSplit hsplit = (HiveInputSplit) split;
    InputSplit inputSplit = hsplit.getInputSplit();
    String inputFormatClassName = null;
    Class inputFormatClass = null;
    try {
        inputFormatClassName = hsplit.inputFormatClassName();
        inputFormatClass = job.getClassByName(inputFormatClassName);
    } catch (Exception e) {
        throw new IOException("cannot find class " + inputFormatClassName, e);
    }
    if (this.mrwork == null || pathToPartitionInfo == null) {
        init(job);
    }
    boolean nonNative = false;
    PartitionDesc part = HiveFileFormatUtils.getPartitionDescFromPathRecursively(pathToPartitionInfo, hsplit.getPath(), null);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Found spec for " + hsplit.getPath() + " " + part + " from " + pathToPartitionInfo);
    }
    if ((part != null) && (part.getTableDesc() != null)) {
        Utilities.copyTableJobPropertiesToConf(part.getTableDesc(), job);
        nonNative = part.getTableDesc().isNonNative();
    }
    Path splitPath = hsplit.getPath();
    pushProjectionsAndFilters(job, inputFormatClass, splitPath, nonNative);
    InputFormat inputFormat = getInputFormatFromCache(inputFormatClass, job);
    try {
        inputFormat = HiveInputFormat.wrapForLlap(inputFormat, job, part);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    RecordReader innerReader = null;
    try {
        innerReader = inputFormat.getRecordReader(inputSplit, job, reporter);
    } catch (Exception e) {
        innerReader = HiveIOExceptionHandlerUtil.handleRecordReaderCreationException(e, job);
    }
    HiveRecordReader<K, V> rr = new HiveRecordReader(innerReader, job);
    rr.initIOContext(hsplit, job, inputFormatClass, innerReader);
    return rr;
}
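For context, a minimal sketch (not from the Hive source) of how a caller typically drives a reader obtained from an org.apache.hadoop.mapred.InputFormat such as the one above; the countRecords helper and its class name are illustrative assumptions.

import java.io.IOException;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

public class RecordReaderDriver {
    // Reads every record in one split and returns the record count.
    static <K, V> long countRecords(InputFormat<K, V> format, InputSplit split, JobConf job) throws IOException {
        RecordReader<K, V> reader = format.getRecordReader(split, job, Reporter.NULL);
        K key = reader.createKey();
        V value = reader.createValue();
        long count = 0;
        try {
            // next() refills the reusable key/value containers until the split is exhausted.
            while (reader.next(key, value)) {
                count++;
            }
        } finally {
            reader.close();
        }
        return count;
    }
}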
use of org.apache.hadoop.mapred.RecordReader in project trevni by cutting.
the class AvroTrevniInputFormat method getRecordReader.
@Override
public RecordReader<AvroWrapper<T>, NullWritable> getRecordReader(InputSplit split, final JobConf job, Reporter reporter) throws IOException {
    final FileSplit file = (FileSplit) split;
    reporter.setStatus(file.toString());
    final AvroColumnReader.Params params = new AvroColumnReader.Params(new HadoopInput(file.getPath(), job));
    params.setModel(ReflectData.get());
    if (job.get(AvroJob.INPUT_SCHEMA) != null)
        params.setSchema(AvroJob.getInputSchema(job));
    return new RecordReader<AvroWrapper<T>, NullWritable>() {
        private AvroColumnReader<T> reader = new AvroColumnReader<T>(params);
        private float rows = reader.getRowCount();
        private long row;

        public AvroWrapper<T> createKey() {
            return new AvroWrapper<T>(null);
        }

        public NullWritable createValue() {
            return NullWritable.get();
        }

        public boolean next(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            if (!reader.hasNext())
                return false;
            wrapper.datum(reader.next());
            row++;
            return true;
        }

        public float getProgress() throws IOException {
            return row / rows;
        }

        public long getPos() throws IOException {
            return row;
        }

        public void close() throws IOException {
            reader.close();
        }
    };
}
use of org.apache.hadoop.mapred.RecordReader in project asterixdb by apache.
the class HDFSReadOperatorDescriptor method createPushRuntime.
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx, IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions) throws HyracksDataException {
    final InputSplit[] inputSplits = splitsFactory.getSplits();
    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getServiceContext().getNodeId();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                JobConf conf = confFactory.getConf();
                conf.setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                try {
                    parser.open(writer);
                    InputFormat inputFormat = conf.getInputFormat();
                    for (int i = 0; i < inputSplits.length; i++) {
                        /**
                         * read all the partitions scheduled to the current node
                         */
                        if (scheduledLocations[i].equals(nodeName)) {
                            /**
                             * pick an unread split to read
                             * synchronize among simultaneous partitions in the same machine
                             */
                            synchronized (executed) {
                                if (executed[i] == false) {
                                    executed[i] = true;
                                } else {
                                    continue;
                                }
                            }
                            /**
                             * read the split
                             */
                            RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
                            Object key = reader.createKey();
                            Object value = reader.createValue();
                            while (reader.next(key, value) == true) {
                                parser.parse(key, value, writer, inputSplits[i].toString());
                            }
                        }
                    }
                } finally {
                    parser.close(writer);
                }
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
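The synchronized block above is the key detail: every partition on a node walks the full split list, and the shared executed array guarantees each split is claimed by exactly one of them. A minimal stand-alone sketch of that claim-once pattern (the class and helper names are illustrative, not from the AsterixDB source):

public class SplitClaimer {
    // Returns true for exactly one caller per index, even when several threads race on the same split.
    static boolean claimSplit(boolean[] executed, int i) {
        synchronized (executed) {
            if (executed[i]) {
                return false;   // another partition on this node already read split i
            }
            executed[i] = true; // mark the split taken before releasing the lock
            return true;
        }
    }
}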
use of org.apache.hadoop.mapred.RecordReader in project hbase by apache.
the class TableInputFormatBase method getRecordReader.
/**
* Builds a TableRecordReader. If no TableRecordReader was provided, uses
* the default.
*
* @see org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit,
* JobConf, Reporter)
*/
public RecordReader<ImmutableBytesWritable, Result> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException {
    // In case a subclass uses the deprecated approach or calls initializeTable directly
    if (table == null) {
        initialize(job);
    }
    // null check in case our child overrides getTable to not throw.
    try {
        if (getTable() == null) {
            // initialize() must not have been implemented in the subclass.
            throw new IOException(INITIALIZATION_ERROR);
        }
    } catch (IllegalStateException exception) {
        throw new IOException(INITIALIZATION_ERROR, exception);
    }
    TableSplit tSplit = (TableSplit) split;
    // if no table record reader was provided use default
    final TableRecordReader trr = this.tableRecordReader == null ? new TableRecordReader() : this.tableRecordReader;
    trr.setStartRow(tSplit.getStartRow());
    trr.setEndRow(tSplit.getEndRow());
    trr.setHTable(this.table);
    trr.setInputColumns(this.inputColumns);
    trr.setRowFilter(this.rowFilter);
    trr.init();
    return new RecordReader<ImmutableBytesWritable, Result>() {
        @Override
        public void close() throws IOException {
            trr.close();
            closeTable();
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return trr.createKey();
        }

        @Override
        public Result createValue() {
            return trr.createValue();
        }

        @Override
        public long getPos() throws IOException {
            return trr.getPos();
        }

        @Override
        public float getProgress() throws IOException {
            return trr.getProgress();
        }

        @Override
        public boolean next(ImmutableBytesWritable key, Result value) throws IOException {
            return trr.next(key, value);
        }
    };
}
use of org.apache.hadoop.mapred.RecordReader in project hive by apache.
the class TestRCFile method writeThenReadByRecordReader.
private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long minSplitSize, CompressionCodec codec) throws IOException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = null;
        cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        if (i == intervalRecordCount) {
            System.out.println("write position:" + writer.getLength());
        }
        writer.append(bytes);
    }
    writer.close();
    RCFileInputFormat inputFormat = new RCFileInputFormat();
    JobConf jonconf = new JobConf(cloneConf);
    jonconf.set("mapred.input.dir", testDir.toString());
    HiveConf.setLongVar(jonconf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, minSplitSize);
    InputSplit[] splits = inputFormat.getSplits(jonconf, splitNumber);
    assertEquals("splits length should be " + splitNumber, splits.length, splitNumber);
    int readCount = 0;
    for (int i = 0; i < splits.length; i++) {
        int previousReadCount = readCount;
        RecordReader rr = inputFormat.getRecordReader(splits[i], jonconf, Reporter.NULL);
        Object key = rr.createKey();
        Object value = rr.createValue();
        while (rr.next(key, value)) {
            readCount++;
        }
        rr.close();
        System.out.println("The " + i + "th split read " + (readCount - previousReadCount));
    }
    assertEquals("readCount should be equal to writeCount", writeCount, readCount);
}
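A hypothetical invocation of the helper above; the actual test methods pick parameter values that satisfy the split-count assertion, so these numbers are only illustrative.

// Illustrative only: 500-row record interval, 1000 rows written, expecting 3 splits with a small
// minimum split size and no compression. The codec argument may also be an instance of
// org.apache.hadoop.io.compress.DefaultCodec.
writeThenReadByRecordReader(500, 1000, 3, 1000, null);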