Use of org.apache.orc.RecordReader in project flink by apache.
In the class OrcNoHiveShim, method createRecordReader:
@Override
public RecordReader createRecordReader(
        Configuration conf,
        TypeDescription schema,
        int[] selectedFields,
        List<OrcFilters.Predicate> conjunctPredicates,
        org.apache.flink.core.fs.Path path,
        long splitStart,
        long splitLength)
        throws IOException {
    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));
    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength =
            getOffsetAndLengthForSplit(splitStart, splitLength, orcReader.getStripes());
    // create ORC row reader configuration
    Reader.Options options =
            new Reader.Options()
                    .schema(schema)
                    .range(offsetAndLength.f0, offsetAndLength.f1)
                    .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
                    .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
                    .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));
    // TODO configure filters
    // configure selected fields
    options.include(computeProjectionMask(schema, selectedFields));
    // create ORC row reader
    RecordReader orcRowsReader = orcReader.rows(options);
    // force the schema to assign column ids before the reader is used
    schema.getId();
    return orcRowsReader;
}
}
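A minimal sketch of driving the reader this method returns, assuming a shim instance named shim, a Hadoop Configuration named conf, and an illustrative file path (none of these appear on this page); the batch loop uses only the core org.apache.orc.RecordReader API:

TypeDescription schema = TypeDescription.fromString("struct<x:int,y:string>");
RecordReader rows =
        shim.createRecordReader(
                conf,
                schema,
                new int[] {0, 1},                  // project both columns
                java.util.Collections.emptyList(), // no pushed-down filters
                new org.apache.flink.core.fs.Path("file:///tmp/data.orc"),
                0,                                 // split start
                Long.MAX_VALUE);                   // cover the whole file
VectorizedRowBatch batch = schema.createRowBatch();
try {
    while (rows.nextBatch(batch)) {
        // batch.size rows are now populated in batch.cols
    }
} finally {
    rows.close();
}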
Use of org.apache.orc.RecordReader in project flink by apache.
In the class OrcShimV200, method nextBatch:
@Override
public boolean nextBatch(RecordReader reader, VectorizedRowBatch rowBatch) throws IOException {
    try {
        // look the methods up once and cache them; the Hive 2.0.x RecordReader is
        // resolved reflectively so no hard compile-time dependency on it is needed
        if (hasNextMethod == null) {
            hasNextMethod =
                    Class.forName("org.apache.hadoop.hive.ql.io.orc.RecordReader")
                            .getMethod("hasNext");
            hasNextMethod.setAccessible(true);
        }
        if (nextBatchMethod == null) {
            nextBatchMethod = RecordReader.class.getMethod("nextBatch", VectorizedRowBatch.class);
            nextBatchMethod.setAccessible(true);
        }
        boolean hasNext = (boolean) hasNextMethod.invoke(reader);
        if (hasNext) {
            nextBatchMethod.invoke(reader, rowBatch);
            return true;
        } else {
            return false;
        }
    } catch (IllegalAccessException
            | InvocationTargetException
            | NoSuchMethodException
            | ClassNotFoundException e) {
        throw new IOException(e);
    }
}
}
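The pattern above, lazy lookup of a Method cached in a field with every reflective failure folded into IOException, can be isolated as below. This sketch is illustrative only; ReflectiveCall is not a Flink class:

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;

public class ReflectiveCall {
    private Method cached;

    // Resolve the no-arg method once, cache it, and invoke it on the target;
    // any reflective failure surfaces as a plain IOException, as in the shim.
    public Object call(Object target, String className, String methodName) throws IOException {
        try {
            if (cached == null) {
                cached = Class.forName(className).getMethod(methodName);
                cached.setAccessible(true);
            }
            return cached.invoke(target);
        } catch (ClassNotFoundException
                | NoSuchMethodException
                | IllegalAccessException
                | InvocationTargetException e) {
            throw new IOException(e);
        }
    }
}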
Use of org.apache.orc.RecordReader in project flink by apache.
In the class AbstractOrcFileInputFormat, method createReader:
// ------------------------------------------------------------------------
@Override
public OrcVectorizedReader<T, BatchT> createReader(final Configuration config, final SplitT split)
        throws IOException {
    // size the batch pool to the source reader's element queue so that batches
    // can circulate between the split reader and the downstream consumer
    final int numBatchesToCirculate = config.getInteger(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY);
    final Pool<OrcReaderBatch<T, BatchT>> poolOfBatches =
            createPoolOfBatches(split, numBatchesToCirculate);
    final RecordReader orcReader =
            shim.createRecordReader(
                    hadoopConfigWrapper.getHadoopConfig(),
                    schema,
                    selectedFields,
                    conjunctPredicates,
                    split.path(),
                    split.offset(),
                    split.length());
    return new OrcVectorizedReader<>(shim, orcReader, poolOfBatches);
}
}
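The pool returned by createPoolOfBatches is not shown on this page. A minimal sketch of the recycling idea it implies, with invented names, is a fixed-size blocking pool: pre-allocate numBatchesToCirculate batches, block the reader while all are in flight, and reuse each batch once the consumer hands it back:

import java.util.concurrent.ArrayBlockingQueue;
import java.util.function.Supplier;

public class BatchPool<B> {
    private final ArrayBlockingQueue<B> pool;

    public BatchPool(int capacity, Supplier<B> factory) {
        this.pool = new ArrayBlockingQueue<>(capacity);
        for (int i = 0; i < capacity; i++) {
            pool.add(factory.get()); // pre-allocate all batches up front
        }
    }

    public B pollBatch() throws InterruptedException {
        return pool.take(); // blocks until a batch has been recycled
    }

    public void recycle(B batch) {
        pool.add(batch); // never exceeds capacity: only taken batches come back
    }
}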
Use of org.apache.orc.RecordReader in project flink by apache.
In the class OrcShimV200, method createRecordReader:
@Override
public RecordReader createRecordReader(
        Configuration conf,
        TypeDescription schema,
        int[] selectedFields,
        List<Predicate> conjunctPredicates,
        org.apache.flink.core.fs.Path path,
        long splitStart,
        long splitLength)
        throws IOException {
    // open ORC file and create reader
    Path hPath = new Path(path.toUri());
    Reader orcReader = createReader(hPath, conf);
    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength =
            getOffsetAndLengthForSplit(splitStart, splitLength, orcReader.getStripes());
    // create ORC row reader configuration
    Reader.Options options =
            readOrcConf(
                    new Reader.Options().schema(schema).range(offsetAndLength.f0, offsetAndLength.f1),
                    conf);
    // configure filters: AND all conjunctive predicates into one search argument
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[] {});
    }
    // configure selected fields
    options.include(computeProjectionMask(schema, selectedFields));
    // create ORC row reader
    RecordReader orcRowsReader = createRecordReader(orcReader, options);
    // force the schema to assign column ids before the reader is used
    schema.getId();
    return orcRowsReader;
}
}
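For context, this is roughly what the push-down produces when written against the Hive SearchArgument builder directly. The columns id and name and the predicate itself are invented for the example, and the builder methods that take a PredicateLeaf.Type are the newer signatures; older Hive versions omit the type parameter:

SearchArgument sarg =
        SearchArgumentFactory.newBuilder()
                .startAnd()
                .startNot()
                .isNull("id", PredicateLeaf.Type.LONG) // id IS NOT NULL
                .end()
                .equals("name", PredicateLeaf.Type.STRING, "flink") // name = 'flink'
                .end()
                .build();
Reader.Options options =
        new Reader.Options().searchArgument(sarg, new String[] {"id", "name"});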
Use of org.apache.orc.RecordReader in project flink by apache.
In the class OrcBulkRowDataWriterTest, method getResults:
private static List<RowData> getResults(Reader reader) throws IOException {
    List<RowData> results = new ArrayList<>();
    RecordReader recordReader = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    while (recordReader.nextBatch(batch)) {
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[0];
        LongColumnVector intVector = (LongColumnVector) batch.cols[1];
        ListColumnVector listVector = (ListColumnVector) batch.cols[2];
        MapColumnVector mapVector = (MapColumnVector) batch.cols[3];
        for (int r = 0; r < batch.size; r++) {
            GenericRowData readRowData = new GenericRowData(4);
            readRowData.setField(0, readStringData(stringVector, r));
            readRowData.setField(1, readInt(intVector, r));
            readRowData.setField(2, readList(listVector, r));
            readRowData.setField(3, readMap(mapVector, r));
            results.add(readRowData);
        }
    }
    // close after the loop, not inside it: closing inside the loop body would
    // invalidate the reader before the next call to nextBatch
    recordReader.close();
    return results;
}
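The readStringData, readInt, readList, and readMap helpers are not shown on this page. Plausible sketches of the two scalar ones, based on the standard Hive column-vector layout and Flink's StringData API (the isRepeating handling is an assumption):

private static StringData readStringData(BytesColumnVector vector, int row) {
    int r = vector.isRepeating ? 0 : row; // repeating vectors store one value at index 0
    return StringData.fromBytes(vector.vector[r], vector.start[r], vector.length[r]);
}

private static int readInt(LongColumnVector vector, int row) {
    int r = vector.isRepeating ? 0 : row;
    return (int) vector.vector[r]; // INT columns are materialized in a long[]
}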