Example 1 with RecordReader

Use of org.apache.orc.RecordReader in project flink by apache.

From the class OrcNoHiveShim, method createRecordReader.

@Override
public RecordReader createRecordReader(Configuration conf, TypeDescription schema, int[] selectedFields, List<OrcFilters.Predicate> conjunctPredicates, org.apache.flink.core.fs.Path path, long splitStart, long splitLength) throws IOException {
    // open ORC file and create reader
    org.apache.hadoop.fs.Path hPath = new org.apache.hadoop.fs.Path(path.toUri());
    Reader orcReader = OrcFile.createReader(hPath, OrcFile.readerOptions(conf));
    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(splitStart, splitLength, orcReader.getStripes());
    // create ORC row reader configuration
    Reader.Options options = new Reader.Options()
            .schema(schema)
            .range(offsetAndLength.f0, offsetAndLength.f1)
            .useZeroCopy(OrcConf.USE_ZEROCOPY.getBoolean(conf))
            .skipCorruptRecords(OrcConf.SKIP_CORRUPT_DATA.getBoolean(conf))
            .tolerateMissingSchema(OrcConf.TOLERATE_MISSING_SCHEMA.getBoolean(conf));
    // TODO configure filters
    // configure selected fields
    options.include(computeProjectionMask(schema, selectedFields));
    // create ORC row reader
    RecordReader orcRowsReader = orcReader.rows(options);
    // calling getId() forces lazy assignment of column ids on the schema
    schema.getId();
    return orcRowsReader;
}
Also used : RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader)
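
For orientation, here is a minimal sketch of how a RecordReader obtained this way is typically driven. The file path is a placeholder, the imports match those listed above plus org.apache.hadoop.conf.Configuration, and the loop uses the plain ORC batch-reading pattern rather than Flink's shim:

// A sketch, not Flink code: open an ORC file and drain it batch by batch.
// The path "/tmp/data.orc" is a placeholder.
org.apache.hadoop.fs.Path file = new org.apache.hadoop.fs.Path("/tmp/data.orc");
Reader reader = OrcFile.createReader(file, OrcFile.readerOptions(new Configuration()));
VectorizedRowBatch batch = reader.getSchema().createRowBatch();
try (RecordReader rows = reader.rows(new Reader.Options().schema(reader.getSchema()))) {
    // nextBatch refills the batch and returns false once the file is exhausted
    while (rows.nextBatch(batch)) {
        System.out.println("read " + batch.size + " rows");
    }
}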

Example 2 with RecordReader

Use of org.apache.orc.RecordReader in project flink by apache.

From the class OrcShimV200, method nextBatch.

@Override
public boolean nextBatch(RecordReader reader, VectorizedRowBatch rowBatch) throws IOException {
    try {
        if (hasNextMethod == null) {
            // resolve Hive's RecordReader#hasNext reflectively; it is not part of the org.apache.orc API
            hasNextMethod = Class.forName("org.apache.hadoop.hive.ql.io.orc.RecordReader").getMethod("hasNext");
            hasNextMethod.setAccessible(true);
        }
        if (nextBatchMethod == null) {
            // resolve nextBatch reflectively as well; its return value is ignored below, hasNext drives the loop
            nextBatchMethod = RecordReader.class.getMethod("nextBatch", VectorizedRowBatch.class);
            nextBatchMethod.setAccessible(true);
        }
        boolean hasNext = (boolean) hasNextMethod.invoke(reader);
        if (hasNext) {
            nextBatchMethod.invoke(reader, rowBatch);
            return true;
        } else {
            return false;
        }
    } catch (IllegalAccessException | InvocationTargetException | NoSuchMethodException | ClassNotFoundException e) {
        throw new IOException(e);
    }
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) RecordReader(org.apache.orc.RecordReader) IOException(java.io.IOException) InvocationTargetException(java.lang.reflect.InvocationTargetException)
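
The reflection here exists only because this shim targets older Hive/ORC releases. Against a current org.apache.orc dependency the whole method reduces to a direct call, roughly as below (a sketch, assuming an ORC version whose RecordReader.nextBatch reports whether rows were read):

@Override
public boolean nextBatch(RecordReader reader, VectorizedRowBatch rowBatch) throws IOException {
    // modern org.apache.orc.RecordReader.nextBatch returns false at end of file
    return reader.nextBatch(rowBatch);
}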

Example 3 with RecordReader

Use of org.apache.orc.RecordReader in project flink by apache.

From the class AbstractOrcFileInputFormat, method createReader.

// ------------------------------------------------------------------------
@Override
public OrcVectorizedReader<T, BatchT> createReader(final Configuration config, final SplitT split) throws IOException {
    final int numBatchesToCirculate = config.getInteger(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY);
    final Pool<OrcReaderBatch<T, BatchT>> poolOfBatches = createPoolOfBatches(split, numBatchesToCirculate);
    final RecordReader orcReader = shim.createRecordReader(hadoopConfigWrapper.getHadoopConfig(), schema, selectedFields, conjunctPredicates, split.path(), split.offset(), split.length());
    return new OrcVectorizedReader<>(shim, orcReader, poolOfBatches);
}
Also used : RecordReader(org.apache.orc.RecordReader)
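
Note that the pool size is read from the source-reader queue capacity, so the number of batches in flight can be tuned through ordinary Flink configuration. A sketch of the caller's side, with an illustrative value of 4 and assuming Flink's Configuration#setInteger on an org.apache.flink.configuration.Configuration:

// Illustrative only: raise the element queue capacity so createReader
// circulates four reusable OrcReaderBatch objects.
Configuration config = new Configuration();
config.setInteger(SourceReaderOptions.ELEMENT_QUEUE_CAPACITY, 4);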

Example 4 with RecordReader

Use of org.apache.orc.RecordReader in project flink by apache.

From the class OrcShimV200, method createRecordReader.

@Override
public RecordReader createRecordReader(Configuration conf, TypeDescription schema, int[] selectedFields, List<Predicate> conjunctPredicates, org.apache.flink.core.fs.Path path, long splitStart, long splitLength) throws IOException {
    // open ORC file and create reader
    Path hPath = new Path(path.toUri());
    Reader orcReader = createReader(hPath, conf);
    // get offset and length for the stripes that start in the split
    Tuple2<Long, Long> offsetAndLength = getOffsetAndLengthForSplit(splitStart, splitLength, orcReader.getStripes());
    // create ORC row reader configuration
    Reader.Options options = readOrcConf(new Reader.Options().schema(schema).range(offsetAndLength.f0, offsetAndLength.f1), conf);
    // configure filters
    if (!conjunctPredicates.isEmpty()) {
        SearchArgument.Builder b = SearchArgumentFactory.newBuilder();
        b = b.startAnd();
        for (Predicate predicate : conjunctPredicates) {
            predicate.add(b);
        }
        b = b.end();
        options.searchArgument(b.build(), new String[] {});
    }
    // configure selected fields
    options.include(computeProjectionMask(schema, selectedFields));
    // create ORC row reader
    RecordReader orcRowsReader = createRecordReader(orcReader, options);
    // calling getId() forces lazy assignment of column ids on the schema
    schema.getId();
    return orcRowsReader;
}
Also used : Path(org.apache.hadoop.fs.Path) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) Predicate(org.apache.flink.orc.OrcFilters.Predicate)
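
For comparison, the loop over conjunctPredicates above amounts to building a Hive SearchArgument by hand. A sketch against the org.apache.hadoop.hive.ql.io.sarg API directly, where the column name "age" and the bound are hypothetical:

// Hypothetical pushdown filter: age < 30, ANDed like the loop above.
SearchArgument sarg = SearchArgumentFactory.newBuilder()
        .startAnd()
        .lessThan("age", PredicateLeaf.Type.LONG, 30L)
        .end()
        .build();
options.searchArgument(sarg, new String[] { "age" });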

Example 5 with RecordReader

Use of org.apache.orc.RecordReader in project flink by apache.

From the class OrcBulkRowDataWriterTest, method getResults.

private static List<RowData> getResults(Reader reader) throws IOException {
    List<RowData> results = new ArrayList<>();
    RecordReader recordReader = reader.rows();
    VectorizedRowBatch batch = reader.getSchema().createRowBatch();
    while (recordReader.nextBatch(batch)) {
        BytesColumnVector stringVector = (BytesColumnVector) batch.cols[0];
        LongColumnVector intVector = (LongColumnVector) batch.cols[1];
        ListColumnVector listVector = (ListColumnVector) batch.cols[2];
        MapColumnVector mapVector = (MapColumnVector) batch.cols[3];
        for (int r = 0; r < batch.size; r++) {
            GenericRowData readRowData = new GenericRowData(4);
            readRowData.setField(0, readStringData(stringVector, r));
            readRowData.setField(1, readInt(intVector, r));
            readRowData.setField(2, readList(listVector, r));
            readRowData.setField(3, readMap(mapVector, r));
            results.add(readRowData);
        }
    }
    recordReader.close();
    return results;
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) GenericRowData(org.apache.flink.table.data.GenericRowData) RowData(org.apache.flink.table.data.RowData) ListColumnVector(org.apache.hadoop.hive.ql.exec.vector.ListColumnVector) MapColumnVector(org.apache.hadoop.hive.ql.exec.vector.MapColumnVector) RecordReader(org.apache.orc.RecordReader) ArrayList(java.util.ArrayList) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
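
The readStringData and readInt helpers referenced above are not shown. Plausible implementations for the two scalar columns might look like the following (hypothetical, assuming java.util.Arrays and org.apache.flink.table.data.StringData are imported; the isRepeating check handles run-length-encoded batches):

// Hypothetical helpers matching the calls in getResults.
private static StringData readStringData(BytesColumnVector vector, int row) {
    int r = vector.isRepeating ? 0 : row;
    byte[] bytes = Arrays.copyOfRange(vector.vector[r], vector.start[r], vector.start[r] + vector.length[r]);
    return StringData.fromBytes(bytes);
}

private static int readInt(LongColumnVector vector, int row) {
    return (int) vector.vector[vector.isRepeating ? 0 : row];
}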

Aggregations

RecordReader (org.apache.orc.RecordReader) 8
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) 4
Reader (org.apache.orc.Reader) 4
Path (org.apache.hadoop.fs.Path) 3
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) 3
IOException (java.io.IOException) 2
ArrayList (java.util.ArrayList) 2
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) 2
TypeDescription (org.apache.orc.TypeDescription) 2
InvocationTargetException (java.lang.reflect.InvocationTargetException) 1
NoSuchElementException (java.util.NoSuchElementException) 1
CleanableFile (org.apache.druid.data.input.InputEntity.CleanableFile) 1
IntermediateRowParsingReader (org.apache.druid.data.input.IntermediateRowParsingReader) 1
Closer (org.apache.druid.java.util.common.io.Closer) 1
CloseableIterator (org.apache.druid.java.util.common.parsers.CloseableIterator) 1
Predicate (org.apache.flink.orc.OrcFilters.Predicate) 1
Record (org.apache.flink.orc.data.Record) 1
GenericRowData (org.apache.flink.table.data.GenericRowData) 1
RowData (org.apache.flink.table.data.RowData) 1
ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector) 1