Search in sources :

Example 1 with OrcMapredRecordReader

Use of org.apache.orc.mapred.OrcMapredRecordReader in project druid by druid-io.

From the class OrcReader, method intermediateRowIterator.

/**
 * Opens the ORC source and returns an iterator over its rows as {@link OrcStruct}s.
 *
 * <p>The source is fetched into {@code temporaryDirectory}, an ORC {@link Reader} is opened on the
 * fetched file, and the rows are exposed through an {@link OrcMapredRecordReader}. Every resource
 * opened here is registered with a single {@link Closer}, which the returned iterator's
 * {@code close()} closes. If setup fails part-way through, the resources registered so far are
 * closed before the exception propagates, so the fetched file is not left behind.
 *
 * @return an iterator over the rows of the ORC file; the caller must close it
 * @throws IOException if fetching the source or opening the ORC reader fails
 */
@Override
protected CloseableIterator<OrcStruct> intermediateRowIterator() throws IOException {
    final Closer closer = Closer.create();
    final OrcMapredRecordReader<OrcStruct> recordReader;
    try {
        // We fetch here to cache a copy locally. However, this might need to be changed if we want to split an orc file
        // into several InputSplits in the future.
        final byte[] buffer = new byte[InputEntity.DEFAULT_FETCH_BUFFER_SIZE];
        final CleanableFile file = closer.register(source.fetch(temporaryDirectory, buffer));
        final Path path = new Path(file.file().toURI());
        // Create the ORC reader with this class's class loader installed as the thread context class loader,
        // and always restore the previous one afterwards.
        // NOTE(review): presumably needed so that ORC/Hadoop resolve classes from this extension - confirm.
        final ClassLoader currentClassLoader = Thread.currentThread().getContextClassLoader();
        final Reader reader;
        try {
            Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
            reader = closer.register(OrcFile.createReader(path, OrcFile.readerOptions(conf)));
        } finally {
            Thread.currentThread().setContextClassLoader(currentClassLoader);
        }
        // The below line will get the schema to read the whole columns.
        // This can be improved by projecting some columns only what users want in the future.
        final TypeDescription schema = reader.getSchema();
        final RecordReader batchReader = reader.rows(reader.options());
        recordReader = new OrcMapredRecordReader<>(batchReader, schema);
        closer.register(recordReader::close);
    } catch (Throwable t) {
        // No iterator will be handed to the caller, so nobody else can close what was opened so far.
        // Close it here and keep the original failure as the primary exception.
        try {
            closer.close();
        } catch (Throwable closeFailure) {
            t.addSuppressed(closeFailure);
        }
        throw t;
    }
    return new CloseableIterator<OrcStruct>() {

        final NullWritable key = recordReader.createKey();

        // The row read ahead by hasNext() and not yet handed out by next(); null when there is none.
        OrcStruct value = null;

        @Override
        public boolean hasNext() {
            if (value == null) {
                try {
                    // The returned OrcStruct in next() can be kept in memory for a while.
                    // Here, we create a new instance of OrcStruct before calling RecordReader.next(),
                    // so that we can avoid to share the same reference to the "value" across rows.
                    final OrcStruct candidate = recordReader.createValue();
                    // Publish the row only after a successful read, so that a failed read cannot leave
                    // an unpopulated struct behind to be returned by a later next().
                    if (recordReader.next(key, candidate)) {
                        value = candidate;
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return value != null;
        }

        @Override
        public OrcStruct next() {
            if (value == null) {
                throw new NoSuchElementException();
            }
            final OrcStruct currentValue = value;
            value = null;
            return currentValue;
        }

        @Override
        public void close() throws IOException {
            closer.close();
        }
    };
}
Also used : Closer(org.apache.druid.java.util.common.io.Closer) Path(org.apache.hadoop.fs.Path) CloseableIterator(org.apache.druid.java.util.common.parsers.CloseableIterator) RecordReader(org.apache.orc.RecordReader) OrcMapredRecordReader(org.apache.orc.mapred.OrcMapredRecordReader) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) IntermediateRowParsingReader(org.apache.druid.data.input.IntermediateRowParsingReader) OrcMapredRecordReader(org.apache.orc.mapred.OrcMapredRecordReader) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription) OrcMapredRecordReader(org.apache.orc.mapred.OrcMapredRecordReader) CleanableFile(org.apache.druid.data.input.InputEntity.CleanableFile) NoSuchElementException(java.util.NoSuchElementException)

Aggregations

IOException (java.io.IOException)1 NoSuchElementException (java.util.NoSuchElementException)1 CleanableFile (org.apache.druid.data.input.InputEntity.CleanableFile)1 IntermediateRowParsingReader (org.apache.druid.data.input.IntermediateRowParsingReader)1 Closer (org.apache.druid.java.util.common.io.Closer)1 CloseableIterator (org.apache.druid.java.util.common.parsers.CloseableIterator)1 Path (org.apache.hadoop.fs.Path)1 NullWritable (org.apache.hadoop.io.NullWritable)1 Reader (org.apache.orc.Reader)1 RecordReader (org.apache.orc.RecordReader)1 TypeDescription (org.apache.orc.TypeDescription)1 OrcMapredRecordReader (org.apache.orc.mapred.OrcMapredRecordReader)1 OrcStruct (org.apache.orc.mapred.OrcStruct)1