Use of org.apache.orc.impl.TreeReaderFactory.TreeReader in project hive by apache.
The class OrcEncodedDataConsumer, method repositionInStreams:
private void repositionInStreams(TreeReaderFactory.TreeReader[] columnReaders,
    EncodedColumnBatch<OrcBatchKey> batch, boolean sameStripe,
    ConsumerStripeMetadata stripeMetadata) throws IOException {
  PositionProvider[] pps = createPositionProviders(
      columnReaders, batch.getBatchKey(), stripeMetadata);
  if (pps == null) {
    return;
  }
  for (int i = 0; i < columnReaders.length; i++) {
    TreeReader reader = columnReaders[i];
    // Note: we assume this never happens for SerDe reader - the batch would never have vectors.
    // That is always true now; if it ever stops being true, the call below would throw in getColumnData.
    ((SettableTreeReader) reader).setBuffers(batch, sameStripe);
    // TODO: make updateTimezone() a default method on
    //       SettableTreeReader so that we can avoid this check.
    if (reader instanceof EncodedTreeReaderFactory.TimestampStreamReader && !sameStripe) {
      ((EncodedTreeReaderFactory.TimestampStreamReader) reader)
          .updateTimezone(stripeMetadata.getWriterTimezone());
    }
    reader.seek(pps);
  }
}
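The reader.seek(pps) call at the end of the loop relies on ORC's PositionProvider abstraction, which hands out recorded stream positions one at a time as each stream seeks to the start of a row group. Below is a minimal sketch of such a provider backed by a plain long[]; the class name ArrayPositionProvider is hypothetical, while org.apache.orc.impl.PositionProvider is the real interface being implemented.

import org.apache.orc.impl.PositionProvider;

// Hypothetical provider that replays a fixed list of positions, one per
// getNext() call, in the order the row index recorded them.
final class ArrayPositionProvider implements PositionProvider {
  private final long[] positions;
  private int next = 0;

  ArrayPositionProvider(long[] positions) {
    this.positions = positions;
  }

  @Override
  public long getNext() {
    // Each consumer (compressed-block offset, uncompressed offset, RLE run
    // position, ...) pulls the next value in sequence.
    return positions[next++];
  }
}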
Use of org.apache.orc.impl.TreeReaderFactory.TreeReader in project hive by apache.
The class OrcEncodedDataConsumer, method decodeBatch:
@Override
protected void decodeBatch(OrcEncodedColumnBatch batch,
    Consumer<ColumnVectorBatch> downstreamConsumer) {
  long startTime = counters.startTimeCounter();
  int currentStripeIndex = batch.getBatchKey().stripeIx;
  boolean sameStripe = currentStripeIndex == previousStripeIndex;
  try {
    ConsumerStripeMetadata stripeMetadata = stripes.get(currentStripeIndex);
    // Get non-null row count from root column, to get max vector batches.
    int rgIdx = batch.getBatchKey().rgIx;
    long nonNullRowCount = -1;
    if (rgIdx == OrcEncodedColumnBatch.ALL_RGS) {
      nonNullRowCount = stripeMetadata.getRowCount();
    } else {
      OrcProto.RowIndexEntry rowIndex = stripeMetadata.getRowIndexEntry(0, rgIdx);
      nonNullRowCount = getRowCount(rowIndex);
    }
    int maxBatchesRG = (int) ((nonNullRowCount / VectorizedRowBatch.DEFAULT_SIZE) + 1);
    int batchSize = VectorizedRowBatch.DEFAULT_SIZE;
    TypeDescription schema = fileMetadata.getSchema();
    if (columnReaders == null || !sameStripe) {
      int[] columnMapping = new int[schema.getChildren().size()];
      TreeReaderFactory.Context context = new TreeReaderFactory.ReaderContext()
          .setSchemaEvolution(evolution)
          .writerTimeZone(stripeMetadata.getWriterTimezone())
          .skipCorrupt(skipCorrupt);
      StructTreeReader treeReader = EncodedTreeReaderFactory.createRootTreeReader(
          schema, stripeMetadata.getEncodings(), batch, codec, context, columnMapping);
      this.columnReaders = treeReader.getChildReaders();
      this.columnMapping = Arrays.copyOf(columnMapping, columnReaders.length);
      positionInStreams(columnReaders, batch.getBatchKey(), stripeMetadata);
    } else {
      repositionInStreams(this.columnReaders, batch, sameStripe, stripeMetadata);
    }
    previousStripeIndex = currentStripeIndex;
    for (int i = 0; i < maxBatchesRG; i++) {
      // For the last batch in the row group, adjust the batch size.
      if (i == maxBatchesRG - 1) {
        batchSize = (int) (nonNullRowCount % VectorizedRowBatch.DEFAULT_SIZE);
        if (batchSize == 0) {
          break;
        }
      }
      ColumnVectorBatch cvb = cvbPool.take();
      // assert cvb.cols.length == batch.getColumnIxs().length; // Must be constant per split.
      cvb.size = batchSize;
      for (int idx = 0; idx < columnReaders.length; ++idx) {
        TreeReader reader = columnReaders[idx];
        if (cvb.cols[idx] == null) {
          // ORC stores rows inside a root struct (Hive writes it this way).
          // When we populate column vectors, we skip over the root struct.
          cvb.cols[idx] = createColumn(schema.getChildren().get(columnMapping[idx]), batchSize);
        }
        cvb.cols[idx].ensureSize(batchSize, false);
        reader.nextVector(cvb.cols[idx], null, batchSize);
      }
      // We are done reading a batch; send it to the consumer for processing.
      downstreamConsumer.consumeData(cvb);
      counters.incrCounter(LlapIOCounters.ROWS_EMITTED, batchSize);
    }
    LlapIoImpl.ORC_LOGGER.debug("Done with decode");
    counters.incrTimeCounter(LlapIOCounters.DECODE_TIME_NS, startTime);
    counters.incrCounter(LlapIOCounters.NUM_VECTOR_BATCHES, maxBatchesRG);
    counters.incrCounter(LlapIOCounters.NUM_DECODED_BATCHES);
  } catch (IOException e) {
    // Caller will return the batch.
    downstreamConsumer.setError(e);
  }
}
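The loop over maxBatchesRG splits the row group's nonNullRowCount rows into chunks of VectorizedRowBatch.DEFAULT_SIZE (1024) rows, with the remainder going into the final batch; the "+ 1" overshoots by one batch when the row count is an exact multiple, which is why a remainder of zero triggers the break. A standalone sketch of that arithmetic (the class name, row count, and printout are illustrative, not part of the Hive code):

public class BatchSplitSketch {
  // Mirrors VectorizedRowBatch.DEFAULT_SIZE in Hive.
  private static final int DEFAULT_SIZE = 1024;

  public static void main(String[] args) {
    long nonNullRowCount = 2500; // example row count for one row group
    int maxBatchesRG = (int) ((nonNullRowCount / DEFAULT_SIZE) + 1);
    for (int i = 0; i < maxBatchesRG; i++) {
      int batchSize = DEFAULT_SIZE;
      if (i == maxBatchesRG - 1) {
        batchSize = (int) (nonNullRowCount % DEFAULT_SIZE);
        if (batchSize == 0) {
          // Exact multiple of the batch size: nothing left to emit.
          break;
        }
      }
      System.out.println("batch " + i + ": " + batchSize + " rows");
    }
    // Prints: batch 0: 1024 rows, batch 1: 1024 rows, batch 2: 452 rows.
  }
}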