Example 1 with ColumnVectorBatch

use of org.apache.hadoop.hive.llap.io.api.impl.ColumnVectorBatch in project hive by apache.

In the class OrcEncodedDataConsumer, the method decodeBatch, which decodes an OrcEncodedColumnBatch into ColumnVectorBatch instances and hands them to the downstream consumer:

@Override
protected void decodeBatch(OrcEncodedColumnBatch batch, Consumer<ColumnVectorBatch> downstreamConsumer) {
    long startTime = counters.startTimeCounter();
    int currentStripeIndex = batch.getBatchKey().stripeIx;
    boolean sameStripe = currentStripeIndex == previousStripeIndex;
    try {
        ConsumerStripeMetadata stripeMetadata = stripes.get(currentStripeIndex);
        // Get the non-null row count from the root column; it determines how many vector batches we emit
        int rgIdx = batch.getBatchKey().rgIx;
        long nonNullRowCount = -1;
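        // ALL_RGS means this batch covers the whole stripe; otherwise take the count for
        // the single row group from its row-index entry.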
        if (rgIdx == OrcEncodedColumnBatch.ALL_RGS) {
            nonNullRowCount = stripeMetadata.getRowCount();
        } else {
            OrcProto.RowIndexEntry rowIndex = stripeMetadata.getRowIndexEntry(0, rgIdx);
            nonNullRowCount = getRowCount(rowIndex);
        }
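        // E.g. nonNullRowCount = 2500 with DEFAULT_SIZE = 1024 gives maxBatchesRG = 3: two full
        // batches plus a final one of 2500 % 1024 = 452 rows; an exact multiple would make the
        // final batchSize 0, which breaks out of the loop early.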
        int maxBatchesRG = (int) ((nonNullRowCount / VectorizedRowBatch.DEFAULT_SIZE) + 1);
        int batchSize = VectorizedRowBatch.DEFAULT_SIZE;
        TypeDescription schema = fileMetadata.getSchema();
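        // Tree readers are built on the first batch and whenever the stripe changes;
        // within the same stripe the existing readers are only repositioned below.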
        if (columnReaders == null || !sameStripe) {
            int[] columnMapping = new int[schema.getChildren().size()];
            TreeReaderFactory.Context context = new TreeReaderFactory.ReaderContext()
                    .setSchemaEvolution(evolution)
                    .writerTimeZone(stripeMetadata.getWriterTimezone())
                    .skipCorrupt(skipCorrupt);
            StructTreeReader treeReader = EncodedTreeReaderFactory.createRootTreeReader(
                    schema, stripeMetadata.getEncodings(), batch, codec, context, columnMapping);
            this.columnReaders = treeReader.getChildReaders();
            this.columnMapping = Arrays.copyOf(columnMapping, columnReaders.length);
            positionInStreams(columnReaders, batch.getBatchKey(), stripeMetadata);
        } else {
            repositionInStreams(this.columnReaders, batch, sameStripe, stripeMetadata);
        }
        previousStripeIndex = currentStripeIndex;
        for (int i = 0; i < maxBatchesRG; i++) {
            // for last batch in row group, adjust the batch size
            if (i == maxBatchesRG - 1) {
                batchSize = (int) (nonNullRowCount % VectorizedRowBatch.DEFAULT_SIZE);
                if (batchSize == 0)
                    break;
            }
            ColumnVectorBatch cvb = cvbPool.take();
            // assert cvb.cols.length == batch.getColumnIxs().length; // Must be constant per split.
            cvb.size = batchSize;
            for (int idx = 0; idx < columnReaders.length; ++idx) {
                TreeReader reader = columnReaders[idx];
                if (cvb.cols[idx] == null) {
                    // ORC stores rows inside a root struct (Hive writes it this way).
                    // When we populate column vectors we skip over the root struct.
                    cvb.cols[idx] = createColumn(schema.getChildren().get(columnMapping[idx]), batchSize);
                }
                cvb.cols[idx].ensureSize(batchSize, false);
                reader.nextVector(cvb.cols[idx], null, batchSize);
            }
            // we are done reading a batch, send it to consumer for processing
            downstreamConsumer.consumeData(cvb);
            counters.incrCounter(LlapIOCounters.ROWS_EMITTED, batchSize);
        }
        LlapIoImpl.ORC_LOGGER.debug("Done with decode");
        counters.incrTimeCounter(LlapIOCounters.DECODE_TIME_NS, startTime);
        counters.incrCounter(LlapIOCounters.NUM_VECTOR_BATCHES, maxBatchesRG);
        counters.incrCounter(LlapIOCounters.NUM_DECODED_BATCHES);
    } catch (IOException e) {
        // Caller will return the batch.
        downstreamConsumer.setError(e);
    }
}
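
For context, here is a minimal, hypothetical sketch of a downstream consumer. It is not taken from the Hive sources: decodeBatch only exercises two callbacks (consumeData and setError), and the real org.apache.hadoop.hive.ql.io.orc.encoded.Consumer interface may declare further methods and checked exceptions, so the class below merely mirrors the call shape visible above with an illustrative row-counting body.

import org.apache.hadoop.hive.llap.io.api.impl.ColumnVectorBatch;

// Hypothetical consumer mirroring the two callbacks decodeBatch invokes; not the real interface.
class RowCountingConsumer {
    private long totalRows = 0;
    private Throwable error = null;

    // Invoked once per decoded batch; cvb.size is the row count set by decodeBatch.
    public void consumeData(ColumnVectorBatch cvb) {
        totalRows += cvb.size;
    }

    // Invoked instead of a throw when decodeBatch hits an IOException.
    public void setError(Throwable t) {
        this.error = t;
    }

    public long getTotalRows() {
        return totalRows;
    }

    public Throwable getError() {
        return error;
    }
}

Since decodeBatch takes each batch from cvbPool, a real consumer (or its caller) presumably also returns every ColumnVectorBatch to the pool once processed; the "Caller will return the batch" comment on the error path hints at the same ownership handoff.
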
Also used : OrcProto(org.apache.orc.OrcProto) TreeReader(org.apache.orc.impl.TreeReaderFactory.TreeReader) SettableTreeReader(org.apache.hadoop.hive.ql.io.orc.encoded.EncodedTreeReaderFactory.SettableTreeReader) StructTreeReader(org.apache.orc.impl.TreeReaderFactory.StructTreeReader) IOException(java.io.IOException) ConsumerStripeMetadata(org.apache.hadoop.hive.llap.io.metadata.ConsumerStripeMetadata) TypeDescription(org.apache.orc.TypeDescription) EncodedTreeReaderFactory(org.apache.hadoop.hive.ql.io.orc.encoded.EncodedTreeReaderFactory) TreeReaderFactory(org.apache.orc.impl.TreeReaderFactory) ColumnVectorBatch(org.apache.hadoop.hive.llap.io.api.impl.ColumnVectorBatch)

Aggregations

IOException (java.io.IOException) 1
ColumnVectorBatch (org.apache.hadoop.hive.llap.io.api.impl.ColumnVectorBatch) 1
ConsumerStripeMetadata (org.apache.hadoop.hive.llap.io.metadata.ConsumerStripeMetadata) 1
EncodedTreeReaderFactory (org.apache.hadoop.hive.ql.io.orc.encoded.EncodedTreeReaderFactory) 1
SettableTreeReader (org.apache.hadoop.hive.ql.io.orc.encoded.EncodedTreeReaderFactory.SettableTreeReader) 1
OrcProto (org.apache.orc.OrcProto) 1
TypeDescription (org.apache.orc.TypeDescription) 1
TreeReaderFactory (org.apache.orc.impl.TreeReaderFactory) 1
StructTreeReader (org.apache.orc.impl.TreeReaderFactory.StructTreeReader) 1
TreeReader (org.apache.orc.impl.TreeReaderFactory.TreeReader) 1