Example 1 with BucketIdentifier

Use of org.apache.hadoop.hive.ql.io.BucketIdentifier in project hive by apache, taken from the class LlapRecordReader, method next(NullWritable, VectorizedRowBatch). The method pulls a ColumnVectorBatch from the LLAP read pipeline, maps its column vectors into the caller's VectorizedRowBatch (with a dedicated path for ACID reads), and finally stamps the batch with bucket and write-id information when a BucketIdentifier is present.

@Override
public boolean next(NullWritable key, VectorizedRowBatch vrb) throws IOException {
    assert vrb != null;
    if (isClosed) {
        throw new AssertionError("next called after close");
    }
    // Add partition cols if necessary (see VectorizedOrcInputFormat for details).
    boolean wasFirst = isFirst;
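    // Partition values are constant for the whole split, and the mapred framework
    // reuses the same VRB instance across next() calls, so the repeating partition
    // column vectors only need to be filled in on the first batch.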
    if (isFirst) {
        if (partitionValues != null) {
            rbCtx.addPartitionColsToBatch(vrb, partitionValues);
        }
        isFirst = false;
    }
    ColumnVectorBatch cvb;
    try {
        cvb = nextCvb();
    } catch (InterruptedException e) {
        // Query might have been canceled. Stop the background processing.
        feedback.stop();
        // In case we are stuck in consume.
        isInterrupted = true;
        throw new IOException(e);
    }
    if (cvb == null) {
        if (wasFirst) {
            firstReturnTime = counters.startTimeCounter();
        }
        counters.incrWallClockCounter(LlapIOCounters.CONSUMER_TIME_NS, firstReturnTime);
        return false;
    }
    if (isAcidFormat) {
        // why?
        vrb.selectedInUse = true;
        if (isVectorized) {
            // TODO: relying everywhere on the magical constants and columns being together means ACID
            // columns are going to be super hard to change in a backward compat manner. I can
            // foresee someone cursing while refactoring all the magic for prefix schema changes.
            /*
             * ACID meta columns are always either all included or all excluded;
             * the width of 'cvb' changes accordingly, so 'acidColCount' and
             * 'ixInVrb' need to be adjusted. See {@link IncludesImpl} comments.
             */
            // Exclude the row column.
            int acidColCount = acidReader.includeAcidColumns() ? OrcInputFormat.getRootColumn(false) - 1 : 0;
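            // With the meta columns included, acidColCount == OrcRecordUpdater.ROW == 5,
            // i.e. operation, originalTransaction, bucket, rowId and currentTransaction
            // (getRootColumn(false) returns ROW + 1, hence the "- 1" above).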
            ensureAcidInputVrb(acidColCount, vrb.getDataColumnCount());
            // By assumption, ACID columns are currently always in the beginning of the arrays.
            System.arraycopy(cvb.cols, 0, acidInputVrb.cols, 0, acidColCount);
            for (int ixInReadSet = acidColCount; ixInReadSet < cvb.cols.length; ++ixInReadSet) {
                int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet) - (acidReader.includeAcidColumns() ? 0 : OrcRecordUpdater.ROW);
                cvb.swapColumnVector(ixInReadSet, acidInputVrb.cols, ixInVrb);
            }
            acidInputVrb.size = cvb.size;
            acidReader.setBaseAndInnerReader(new AcidWrapper(acidInputVrb));
            acidReader.next(NullWritable.get(), vrb);
        } else {
            // TODO: the old code seems to have silently dropped this case; fail fast instead.
            throw new AssertionError("Unsupported mode");
        }
    } else {
        List<Integer> logicalOrderedColumnIds = includes.getLogicalOrderedColumnIds();
        long cvbColsPresent = Arrays.stream(cvb.cols).filter(Objects::nonNull).count();
        if (logicalOrderedColumnIds.size() != cvbColsPresent) {
            throw new RuntimeException("Unexpected number of columns, VRB has " + logicalOrderedColumnIds.size() + " included, but the reader returned " + cvbColsPresent);
        }
        // Columns included in the read but missing from the file schema are dealt with later.
        for (int ixInReadSet = 0; ixInReadSet < cvbColsPresent; ++ixInReadSet) {
            int ixInVrb = logicalOrderedColumnIds.get(ixInReadSet);
            cvb.swapColumnVector(ixInReadSet, vrb.cols, ixInVrb);
        }
        // null out col vectors for which the (ORC) file had no data
        List<Integer> missingColIndices = includes.getReaderLogicalColumnIds().stream().filter(idx -> !includes.getLogicalOrderedColumnIds().contains(idx)).collect(toList());
        if (missingColIndices.size() != (cvb.cols.length - cvbColsPresent)) {
            throw new RuntimeException("Unexpected number of missing columns, expected " + missingColIndices.size() + ", but reader returned " + (cvb.cols.length - cvbColsPresent) + " missing column vectors.");
        }
        for (int index : missingColIndices) {
            vrb.cols[index].noNulls = false;
            vrb.cols[index].isRepeating = true;
            vrb.cols[index].isNull[0] = true;
        }
        // why?
        vrb.selectedInUse = false;
        vrb.size = cvb.size;
    }
    if (wasFirst) {
        firstReturnTime = counters.startTimeCounter();
    }
    if (bucketIdentifier != null) {
        rbCtx.setBucketAndWriteIdOf(vrb, bucketIdentifier);
    }
    return true;
}
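For context, the bucketIdentifier consulted at the end of next(...) is derived from the split's file path when the reader is constructed. Below is a minimal sketch of that wiring, assuming the BucketIdentifier.from(Configuration, Path) factory and using illustrative field and variable names (job, fileSplit):

// Sketch: the BucketIdentifier is parsed from the bucket file's path once per split.
// from(...) returns null when the path carries no bucket/write-id information,
// which is why next(...) null-checks before calling setBucketAndWriteIdOf.
this.bucketIdentifier = BucketIdentifier.from(job, fileSplit.getPath());

A typical consumer then drives the reader through the standard mapred RecordReader contract, reusing the value object across calls (process(...) is a placeholder):

NullWritable key = reader.createKey();
VectorizedRowBatch value = reader.createValue();
while (reader.next(key, value)) {
    // value.size rows are populated; the column vectors live in value.cols
    process(value);
}
reader.close();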