Example 1 with Reader

Use of org.apache.hadoop.hive.ql.io.orc.encoded.Reader in project hive by apache.

From the class OrcEncodedDataReader, the method getOrcTailForPath:

/**
 * Looks up the metadata (ORC tail) for the given ORC file in the LLAP metadata cache, reading
 * and caching it in case of a cache miss.
 * @param path path of the ORC file
 * @param jobConf job configuration
 * @param tag cache tag to use when inserting the metadata into the cache
 * @param daemonConf LLAP daemon configuration
 * @param metadataCache LLAP metadata cache
 * @param fileKey file id of the ORC file; determined from the file system if null
 * @return the OrcTail for the file
 * @throws IOException
 */
public static OrcTail getOrcTailForPath(Path path, Configuration jobConf, CacheTag tag, Configuration daemonConf, MetadataCache metadataCache, Object fileKey) throws IOException {
    Supplier<FileSystem> fsSupplier = getFsSupplier(path, jobConf);
    if (fileKey == null) {
        fileKey = determineFileId(fsSupplier, path, daemonConf);
    }
    if (fileKey == null || metadataCache == null) {
        throw new IllegalCacheConfigurationException("LLAP metadata cache not available for path " + path.toString());
    }
    LlapBufferOrBuffers tailBuffers = metadataCache.getFileMetadata(fileKey);
    try {
        // Cache hit
        if (tailBuffers != null) {
            return getOrcTailFromLlapBuffers(tailBuffers);
        }
        // Cache miss
        throwIfCacheOnlyRead(HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_CACHE_ONLY));
        ReaderOptions opts = EncodedOrcFile.readerOptions(jobConf).filesystem(fsSupplier);
        Reader reader = EncodedOrcFile.createReader(path, opts);
        ByteBuffer tailBufferBb = reader.getSerializedFileFooter();
        tailBuffers = metadataCache.putFileMetadata(fileKey, tailBufferBb, tag, new AtomicBoolean(false));
        return getOrcTailFromLlapBuffers(tailBuffers);
    } finally {
        // By this time, the buffers have been locked either at cache lookup or at cache insert time.
        if (tailBuffers != null) {
            metadataCache.decRefBuffer(tailBuffers);
        }
    }
}
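
A minimal usage sketch (not part of the Hive sources): calling getOrcTailForPath from code that already holds a MetadataCache instance. The class name OrcTailLookupSketch and the helper lookupTail are hypothetical; the configurations, cache and tag are assumed to be provided by the surrounding LLAP daemon code, and passing a null fileKey lets the method determine the file id itself.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.CacheTag;
import org.apache.hadoop.hive.llap.io.encoded.OrcEncodedDataReader;
import org.apache.hadoop.hive.llap.io.metadata.MetadataCache;
import org.apache.orc.impl.OrcTail;

public class OrcTailLookupSketch {
    static OrcTail lookupTail(Path orcFile, Configuration jobConf, Configuration daemonConf,
            MetadataCache metadataCache, CacheTag tag) throws IOException {
        // A null fileKey makes getOrcTailForPath determine the file id from the file system.
        return OrcEncodedDataReader.getOrcTailForPath(
                orcFile, jobConf, tag, daemonConf, metadataCache, /* fileKey */ null);
    }
}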
Also used : ReaderOptions(org.apache.hadoop.hive.ql.io.orc.OrcFile.ReaderOptions) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) FileSystem(org.apache.hadoop.fs.FileSystem) Reader(org.apache.hadoop.hive.ql.io.orc.encoded.Reader) LlapDataReader(org.apache.hadoop.hive.ql.io.orc.encoded.LlapDataReader) EncodedReader(org.apache.hadoop.hive.ql.io.orc.encoded.EncodedReader) IllegalCacheConfigurationException(org.apache.hadoop.hive.llap.IllegalCacheConfigurationException) LlapBufferOrBuffers(org.apache.hadoop.hive.llap.io.metadata.MetadataCache.LlapBufferOrBuffers) ByteBuffer(java.nio.ByteBuffer)

Example 2 with Reader

Use of org.apache.hadoop.hive.ql.io.orc.encoded.Reader in project hive by apache.

From the class LlapRecordReader, the method next:

@Override
public boolean next(NullWritable key, VectorizedRowBatch vrb) throws IOException {
    assert vrb != null;
    if (isClosed) {
        throw new AssertionError("next called after close");
    }
    // Add partition cols if necessary (see VectorizedOrcInputFormat for details).
    boolean wasFirst = isFirst;
    if (isFirst) {
        if (partitionValues != null) {
            rbCtx.addPartitionColsToBatch(vrb, partitionValues);
        }
        isFirst = false;
    }
    ColumnVectorBatch cvb;
    try {
        cvb = nextCvb();
    } catch (InterruptedException e) {
        // Query might have been canceled. Stop the background processing.
        feedback.stop();
        // In case we are stuck in consume.
        isInterrupted = true;
        throw new IOException(e);
    }
    if (cvb == null) {
        if (wasFirst) {
            firstReturnTime = counters.startTimeCounter();
        }
        counters.incrWallClockCounter(LlapIOCounters.CONSUMER_TIME_NS, firstReturnTime);
        return false;
    }
    if (isAcidFormat) {
        // why?
        vrb.selectedInUse = true;
        if (isVectorized) {
            // TODO: relying everywhere on the magical constants and columns being together means ACID
            // columns are going to be super hard to change in a backward compat manner. I can
            // foresee someone cursing while refactoring all the magic for prefix schema changes.
            /*
             * Acid meta cols are always either all included or all excluded; the width of 'cvb'
             * changes accordingly, so 'acidColCount' and 'ixInVrb' need to be adjusted.
             * See {@link IncludesImpl} comments.
             */
            // Exclude the row column.
            int acidColCount = acidReader.includeAcidColumns() ? OrcInputFormat.getRootColumn(false) - 1 : 0;
            ensureAcidInputVrb(acidColCount, vrb.getDataColumnCount());
            // By assumption, ACID columns are currently always in the beginning of the arrays.
            System.arraycopy(cvb.cols, 0, acidInputVrb.cols, 0, acidColCount);
            for (int ixInReadSet = acidColCount; ixInReadSet < cvb.cols.length; ++ixInReadSet) {
                int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet) - (acidReader.includeAcidColumns() ? 0 : OrcRecordUpdater.ROW);
                cvb.swapColumnVector(ixInReadSet, acidInputVrb.cols, ixInVrb);
            }
            acidInputVrb.size = cvb.size;
            acidReader.setBaseAndInnerReader(new AcidWrapper(acidInputVrb));
            acidReader.next(NullWritable.get(), vrb);
        } else {
            // TODO: WTF? The old code seems to just drop the ball here.
            throw new AssertionError("Unsupported mode");
        }
    } else {
        List<Integer> logicalOrderedColumnIds = includes.getLogicalOrderedColumnIds();
        long cvbColsPresent = Arrays.stream(cvb.cols).filter(Objects::nonNull).count();
        if (logicalOrderedColumnIds.size() != cvbColsPresent) {
            throw new RuntimeException("Unexpected number of columns, VRB has " + logicalOrderedColumnIds.size() + " included, but the reader returned " + cvbColsPresent);
        }
        // Columns missing from the file schema are dealt with later.
        for (int ixInReadSet = 0; ixInReadSet < cvbColsPresent; ++ixInReadSet) {
            int ixInVrb = logicalOrderedColumnIds.get(ixInReadSet);
            cvb.swapColumnVector(ixInReadSet, vrb.cols, ixInVrb);
        }
        // null out col vectors for which the (ORC) file had no data
        List<Integer> missingColIndices = includes.getReaderLogicalColumnIds().stream().filter(idx -> !includes.getLogicalOrderedColumnIds().contains(idx)).collect(toList());
        if (missingColIndices.size() != (cvb.cols.length - cvbColsPresent)) {
            throw new RuntimeException("Unexpected number of missing columns, expected " + missingColIndices.size() + ", but reader returned " + (cvb.cols.length - cvbColsPresent) + " missing column vectors.");
        }
        for (int index : missingColIndices) {
            vrb.cols[index].noNulls = false;
            vrb.cols[index].isRepeating = true;
            vrb.cols[index].isNull[0] = true;
        }
        // why?
        vrb.selectedInUse = false;
        vrb.size = cvb.size;
    }
    if (wasFirst) {
        firstReturnTime = counters.startTimeCounter();
    }
    if (bucketIdentifier != null) {
        rbCtx.setBucketAndWriteIdOf(vrb, bucketIdentifier);
    }
    return true;
}
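
A minimal consumption sketch (not part of the Hive sources): draining the record reader by calling next() with a reusable VectorizedRowBatch until it returns false. The class name LlapBatchDrainSketch and the helper countRows are hypothetical; the reader and the VectorizedRowBatchCtx are assumed to have been set up elsewhere (for example by the LLAP input format).

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

public class LlapBatchDrainSketch {
    static long countRows(RecordReader<NullWritable, VectorizedRowBatch> reader,
            VectorizedRowBatchCtx rbCtx) throws IOException {
        // Reuse one batch across next() calls; each successful call refills its column vectors.
        VectorizedRowBatch vrb = rbCtx.createVectorizedRowBatch();
        long rows = 0;
        while (reader.next(NullWritable.get(), vrb)) {
            rows += vrb.size;
        }
        reader.close();
        return rows;
    }
}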
Also used : OrcRecordUpdater(org.apache.hadoop.hive.ql.io.orc.OrcRecordUpdater) Arrays(java.util.Arrays) SchemaEvolution(org.apache.orc.impl.SchemaEvolution) NullWritable(org.apache.hadoop.io.NullWritable) Includes(org.apache.hadoop.hive.llap.io.decode.ColumnVectorProducer.Includes) ConfVars(org.apache.hadoop.hive.conf.HiveConf.ConfVars) ColumnProjectionUtils(org.apache.hadoop.hive.serde2.ColumnProjectionUtils) LoggerFactory(org.slf4j.LoggerFactory) Reader(org.apache.hadoop.hive.ql.io.orc.encoded.Reader) Matcher(java.util.regex.Matcher) FileSplit(org.apache.hadoop.mapred.FileSplit) SchemaEvolutionFactory(org.apache.hadoop.hive.llap.io.decode.ColumnVectorProducer.SchemaEvolutionFactory) OrcConf(org.apache.orc.OrcConf) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) InputFormat(org.apache.hadoop.mapred.InputFormat) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) FragmentCountersMap(org.apache.hadoop.hive.llap.counters.FragmentCountersMap) LlapTezUtils(org.apache.hadoop.hive.llap.tezplugins.LlapTezUtils) StatsRecordingThreadPool(org.apache.hadoop.hive.llap.daemon.impl.StatsRecordingThreadPool) Objects(java.util.Objects) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) List(java.util.List) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) VectorizedOrcAcidRowBatchReader(org.apache.hadoop.hive.ql.io.orc.VectorizedOrcAcidRowBatchReader) RecordReader(org.apache.hadoop.mapred.RecordReader) Pattern(java.util.regex.Pattern) QueryFragmentCounters(org.apache.hadoop.hive.llap.counters.QueryFragmentCounters) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ConvertAstToSearchArg(org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg) LlapIOCounters(org.apache.hadoop.hive.llap.counters.LlapIOCounters) HashMap(java.util.HashMap) BucketIdentifier(org.apache.hadoop.hive.ql.io.BucketIdentifier) AtomicReference(java.util.concurrent.atomic.AtomicReference) OrcSplit(org.apache.hadoop.hive.ql.io.orc.OrcSplit) ArrayList(java.util.ArrayList) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) Category(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category) LinkedList(java.util.LinkedList) ConsumerFeedback(org.apache.hadoop.hive.llap.ConsumerFeedback) ExecutorService(java.util.concurrent.ExecutorService) ReadPipeline(org.apache.hadoop.hive.llap.io.decode.ReadPipeline) Logger(org.slf4j.Logger) Reporter(org.apache.hadoop.mapred.Reporter) HiveConf(org.apache.hadoop.hive.conf.HiveConf) VectorizedRowBatchCtx(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Consumer(org.apache.hadoop.hive.ql.io.orc.encoded.Consumer) TimeUnit(java.util.concurrent.TimeUnit) LlapHiveUtils(org.apache.hadoop.hive.llap.LlapHiveUtils) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) JobConf(org.apache.hadoop.mapred.JobConf) TezCounters(org.apache.tez.common.counters.TezCounters) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) Collectors.toList(java.util.stream.Collectors.toList) LlapHiveUtils.throwIfCacheOnlyRead(org.apache.hadoop.hive.llap.LlapHiveUtils.throwIfCacheOnlyRead) MDC(org.slf4j.MDC) Deserializer(org.apache.hadoop.hive.serde2.Deserializer) 
VisibleForTesting(com.google.common.annotations.VisibleForTesting) ColumnVectorProducer(org.apache.hadoop.hive.llap.io.decode.ColumnVectorProducer)

Aggregations

Reader (org.apache.hadoop.hive.ql.io.orc.encoded.Reader)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 IOException (java.io.IOException)1 ByteBuffer (java.nio.ByteBuffer)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 HashMap (java.util.HashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 Objects (java.util.Objects)1 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)1 ExecutorService (java.util.concurrent.ExecutorService)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 Collectors.toList (java.util.stream.Collectors.toList)1 Configuration (org.apache.hadoop.conf.Configuration)1