Search in sources:

Example 1 with Stream

use of org.apache.orc.OrcProto.Stream in project hive by apache.

the class EncodedReaderImpl method readIndexStreams.

/**
 * Reads the index streams (row indexes and bloom filters) of one stripe and stores the parsed
 * protobuf messages into {@code index}.
 *
 * <p>The method plans the disk ranges with {@code planIndexReading}, creates a
 * {@code ReadContext} for every column that is physically included or is a SARG column,
 * attaches the matching INDEX-area streams to those contexts, fetches the ranges through
 * {@code getDataFromCacheAndDisk}, and finally parses every stream into
 * {@code index.getRowGroupIndex()} or {@code index.getBloomFilterIndex()}.
 *
 * @param index receives the parsed row indexes and bloom filter indexes, addressed by column
 *     index; its {@code getBloomFilterKinds()} array is handed to {@code planIndexReading}
 * @param stripe the stripe being read; its offset is passed on as the base offset for reads
 * @param streams the streams of the stripe; stream offsets are derived by summing the stream
 *     lengths in list order
 * @param physicalFileIncludes per-column flags; its length determines how many column contexts
 *     are allocated
 * @param sargColumns per-column flags for columns involved in SARGs; may be {@code null}
 * @throws IOException if reading or parsing a stream fails (non-IO exceptions are wrapped), or
 *     if the final cleanup fails after an otherwise successful read
 */
@Override
public void readIndexStreams(OrcIndex index, StripeInformation stripe, List<OrcProto.Stream> streams, boolean[] physicalFileIncludes, boolean[] sargColumns) throws IOException {
    long stripeOffset = stripe.getOffset();
    // 1. Figure out which disk ranges hold the index streams we need.
    DiskRangeList indexRanges = planIndexReading(fileSchema, streams, true, physicalFileIncludes, sargColumns, version, index.getBloomFilterKinds());
    if (indexRanges == null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Nothing to read for stripe [" + stripe + "]");
        }
        return;
    }
    // One slot per column; slots of columns that are neither included nor SARG columns stay null.
    ReadContext[] colCtxs = new ReadContext[physicalFileIncludes.length];
    // Running index over the contexts actually created (pre-incremented, hence the -1 start).
    int colRgIx = -1;
    for (int i = 0; i < physicalFileIncludes.length; ++i) {
        if (!physicalFileIncludes[i] && (sargColumns == null || !sargColumns[i]))
            continue;
        colCtxs[i] = new ReadContext(i, ++colRgIx);
        if (isTracingEnabled) {
            LOG.trace("Creating context: " + colCtxs[i].toString());
        }
        // Bogus encoding.
        trace.logColumnRead(i, colRgIx, ColumnEncoding.Kind.DIRECT);
    }
    // Offset of the current stream relative to the stripe start; advanced for every stream,
    // including the ones that are skipped.
    long offset = 0;
    for (OrcProto.Stream stream : streams) {
        long length = stream.getLength();
        int colIx = stream.getColumn();
        OrcProto.Stream.Kind streamKind = stream.getKind();
        // See planIndexReading - only read non-row-index streams if involved in SARGs.
        // The condition implies that colCtxs[colIx] was created by the loop above.
        if ((StreamName.getArea(streamKind) == StreamName.Area.INDEX) && ((sargColumns != null && sargColumns[colIx]) || (physicalFileIncludes[colIx] && streamKind == Kind.ROW_INDEX))) {
            trace.logAddStream(colIx, streamKind, offset, length, -1, true);
            colCtxs[colIx].addStream(offset, stream, -1);
            if (isTracingEnabled) {
                LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
            }
        }
        offset += length;
    }
    boolean hasFileId = this.fileKey != null;
    // 2. Now, read all of the ranges from cache or disk.
    // Buffers collected here are released in the finally block below.
    IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
    MutateHelper toRead = getDataFromCacheAndDisk(indexRanges, stripeOffset, hasFileId, toRelease);
    // 3. For uncompressed case, we need some special processing before read.
    // NOTE(review): this call and the one above run outside the try/finally below, so the cleanup
    // there is skipped if either throws - confirm that is acceptable.
    DiskRangeList iter = preReadUncompressedStreams(stripeOffset, colCtxs, toRead, toRelease);
    // 4. Decompress the data.
    // Stays true until the whole loop completes; decides how cleanup failures are reported.
    boolean hasError = true;
    try {
        for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
            ReadContext ctx = colCtxs[colIx];
            // This column is not included.
            if (ctx == null)
                continue;
            for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
                StreamContext sctx = ctx.streams[streamIx];
                try {
                    if (isTracingEnabled) {
                        LOG.trace("Getting index stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length);
                    }
                    ColumnStreamData csd = POOLS.csdPool.take();
                    long endCOffset = sctx.offset + sctx.length;
                    // The whole stream is requested: from its start offset to its end offset.
                    DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, endCOffset, csd, endCOffset, sctx.offset, toRelease);
                    // Advance the shared list cursor only when the read reports a new position.
                    if (lastCached != null) {
                        iter = lastCached;
                    }
                    if (isTracingEnabled) {
                        traceLogBuffersUsedToParse(csd);
                    }
                    // Parse the protobuf message directly from the buffers of this stream.
                    CodedInputStream cis = CodedInputStream.newInstance(new IndexStream(csd.getCacheBuffers(), sctx.length));
                    cis.setSizeLimit(InStream.PROTOBUF_MESSAGE_MAX_LIMIT);
                    switch(sctx.kind) {
                        case ROW_INDEX:
                            OrcProto.RowIndex tmp = index.getRowGroupIndex()[colIx] = OrcProto.RowIndex.parseFrom(cis);
                            if (isTracingEnabled) {
                                LOG.trace("Index is " + tmp.toString().replace('\n', ' '));
                            }
                            break;
                        case BLOOM_FILTER:
                        case BLOOM_FILTER_UTF8:
                            // Both bloom filter kinds are stored in the same per-column slot.
                            index.getBloomFilterIndex()[colIx] = OrcProto.BloomFilterIndex.parseFrom(cis);
                            break;
                        default:
                            throw new AssertionError("Unexpected index stream type " + sctx.kind);
                    }
                    // We are done with the buffers; unlike data blocks, we are also the consumer. Release.
                    // NOTE(review): this release is only reached when parsing succeeded - confirm that the
                    // cleanup in the finally block covers these buffers on the error path.
                    for (MemoryBuffer buf : csd.getCacheBuffers()) {
                        if (buf == null)
                            continue;
                        cacheWrapper.releaseBuffer(buf);
                    }
                } catch (Exception ex) {
                    // Log the remaining disk ranges for diagnosis, then rethrow as IOException.
                    DiskRangeList drl = toRead == null ? null : toRead.next;
                    LOG.error("Error getting stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
                    throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
                }
            }
        }
        if (isTracingEnabled) {
            LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
        }
        hasError = false;
    } finally {
        // Release the unreleased buffers. See class comment about refcounts.
        // NOTE(review): toRead is dereferenced here without the null check used in the catch
        // block above - confirm getDataFromCacheAndDisk never returns null.
        try {
            releaseInitialRefcounts(toRead.next);
            releaseBuffers(toRelease.keySet(), true);
        } catch (Throwable t) {
            // A cleanup failure is only propagated when the read itself succeeded; otherwise it is
            // logged so that it does not mask the original exception.
            if (!hasError)
                throw new IOException(t);
            LOG.error("Error during the cleanup after another error; ignoring", t);
        }
    }
}
Also used : DiskRangeList(org.apache.hadoop.hive.common.io.DiskRangeList) CodedInputStream(com.google.protobuf.CodedInputStream) OrcProto(org.apache.orc.OrcProto) IdentityHashMap(java.util.IdentityHashMap) Stream(org.apache.orc.OrcProto.Stream) MutateHelper(org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper) Kind(org.apache.orc.OrcProto.Stream.Kind) OutStream(org.apache.orc.impl.OutStream) Stream(org.apache.orc.OrcProto.Stream) InStream(org.apache.orc.impl.InStream) CodedInputStream(com.google.protobuf.CodedInputStream) InputStream(java.io.InputStream) ColumnStreamData(org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer) IOException(java.io.IOException) MemoryBuffer(org.apache.hadoop.hive.common.io.encoded.MemoryBuffer)

Example 2 with Stream

use of org.apache.orc.OrcProto.Stream in project hive by apache.

the class EncodedReaderImpl method planIndexReading.

// TODO: temporary, need to expose from ORC utils (note the difference in null checks)
/**
 * Builds the list of disk ranges that cover the index streams worth reading.
 *
 * <p>Works in two passes over {@code streams}. The first pass (only when {@code sargColumns}
 * is non-null) records in {@code bloomFilterKinds} which bloom filter kind to use per SARG
 * column: {@code BLOOM_FILTER_UTF8} always wins, {@code BLOOM_FILTER} is only recorded when the
 * slot is still empty and the filter is not rejected through {@code hadBadBloomFilters}. The
 * second pass walks the streams while accumulating their offsets and adds a range for every
 * row index of an included column and for every bloom filter stream whose kind matches the one
 * recorded for its column.
 *
 * @param fileSchema schema used to look up the category of a column for the bad-filter check
 * @param streams streams in file order; offsets are the running sum of their lengths
 * @param ignoreNonUtf8BloomFilter whether {@code BLOOM_FILTER} streams flagged by
 *     {@code hadBadBloomFilters} are to be skipped
 * @param fileIncluded per-column include flags; {@code null} means every column is considered
 * @param sargColumns per-column SARG flags; {@code null} disables bloom filter reading
 * @param version writer version, forwarded to {@code hadBadBloomFilters}
 * @param bloomFilterKinds per-column output array, updated in place by the first pass
 * @return whatever {@code CreateHelper.get()} yields for the collected ranges
 */
static DiskRangeList planIndexReading(TypeDescription fileSchema, List<OrcProto.Stream> streams, boolean ignoreNonUtf8BloomFilter, boolean[] fileIncluded, boolean[] sargColumns, WriterVersion version, OrcProto.Stream.Kind[] bloomFilterKinds) {
    final boolean haveSargs = sargColumns != null;

    // Pass 1: picks bloom_filter_utf8 if its available, otherwise bloom_filter
    if (haveSargs) {
        for (OrcProto.Stream candidate : streams) {
            if (!candidate.hasKind() || !candidate.hasColumn()) {
                continue;
            }
            final int colId = candidate.getColumn();
            if (!sargColumns[colId]) {
                continue;
            }
            final OrcProto.Stream.Kind kind = candidate.getKind();
            if (kind == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8) {
                // The UTF8 variant unconditionally overrides anything recorded so far.
                bloomFilterKinds[colId] = OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
            } else if (kind == OrcProto.Stream.Kind.BLOOM_FILTER) {
                // The legacy variant is only a fallback, and only when it is not known to be bad.
                final boolean slotEmpty = bloomFilterKinds[colId] == null;
                if (slotEmpty && (!ignoreNonUtf8BloomFilter || !hadBadBloomFilters(fileSchema.findSubtype(colId).getCategory(), version))) {
                    bloomFilterKinds[colId] = OrcProto.Stream.Kind.BLOOM_FILTER;
                }
            }
        }
    }

    // Pass 2: translate the selected streams into (merged) disk ranges.
    final DiskRangeList.CreateHelper ranges = new DiskRangeList.CreateHelper();
    long streamStart = 0;
    for (OrcProto.Stream candidate : streams) {
        final long streamLength = candidate.getLength();
        if (candidate.hasKind() && candidate.hasColumn()) {
            final int colId = candidate.getColumn();
            if (fileIncluded == null || fileIncluded[colId]) {
                final OrcProto.Stream.Kind kind = candidate.getKind();
                final boolean wanted;
                if (kind == OrcProto.Stream.Kind.ROW_INDEX) {
                    wanted = true;
                } else if (kind == OrcProto.Stream.Kind.BLOOM_FILTER || kind == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8) {
                    wanted = haveSargs && bloomFilterKinds[colId] == kind;
                } else {
                    wanted = false;
                }
                if (wanted) {
                    ranges.addOrMerge(streamStart, streamStart + streamLength, true, false);
                }
            }
        }
        // Every stream advances the offset, whether or not it was selected.
        streamStart += streamLength;
    }
    return ranges.get();
}
Also used : CreateHelper(org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper) Stream(org.apache.orc.OrcProto.Stream) DiskRangeList(org.apache.hadoop.hive.common.io.DiskRangeList) CreateHelper(org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper) OrcProto(org.apache.orc.OrcProto)

Example 3 with Stream

use of org.apache.orc.OrcProto.Stream in project hive by apache.

the class EncodedReaderImpl method readEncodedColumns.

/**
 * Reads the data streams of the included columns of one stripe and passes them to
 * {@code consumer} as one {@code OrcEncodedColumnBatch} per row group that is not filtered out.
 *
 * <p>If no disk range needs to be read, a single batch initialised with
 * {@code OrcEncodedColumnBatch.ALL_RGS} is still sent to the consumer when at least one included
 * column has a non-data stream and {@code rgs} is {@code null}; otherwise a warning is logged.
 *
 * @param stripeIx index of the stripe, forwarded to every batch
 * @param stripe the stripe being read; supplies the base offset and the row count
 * @param indexes per-column row indexes; {@code indexes[i]} is stored in the context of column i
 * @param encodings per-column encodings, addressed by column index
 * @param streamList streams of the stripe; stream offsets are the running sum of their lengths
 * @param physicalFileIncludes per-column include flags; column 0 never gets a context
 * @param rgs per-row-group flags; {@code null} means no row group is filtered and whole streams
 *     are read
 * @param consumer receives the prepared batches
 * @throws IOException if reading a stream fails (non-IO exceptions are wrapped), if the thread is
 *     interrupted while handing a batch to the consumer, or if the final cleanup fails after an
 *     otherwise successful read
 */
@Override
public void readEncodedColumns(int stripeIx, StripeInformation stripe, OrcProto.RowIndex[] indexes, List<OrcProto.ColumnEncoding> encodings, List<OrcProto.Stream> streamList, boolean[] physicalFileIncludes, boolean[] rgs, Consumer<OrcEncodedColumnBatch> consumer) throws IOException {
    // Note: for now we don't have to setError here, caller will setError if we throw.
    // We are also not supposed to call setDone, since we are only part of the operation.
    long stripeOffset = stripe.getOffset();
    // 1. Figure out what we have to read.
    // Stream offset in relation to the stripe.
    long offset = 0;
    // 1.1. Figure out which columns have a present stream
    boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
    if (isTracingEnabled) {
        LOG.trace("The following columns have PRESENT streams: " + arrayToString(hasNull));
    }
    // We assume stream list is sorted by column and that non-data
    // streams do not interleave data streams for the same column.
    // 1.2. With that in mind, determine disk ranges to read/get from cache (not by stream).
    ColumnReadContext[] colCtxs = new ColumnReadContext[physicalFileIncludes.length];
    // Running index over the contexts actually created (pre-incremented, hence the -1 start).
    int colRgIx = -1;
    // Don't create context for the 0-s column.
    for (int i = 1; i < physicalFileIncludes.length; ++i) {
        if (!physicalFileIncludes[i])
            continue;
        ColumnEncoding enc = encodings.get(i);
        colCtxs[i] = new ColumnReadContext(i, enc, indexes[i], ++colRgIx);
        if (isTracingEnabled) {
            LOG.trace("Creating context: " + colCtxs[i].toString());
        }
        trace.logColumnRead(i, colRgIx, enc.getKind());
    }
    CreateHelper listToRead = new CreateHelper();
    boolean hasIndexOnlyCols = false;
    for (OrcProto.Stream stream : streamList) {
        long length = stream.getLength();
        int colIx = stream.getColumn();
        OrcProto.Stream.Kind streamKind = stream.getKind();
        // Skip streams of excluded columns and every stream outside the DATA area; the offset
        // still has to advance past them.
        if (!physicalFileIncludes[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) {
            // We have a stream for included column, but in future it might have no data streams.
            // It's more like "has at least one column included that has an index stream".
            hasIndexOnlyCols = hasIndexOnlyCols || physicalFileIncludes[colIx];
            if (isTracingEnabled) {
                LOG.trace("Skipping stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
            }
            trace.logSkipStream(colIx, streamKind, offset, length);
            offset += length;
            continue;
        }
        ColumnReadContext ctx = colCtxs[colIx];
        assert ctx != null;
        // Presumably the position of this stream's entries inside a row index entry; it is read back
        // below as sctx.streamIndexOffset - TODO confirm against addStream.
        int indexIx = RecordReaderUtils.getIndexPosition(ctx.encoding.getKind(), types.get(colIx).getKind(), streamKind, isCompressed, hasNull[colIx]);
        ctx.addStream(offset, stream, indexIx);
        if (isTracingEnabled) {
            LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length + ", index position " + indexIx);
        }
        // Without RG filtering, and for dictionary streams, the entire stream is read; otherwise
        // only the parts selected by rgs are added to the ranges.
        if (rgs == null || RecordReaderUtils.isDictionary(streamKind, encodings.get(colIx))) {
            trace.logAddStream(colIx, streamKind, offset, length, indexIx, true);
            RecordReaderUtils.addEntireStreamToRanges(offset, length, listToRead, true);
            if (isTracingEnabled) {
                LOG.trace("Will read whole stream " + streamKind + "; added to " + listToRead.getTail());
            }
        } else {
            trace.logAddStream(colIx, streamKind, offset, length, indexIx, false);
            RecordReaderUtils.addRgFilteredStreamToRanges(stream, rgs, isCompressed, indexes[colIx], encodings.get(colIx), types.get(colIx), bufferSize, hasNull[colIx], offset, length, listToRead, true);
        }
        offset += length;
    }
    boolean hasFileId = this.fileKey != null;
    if (listToRead.get() == null) {
        // TODO: there may be a bug here. Could there be partial RG filtering on index-only column?
        if (hasIndexOnlyCols && (rgs == null)) {
            // No data to read, but the consumer still gets one batch covering all row groups.
            OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
            ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, physicalFileIncludes.length);
            try {
                consumer.consumeData(ecb);
            } catch (InterruptedException e) {
                // NOTE(review): the interrupt flag is not restored before wrapping - confirm callers
                // do not rely on Thread.interrupted() afterwards.
                LOG.error("IO thread interrupted while queueing data");
                throw new IOException(e);
            }
        } else {
            LOG.warn("Nothing to read for stripe [" + stripe + "]");
        }
        return;
    }
    // 2. Now, read all of the ranges from cache or disk.
    // Buffers collected here are released in the outer finally block below.
    IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
    MutateHelper toRead = getDataFromCacheAndDisk(listToRead.get(), stripeOffset, hasFileId, toRelease);
    // 3. For uncompressed case, we need some special processing before read.
    // Basically, we are trying to create artificial, consistent ranges to cache, as there are
    // no CBs in an uncompressed file. At the end of this processing, the list would contain
    // either cache buffers, or buffers allocated by us and not cached (if we are only reading
    // parts of the data for some ranges and don't want to cache it). Both are represented by
    // CacheChunks, so the list is just CacheChunk-s from that point on.
    DiskRangeList iter = preReadUncompressedStreams(stripeOffset, colCtxs, toRead, toRelease);
    // 4. Finally, decompress data, map per RG, and return to caller.
    // We go by RG and not by column because that is how data is processed.
    // Stays true until every row group was handed over; decides how cleanup failures are reported.
    boolean hasError = true;
    try {
        // Number of row groups in the stripe, rounding up for a partial last group.
        int rgCount = (int) Math.ceil((double) stripe.getNumberOfRows() / rowIndexStride);
        for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
            if (rgs != null && !rgs[rgIx]) {
                // RG filtered.
                continue;
            }
            boolean isLastRg = rgIx == rgCount - 1;
            // Create the batch we will use to return data for this RG.
            OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
            trace.logStartRg(rgIx);
            // Cleared only once all streams of all columns were added to this batch.
            boolean hasErrorForEcb = true;
            try {
                ecb.init(fileKey, stripeIx, rgIx, physicalFileIncludes.length);
                for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
                    ColumnReadContext ctx = colCtxs[colIx];
                    // This column is not included.
                    if (ctx == null)
                        continue;
                    if (isTracingEnabled) {
                        LOG.trace("ctx: {} rgIx: {} isLastRg: {} rgCount: {}", ctx, rgIx, isLastRg, rgCount);
                    }
                    // Index entries of this RG and of the following one (null for the last RG).
                    OrcProto.RowIndexEntry index = ctx.rowIndex.getEntry(rgIx), nextIndex = isLastRg ? null : ctx.rowIndex.getEntry(rgIx + 1);
                    ecb.initOrcColumn(ctx.colIx);
                    trace.logStartCol(ctx.colIx);
                    for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
                        StreamContext sctx = ctx.streams[streamIx];
                        ColumnStreamData cb = null;
                        try {
                            if (RecordReaderUtils.isDictionary(sctx.kind, ctx.encoding)) {
                                // This stream is for entire stripe and needed for every RG; uncompress once and reuse.
                                if (isTracingEnabled) {
                                    LOG.trace("Getting stripe-level stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length);
                                }
                                trace.logStartStripeStream(sctx.kind);
                                if (sctx.stripeLevelStream == null) {
                                    sctx.stripeLevelStream = POOLS.csdPool.take();
                                    // We will be using this for each RG while also sending RGs to processing.
                                    // To avoid buffers being unlocked, run refcount one ahead; so each RG
                                    // processing will decref once, and the last one will unlock the buffers.
                                    sctx.stripeLevelStream.incRef();
                                    // For stripe-level streams we don't need the extra refcount on the block.
                                    // See class comment about refcounts.
                                    long unlockUntilCOffset = sctx.offset + sctx.length;
                                    DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, sctx.offset + sctx.length, sctx.stripeLevelStream, unlockUntilCOffset, sctx.offset, toRelease);
                                    if (lastCached != null) {
                                        iter = lastCached;
                                    }
                                }
                                // One reference per RG batch that receives this shared stream.
                                sctx.stripeLevelStream.incRef();
                                cb = sctx.stripeLevelStream;
                            } else {
                                // This stream can be separated by RG using index. Let's do that.
                                // Offset to where this RG begins.
                                long cOffset = sctx.offset + index.getPositions(sctx.streamIndexOffset);
                                // Offset relative to the beginning of the stream of where this RG ends.
                                long nextCOffsetRel = isLastRg ? sctx.length : nextIndex.getPositions(sctx.streamIndexOffset);
                                // Offset before which this RG is guaranteed to end. Can only be estimated.
                                // We estimate the same way for compressed and uncompressed for now.
                                long endCOffset = sctx.offset + RecordReaderUtils.estimateRgEndOffset(isCompressed, isLastRg, nextCOffsetRel, sctx.length, bufferSize);
                                // As we read, we can unlock initial refcounts for the buffers that end before
                                // the data that we need for this RG.
                                long unlockUntilCOffset = sctx.offset + nextCOffsetRel;
                                cb = createRgColumnStreamData(rgIx, isLastRg, ctx.colIx, sctx, cOffset, endCOffset, isCompressed, unlockUntilCOffset);
                                // The first read of a stream starts from the shared cursor; later reads
                                // resume from the position remembered for this stream.
                                boolean isStartOfStream = sctx.bufferIter == null;
                                DiskRangeList lastCached = readEncodedStream(stripeOffset, (isStartOfStream ? iter : sctx.bufferIter), cOffset, endCOffset, cb, unlockUntilCOffset, sctx.offset, toRelease);
                                if (lastCached != null) {
                                    sctx.bufferIter = iter = lastCached;
                                }
                            }
                            ecb.setStreamData(ctx.colIx, sctx.kind.getNumber(), cb);
                        } catch (Exception ex) {
                            // Log the remaining disk ranges for diagnosis, then rethrow as IOException.
                            DiskRangeList drl = toRead == null ? null : toRead.next;
                            LOG.error("Error getting stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
                            throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
                        }
                    }
                }
                hasErrorForEcb = false;
            } finally {
                // A partially filled batch is never sent to the consumer, so undo its refcounts here.
                if (hasErrorForEcb) {
                    releaseEcbRefCountsOnError(ecb);
                }
            }
            try {
                consumer.consumeData(ecb);
                // After this, the non-initial refcounts are the responsibility of the consumer.
            } catch (InterruptedException e) {
                // NOTE(review): the interrupt flag is not restored before wrapping - confirm callers
                // do not rely on Thread.interrupted() afterwards.
                LOG.error("IO thread interrupted while queueing data");
                releaseEcbRefCountsOnError(ecb);
                throw new IOException(e);
            }
        }
        if (isTracingEnabled) {
            LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
        }
        trace.logRanges(fileKey, stripeOffset, toRead.next, RangesSrc.PREREAD);
        hasError = false;
    } finally {
        try {
            // Release the unreleased stripe-level buffers. See class comment about refcounts.
            for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
                ColumnReadContext ctx = colCtxs[colIx];
                // This column is not included.
                if (ctx == null)
                    continue;
                for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
                    StreamContext sctx = ctx.streams[streamIx];
                    if (sctx == null || sctx.stripeLevelStream == null)
                        continue;
                    // Drop the "one ahead" reference taken when the stream was first read; buffers are
                    // only unlocked here if that was the last reference.
                    if (0 != sctx.stripeLevelStream.decRef())
                        continue;
                    // essentially the "consumer" refcount being released here.
                    for (MemoryBuffer buf : sctx.stripeLevelStream.getCacheBuffers()) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Unlocking {} at the end of processing", buf);
                        }
                        cacheWrapper.releaseBuffer(buf);
                    }
                }
            }
            releaseInitialRefcounts(toRead.next);
            // Release buffers as we are done with all the streams... also see toRelease comment.
            releaseBuffers(toRelease.keySet(), true);
        } catch (Throwable t) {
            // A cleanup failure is only propagated when the read itself succeeded; otherwise it is
            // logged so that it does not mask the original exception.
            if (!hasError)
                throw new IOException(t);
            LOG.error("Error during the cleanup after another error; ignoring", t);
        }
    }
}
Also used : DiskRangeList(org.apache.hadoop.hive.common.io.DiskRangeList) OrcProto(org.apache.orc.OrcProto) IdentityHashMap(java.util.IdentityHashMap) CreateHelper(org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper) Stream(org.apache.orc.OrcProto.Stream) MutateHelper(org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper) Kind(org.apache.orc.OrcProto.Stream.Kind) OutStream(org.apache.orc.impl.OutStream) Stream(org.apache.orc.OrcProto.Stream) InStream(org.apache.orc.impl.InStream) CodedInputStream(com.google.protobuf.CodedInputStream) InputStream(java.io.InputStream) ColumnStreamData(org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer) IOException(java.io.IOException) ColumnEncoding(org.apache.orc.OrcProto.ColumnEncoding) OrcEncodedColumnBatch(org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch) MemoryBuffer(org.apache.hadoop.hive.common.io.encoded.MemoryBuffer)

Aggregations

DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList)3 OrcProto (org.apache.orc.OrcProto)3 Stream (org.apache.orc.OrcProto.Stream)3 CodedInputStream (com.google.protobuf.CodedInputStream)2 IOException (java.io.IOException)2 InputStream (java.io.InputStream)2 ByteBuffer (java.nio.ByteBuffer)2 IdentityHashMap (java.util.IdentityHashMap)2 CreateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper)2 MutateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper)2 ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData)2 MemoryBuffer (org.apache.hadoop.hive.common.io.encoded.MemoryBuffer)2 Kind (org.apache.orc.OrcProto.Stream.Kind)2 InStream (org.apache.orc.impl.InStream)2 OutStream (org.apache.orc.impl.OutStream)2 OrcEncodedColumnBatch (org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch)1 ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding)1