Example 21 with DiskRangeList

Use of org.apache.hadoop.hive.common.io.DiskRangeList in project hive by apache.

From class EncodedReaderImpl, method readIndexStreams:

@Override
public void readIndexStreams(OrcIndex index, StripeInformation stripe, List<OrcProto.Stream> streams, boolean[] physicalFileIncludes, boolean[] sargColumns) throws IOException {
    long stripeOffset = stripe.getOffset();
    DiskRangeList indexRanges = planIndexReading(fileSchema, streams, true, physicalFileIncludes, sargColumns, version, index.getBloomFilterKinds());
    if (indexRanges == null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Nothing to read for stripe [" + stripe + "]");
        }
        return;
    }
    ReadContext[] colCtxs = new ReadContext[physicalFileIncludes.length];
    int colRgIx = -1;
    for (int i = 0; i < physicalFileIncludes.length; ++i) {
        if (!physicalFileIncludes[i] && (sargColumns == null || !sargColumns[i]))
            continue;
        colCtxs[i] = new ReadContext(i, ++colRgIx);
        if (isTracingEnabled) {
            LOG.trace("Creating context: " + colCtxs[i].toString());
        }
        // Bogus encoding: index reads have no real column encoding, so log DIRECT as a placeholder.
        trace.logColumnRead(i, colRgIx, ColumnEncoding.Kind.DIRECT);
    }
    long offset = 0;
    for (OrcProto.Stream stream : streams) {
        long length = stream.getLength();
        int colIx = stream.getColumn();
        OrcProto.Stream.Kind streamKind = stream.getKind();
        // See planIndexReading - only read non-row-index streams if involved in SARGs.
        if ((StreamName.getArea(streamKind) == StreamName.Area.INDEX) && ((sargColumns != null && sargColumns[colIx]) || (physicalFileIncludes[colIx] && streamKind == Kind.ROW_INDEX))) {
            trace.logAddStream(colIx, streamKind, offset, length, -1, true);
            colCtxs[colIx].addStream(offset, stream, -1);
            if (isTracingEnabled) {
                LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
            }
        }
        offset += length;
    }
    boolean hasFileId = this.fileKey != null;
    // 2. Now, read all of the ranges from cache or disk.
    IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
    MutateHelper toRead = getDataFromCacheAndDisk(indexRanges, stripeOffset, hasFileId, toRelease);
    // 3. For uncompressed case, we need some special processing before read.
    DiskRangeList iter = preReadUncompressedStreams(stripeOffset, colCtxs, toRead, toRelease);
    // 4. Decompress the data.
    boolean hasError = true;
    try {
        for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
            ReadContext ctx = colCtxs[colIx];
            // This column is not included.
            if (ctx == null)
                continue;
            for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
                StreamContext sctx = ctx.streams[streamIx];
                try {
                    if (isTracingEnabled) {
                        LOG.trace("Getting index stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length);
                    }
                    ColumnStreamData csd = POOLS.csdPool.take();
                    long endCOffset = sctx.offset + sctx.length;
                    DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, endCOffset, csd, endCOffset, sctx.offset, toRelease);
                    if (lastCached != null) {
                        iter = lastCached;
                    }
                    if (isTracingEnabled) {
                        traceLogBuffersUsedToParse(csd);
                    }
                    CodedInputStream cis = CodedInputStream.newInstance(new IndexStream(csd.getCacheBuffers(), sctx.length));
                    cis.setSizeLimit(InStream.PROTOBUF_MESSAGE_MAX_LIMIT);
                    switch(sctx.kind) {
                        case ROW_INDEX:
                            OrcProto.RowIndex tmp = index.getRowGroupIndex()[colIx] = OrcProto.RowIndex.parseFrom(cis);
                            if (isTracingEnabled) {
                                LOG.trace("Index is " + tmp.toString().replace('\n', ' '));
                            }
                            break;
                        case BLOOM_FILTER:
                        case BLOOM_FILTER_UTF8:
                            index.getBloomFilterIndex()[colIx] = OrcProto.BloomFilterIndex.parseFrom(cis);
                            break;
                        default:
                            throw new AssertionError("Unexpected index stream type " + sctx.kind);
                    }
                    // We are done with the buffers; unlike data blocks, we are also the consumer. Release.
                    for (MemoryBuffer buf : csd.getCacheBuffers()) {
                        if (buf == null)
                            continue;
                        cacheWrapper.releaseBuffer(buf);
                    }
                } catch (Exception ex) {
                    DiskRangeList drl = toRead == null ? null : toRead.next;
                    LOG.error("Error getting stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
                    throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
                }
            }
        }
        if (isTracingEnabled) {
            LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
        }
        hasError = false;
    } finally {
        // Release the unreleased buffers. See class comment about refcounts.
        try {
            releaseInitialRefcounts(toRead.next);
            releaseBuffers(toRelease.keySet(), true);
        } catch (Throwable t) {
            if (!hasError)
                throw new IOException(t);
            LOG.error("Error during the cleanup after another error; ignoring", t);
        }
    }
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), CodedInputStream (com.google.protobuf.CodedInputStream), OrcProto (org.apache.orc.OrcProto), IdentityHashMap (java.util.IdentityHashMap), Stream (org.apache.orc.OrcProto.Stream), MutateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper), Kind (org.apache.orc.OrcProto.Stream.Kind), OutStream (org.apache.orc.impl.OutStream), InStream (org.apache.orc.impl.InStream), InputStream (java.io.InputStream), ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData), IOException (java.io.IOException), ByteBuffer (java.nio.ByteBuffer), MemoryBuffer (org.apache.hadoop.hive.common.io.encoded.MemoryBuffer)
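
The stream-selection predicate in readIndexStreams is dense; below is a minimal standalone sketch of the same condition. The helper name shouldReadIndexStream is invented for illustration; StreamName.getArea is the ORC utility used in the method above.

import org.apache.orc.OrcProto;
import org.apache.orc.impl.StreamName;

class IndexStreamSelection {
    // Illustrative helper (not Hive API): read ROW_INDEX streams for included
    // columns, plus index-area streams (e.g. bloom filters) for SARG columns.
    static boolean shouldReadIndexStream(OrcProto.Stream.Kind kind, int colIx,
            boolean[] physicalFileIncludes, boolean[] sargColumns) {
        if (StreamName.getArea(kind) != StreamName.Area.INDEX) return false;
        boolean neededForSarg = sargColumns != null && sargColumns[colIx];
        boolean neededForRowIndex = physicalFileIncludes[colIx]
            && kind == OrcProto.Stream.Kind.ROW_INDEX;
        return neededForSarg || neededForRowIndex;
    }
}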

Example 22 with DiskRangeList

Use of org.apache.hadoop.hive.common.io.DiskRangeList in project hive by apache.

From class EncodedReaderImpl, method planIndexReading:

// TODO: temporary, need to expose from ORC utils (note the difference in null checks)
static DiskRangeList planIndexReading(TypeDescription fileSchema, List<OrcProto.Stream> streams, boolean ignoreNonUtf8BloomFilter, boolean[] fileIncluded, boolean[] sargColumns, WriterVersion version, OrcProto.Stream.Kind[] bloomFilterKinds) {
    DiskRangeList.CreateHelper result = new DiskRangeList.CreateHelper();
    // Picks bloom_filter_utf8 if it's available, otherwise bloom_filter.
    if (sargColumns != null) {
        for (OrcProto.Stream stream : streams) {
            if (stream.hasKind() && stream.hasColumn()) {
                int column = stream.getColumn();
                if (sargColumns[column]) {
                    switch(stream.getKind()) {
                        case BLOOM_FILTER:
                            if (bloomFilterKinds[column] == null && !(ignoreNonUtf8BloomFilter && hadBadBloomFilters(fileSchema.findSubtype(column).getCategory(), version))) {
                                bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER;
                            }
                            break;
                        case BLOOM_FILTER_UTF8:
                            bloomFilterKinds[column] = OrcProto.Stream.Kind.BLOOM_FILTER_UTF8;
                            break;
                        default:
                            break;
                    }
                }
            }
        }
    }
    long offset = 0;
    for (OrcProto.Stream stream : streams) {
        if (stream.hasKind() && stream.hasColumn()) {
            int column = stream.getColumn();
            if (fileIncluded == null || fileIncluded[column]) {
                boolean needStream = false;
                switch(stream.getKind()) {
                    case ROW_INDEX:
                        needStream = true;
                        break;
                    case BLOOM_FILTER:
                    case BLOOM_FILTER_UTF8:
                        needStream = (sargColumns != null) && (bloomFilterKinds[column] == stream.getKind());
                        break;
                    default:
                        // PASS
                        break;
                }
                if (needStream) {
                    result.addOrMerge(offset, offset + stream.getLength(), true, false);
                }
            }
        }
        offset += stream.getLength();
    }
    return result.get();
}
Also used: CreateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper), Stream (org.apache.orc.OrcProto.Stream), DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), OrcProto (org.apache.orc.OrcProto)
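
As a usage note, DiskRangeList.CreateHelper coalesces adjacent ranges when doMerge is true, which is why planIndexReading can add streams one by one and still end up with few large reads. A minimal sketch, assuming the addOrMerge(offset, end, doMerge, doLogNew) signature seen in the call above; the offsets are made up for illustration:

import org.apache.hadoop.hive.common.io.DiskRangeList;
import org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper;

public class CreateHelperSketch {
    public static void main(String[] args) {
        CreateHelper helper = new CreateHelper();
        // Adjacent ranges merge into one read.
        helper.addOrMerge(0, 100, true, false);
        helper.addOrMerge(100, 250, true, false);
        // A gap starts a new range.
        helper.addOrMerge(400, 500, true, false);
        for (DiskRangeList r = helper.get(); r != null; r = r.next) {
            System.out.println(r.getOffset() + " to " + r.getEnd());
        }
        // Expected: one range covering 0 to 250, then one covering 400 to 500.
    }
}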

Example 23 with DiskRangeList

Use of org.apache.hadoop.hive.common.io.DiskRangeList in project hive by apache.

From class EncodedReaderImpl, method readEncodedColumns:

@Override
public void readEncodedColumns(int stripeIx, StripeInformation stripe, OrcProto.RowIndex[] indexes, List<OrcProto.ColumnEncoding> encodings, List<OrcProto.Stream> streamList, boolean[] physicalFileIncludes, boolean[] rgs, Consumer<OrcEncodedColumnBatch> consumer) throws IOException {
    // Note: for now we don't have to setError here, caller will setError if we throw.
    // We are also not supposed to call setDone, since we are only part of the operation.
    long stripeOffset = stripe.getOffset();
    // 1. Figure out what we have to read.
    // Stream offset in relation to the stripe.
    long offset = 0;
    // 1.1. Figure out which columns have a present stream
    boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
    if (isTracingEnabled) {
        LOG.trace("The following columns have PRESENT streams: " + arrayToString(hasNull));
    }
    // We assume stream list is sorted by column and that non-data
    // streams do not interleave data streams for the same column.
    // 1.2. With that in mind, determine disk ranges to read/get from cache (not by stream).
    ColumnReadContext[] colCtxs = new ColumnReadContext[physicalFileIncludes.length];
    int colRgIx = -1;
    // Don't create context for the 0-s column.
    for (int i = 1; i < physicalFileIncludes.length; ++i) {
        if (!physicalFileIncludes[i])
            continue;
        ColumnEncoding enc = encodings.get(i);
        colCtxs[i] = new ColumnReadContext(i, enc, indexes[i], ++colRgIx);
        if (isTracingEnabled) {
            LOG.trace("Creating context: " + colCtxs[i].toString());
        }
        trace.logColumnRead(i, colRgIx, enc.getKind());
    }
    CreateHelper listToRead = new CreateHelper();
    boolean hasIndexOnlyCols = false;
    for (OrcProto.Stream stream : streamList) {
        long length = stream.getLength();
        int colIx = stream.getColumn();
        OrcProto.Stream.Kind streamKind = stream.getKind();
        if (!physicalFileIncludes[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) {
            // We have a stream for included column, but in future it might have no data streams.
            // It's more like "has at least one column included that has an index stream".
            hasIndexOnlyCols = hasIndexOnlyCols || physicalFileIncludes[colIx];
            if (isTracingEnabled) {
                LOG.trace("Skipping stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
            }
            trace.logSkipStream(colIx, streamKind, offset, length);
            offset += length;
            continue;
        }
        ColumnReadContext ctx = colCtxs[colIx];
        assert ctx != null;
        int indexIx = RecordReaderUtils.getIndexPosition(ctx.encoding.getKind(), types.get(colIx).getKind(), streamKind, isCompressed, hasNull[colIx]);
        ctx.addStream(offset, stream, indexIx);
        if (isTracingEnabled) {
            LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length + ", index position " + indexIx);
        }
        if (rgs == null || RecordReaderUtils.isDictionary(streamKind, encodings.get(colIx))) {
            trace.logAddStream(colIx, streamKind, offset, length, indexIx, true);
            RecordReaderUtils.addEntireStreamToRanges(offset, length, listToRead, true);
            if (isTracingEnabled) {
                LOG.trace("Will read whole stream " + streamKind + "; added to " + listToRead.getTail());
            }
        } else {
            trace.logAddStream(colIx, streamKind, offset, length, indexIx, false);
            RecordReaderUtils.addRgFilteredStreamToRanges(stream, rgs, isCompressed, indexes[colIx], encodings.get(colIx), types.get(colIx), bufferSize, hasNull[colIx], offset, length, listToRead, true);
        }
        offset += length;
    }
    boolean hasFileId = this.fileKey != null;
    if (listToRead.get() == null) {
        // TODO: there may be a bug here. Could there be partial RG filtering on index-only column?
        if (hasIndexOnlyCols && (rgs == null)) {
            OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
            ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, physicalFileIncludes.length);
            try {
                consumer.consumeData(ecb);
            } catch (InterruptedException e) {
                LOG.error("IO thread interrupted while queueing data");
                throw new IOException(e);
            }
        } else {
            LOG.warn("Nothing to read for stripe [" + stripe + "]");
        }
        return;
    }
    // 2. Now, read all of the ranges from cache or disk.
    IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
    MutateHelper toRead = getDataFromCacheAndDisk(listToRead.get(), stripeOffset, hasFileId, toRelease);
    // 3. For uncompressed case, we need some special processing before read.
    // Basically, we are trying to create artificial, consistent ranges to cache, as there are
    // no CBs in an uncompressed file. At the end of this processing, the list would contain
    // either cache buffers, or buffers allocated by us and not cached (if we are only reading
    // parts of the data for some ranges and don't want to cache it). Both are represented by
    // CacheChunks, so the list is just CacheChunk-s from that point on.
    DiskRangeList iter = preReadUncompressedStreams(stripeOffset, colCtxs, toRead, toRelease);
    // 4. Finally, decompress data, map per RG, and return to caller.
    // We go by RG and not by column because that is how data is processed.
    boolean hasError = true;
    try {
        int rgCount = (int) Math.ceil((double) stripe.getNumberOfRows() / rowIndexStride);
        for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
            if (rgs != null && !rgs[rgIx]) {
                // RG filtered.
                continue;
            }
            boolean isLastRg = rgIx == rgCount - 1;
            // Create the batch we will use to return data for this RG.
            OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
            trace.logStartRg(rgIx);
            boolean hasErrorForEcb = true;
            try {
                ecb.init(fileKey, stripeIx, rgIx, physicalFileIncludes.length);
                for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
                    ColumnReadContext ctx = colCtxs[colIx];
                    // This column is not included.
                    if (ctx == null)
                        continue;
                    if (isTracingEnabled) {
                        LOG.trace("ctx: {} rgIx: {} isLastRg: {} rgCount: {}", ctx, rgIx, isLastRg, rgCount);
                    }
                    OrcProto.RowIndexEntry index = ctx.rowIndex.getEntry(rgIx);
                    OrcProto.RowIndexEntry nextIndex = isLastRg ? null : ctx.rowIndex.getEntry(rgIx + 1);
                    ecb.initOrcColumn(ctx.colIx);
                    trace.logStartCol(ctx.colIx);
                    for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
                        StreamContext sctx = ctx.streams[streamIx];
                        ColumnStreamData cb = null;
                        try {
                            if (RecordReaderUtils.isDictionary(sctx.kind, ctx.encoding)) {
                                // This stream is for entire stripe and needed for every RG; uncompress once and reuse.
                                if (isTracingEnabled) {
                                    LOG.trace("Getting stripe-level stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length);
                                }
                                trace.logStartStripeStream(sctx.kind);
                                if (sctx.stripeLevelStream == null) {
                                    sctx.stripeLevelStream = POOLS.csdPool.take();
                                    // We will be using this for each RG while also sending RGs to processing.
                                    // To avoid buffers being unlocked, run refcount one ahead; so each RG
                                    // processing will decref once, and the last one will unlock the buffers.
                                    sctx.stripeLevelStream.incRef();
                                    // For stripe-level streams we don't need the extra refcount on the block.
                                    // See class comment about refcounts.
                                    long unlockUntilCOffset = sctx.offset + sctx.length;
                                    DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, sctx.offset + sctx.length, sctx.stripeLevelStream, unlockUntilCOffset, sctx.offset, toRelease);
                                    if (lastCached != null) {
                                        iter = lastCached;
                                    }
                                }
                                sctx.stripeLevelStream.incRef();
                                cb = sctx.stripeLevelStream;
                            } else {
                                // This stream can be separated by RG using index. Let's do that.
                                // Offset to where this RG begins.
                                long cOffset = sctx.offset + index.getPositions(sctx.streamIndexOffset);
                                // Offset relative to the beginning of the stream of where this RG ends.
                                long nextCOffsetRel = isLastRg ? sctx.length : nextIndex.getPositions(sctx.streamIndexOffset);
                                // Offset before which this RG is guaranteed to end. Can only be estimated.
                                // We estimate the same way for compressed and uncompressed for now.
                                long endCOffset = sctx.offset + RecordReaderUtils.estimateRgEndOffset(isCompressed, isLastRg, nextCOffsetRel, sctx.length, bufferSize);
                                // As we read, we can unlock initial refcounts for the buffers that end before
                                // the data that we need for this RG.
                                long unlockUntilCOffset = sctx.offset + nextCOffsetRel;
                                cb = createRgColumnStreamData(rgIx, isLastRg, ctx.colIx, sctx, cOffset, endCOffset, isCompressed, unlockUntilCOffset);
                                boolean isStartOfStream = sctx.bufferIter == null;
                                DiskRangeList lastCached = readEncodedStream(stripeOffset, (isStartOfStream ? iter : sctx.bufferIter), cOffset, endCOffset, cb, unlockUntilCOffset, sctx.offset, toRelease);
                                if (lastCached != null) {
                                    sctx.bufferIter = iter = lastCached;
                                }
                            }
                            ecb.setStreamData(ctx.colIx, sctx.kind.getNumber(), cb);
                        } catch (Exception ex) {
                            DiskRangeList drl = toRead == null ? null : toRead.next;
                            LOG.error("Error getting stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
                            throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
                        }
                    }
                }
                hasErrorForEcb = false;
            } finally {
                if (hasErrorForEcb) {
                    releaseEcbRefCountsOnError(ecb);
                }
            }
            try {
                consumer.consumeData(ecb);
            // After this, the non-initial refcounts are the responsibility of the consumer.
            } catch (InterruptedException e) {
                LOG.error("IO thread interrupted while queueing data");
                releaseEcbRefCountsOnError(ecb);
                throw new IOException(e);
            }
        }
        if (isTracingEnabled) {
            LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
        }
        trace.logRanges(fileKey, stripeOffset, toRead.next, RangesSrc.PREREAD);
        hasError = false;
    } finally {
        try {
            // Release the unreleased stripe-level buffers. See class comment about refcounts.
            for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
                ColumnReadContext ctx = colCtxs[colIx];
                // This column is not included.
                if (ctx == null)
                    continue;
                for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
                    StreamContext sctx = ctx.streams[streamIx];
                    if (sctx == null || sctx.stripeLevelStream == null)
                        continue;
                    if (0 != sctx.stripeLevelStream.decRef())
                        continue;
                    // decRef returned zero, so this was the last reference; essentially the "consumer" refcount being released here.
                    for (MemoryBuffer buf : sctx.stripeLevelStream.getCacheBuffers()) {
                        if (LOG.isTraceEnabled()) {
                            LOG.trace("Unlocking {} at the end of processing", buf);
                        }
                        cacheWrapper.releaseBuffer(buf);
                    }
                }
            }
            releaseInitialRefcounts(toRead.next);
            // Release buffers as we are done with all the streams... also see toRelease comment.
            releaseBuffers(toRelease.keySet(), true);
        } catch (Throwable t) {
            if (!hasError)
                throw new IOException(t);
            LOG.error("Error during the cleanup after another error; ignoring", t);
        }
    }
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), OrcProto (org.apache.orc.OrcProto), IdentityHashMap (java.util.IdentityHashMap), CreateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper), Stream (org.apache.orc.OrcProto.Stream), MutateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper), Kind (org.apache.orc.OrcProto.Stream.Kind), OutStream (org.apache.orc.impl.OutStream), InStream (org.apache.orc.impl.InStream), CodedInputStream (com.google.protobuf.CodedInputStream), InputStream (java.io.InputStream), ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData), IOException (java.io.IOException), ByteBuffer (java.nio.ByteBuffer), ColumnEncoding (org.apache.orc.OrcProto.ColumnEncoding), OrcEncodedColumnBatch (org.apache.hadoop.hive.ql.io.orc.encoded.Reader.OrcEncodedColumnBatch), MemoryBuffer (org.apache.hadoop.hive.common.io.encoded.MemoryBuffer)
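
The subtlest part of readEncodedColumns is the stripe-level stream refcounting: the dictionary stream is decompressed once, and its refcount is run one ahead so that per-RG decrefs never unlock the buffers early; the reader's own decRef in the finally block releases the last reference. A toy sketch of the pattern follows (RefCounted is invented for illustration and is not Hive's API):

// Toy model of the "run refcount one ahead" pattern for stripe-level streams.
final class RefCounted {
    private int refs;
    int incRef() { return ++refs; }
    int decRef() { return --refs; }
}

public class StripeStreamRefcountSketch {
    public static void main(String[] args) {
        RefCounted stripeLevelStream = new RefCounted();
        stripeLevelStream.incRef(); // the extra "one ahead" reference
        int rgCount = 3;
        for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
            stripeLevelStream.incRef(); // handed to the consumer for this RG
            // ... consumer processes the RG, then releases its reference ...
            stripeLevelStream.decRef(); // never reaches zero mid-stripe
        }
        // The reader drops its extra reference last (cf. the finally block above).
        if (stripeLevelStream.decRef() == 0) {
            System.out.println("last reference gone; unlock cache buffers");
        }
    }
}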

Example 24 with DiskRangeList

Use of org.apache.hadoop.hive.common.io.DiskRangeList in project hive by apache.

From class EncodedReaderImpl, method readLengthBytesFromSmallBuffers:

@VisibleForTesting
static BufferChunk readLengthBytesFromSmallBuffers(BufferChunk first, long cbStartOffset, int[] result, List<IncompleteCb> badEstimates, boolean isTracingEnabled, IoTrace trace) throws IOException {
    if (!first.hasContiguousNext()) {
        badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, first, 0, isTracingEnabled, trace));
        // This is impossible to read from this chunk.
        return null;
    }
    int ix = readLengthBytes(first.getChunk(), result, 0);
    // Otherwise we wouldn't be here.
    assert ix < 3;
    DiskRangeList current = first.next;
    first.removeSelf();
    while (true) {
        if (!(current instanceof BufferChunk)) {
            throw new IOException("Trying to extend compressed block into uncompressed block " + current);
        }
        BufferChunk currentBc = (BufferChunk) current;
        ix = readLengthBytes(currentBc.getChunk(), result, ix);
        // Done, we have 3 bytes. Continue reading this buffer.
        if (ix == 3)
            return currentBc;
        DiskRangeList tmp = current;
        current = current.hasContiguousNext() ? current.next : null;
        if (current != null) {
            if (isTracingEnabled) {
                LOG.trace("Removing partial CB " + tmp + " from ranges after copying its contents");
            }
            trace.logPartialCb(tmp);
            tmp.removeSelf();
        } else {
            badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, tmp, -1, isTracingEnabled, trace));
            // This is impossible to read from this chunk.
            return null;
        }
    }
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), IOException (java.io.IOException), BufferChunk (org.apache.orc.impl.BufferChunk), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
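
readLengthBytes itself is not part of this listing; the following is a plausible sketch of its contract inferred from the call sites above (an assumption, not the actual Hive implementation): it copies up to 3 header bytes into result starting at ix and returns how many header bytes have been accumulated so far.

import java.nio.ByteBuffer;

class ReadLengthBytesSketch {
    // Sketch inferred from the call sites: fill result[ix..2] from the buffer,
    // returning the count of header bytes accumulated so far (3 means done).
    static int readLengthBytes(ByteBuffer compressed, int[] result, int ix) {
        while (ix < 3 && compressed.hasRemaining()) {
            result[ix++] = compressed.get() & 0xff;
        }
        return ix;
    }
}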

Example 25 with DiskRangeList

Use of org.apache.hadoop.hive.common.io.DiskRangeList in project hive by apache.

From class EncodedReaderImpl, method addOneCompressionBuffer:

/**
 * Reads one compression block from the source; handles compression blocks read from
 * multiple ranges (usually, that would only happen with zcr).
 * Adds results to cacheBuffers, toDecompress, toRelease and toReleaseCopies (see below what each does).
 * @param current BufferChunk where the compression block starts.
 * @param cacheBuffers The result buffer array to add pre-allocated target cache buffers to.
 * @param toDecompress The list of work to decompress - pairs of compressed buffers and the
 *                     target buffers (same as the ones added to cacheBuffers).
 * @param toRelease The list of buffers to release to zcr because they are no longer in use.
 * @param toReleaseCopies The list of temporary copies allocated here and released at the end.
 * @param badEstimates The list of bad estimates that cannot be decompressed.
 * @return The resulting cache chunk.
 */
private ProcCacheChunk addOneCompressionBuffer(BufferChunk current, List<MemoryBuffer> cacheBuffers, List<ProcCacheChunk> toDecompress, IdentityHashMap<ByteBuffer, Boolean> toRelease, List<ByteBuffer> toReleaseCopies, List<IncompleteCb> badEstimates) throws IOException {
    ByteBuffer slice = null;
    ByteBuffer compressed = current.getChunk();
    long cbStartOffset = current.getOffset();
    int b0 = -1, b1 = -1, b2 = -1;
    // First, read the CB header. Due to ORC estimates, ZCR, etc. this can be complex.
    if (compressed.remaining() >= 3) {
        // The overwhelming majority of cases will go here. Read 3 bytes. Tada!
        b0 = compressed.get() & 0xff;
        b1 = compressed.get() & 0xff;
        b2 = compressed.get() & 0xff;
    } else {
        // Bad luck! Handle the corner cases where 3 bytes are in multiple blocks.
        int[] bytes = new int[3];
        current = readLengthBytesFromSmallBuffers(current, cbStartOffset, bytes, badEstimates, isTracingEnabled, trace);
        if (current == null)
            return null;
        compressed = current.getChunk();
        b0 = bytes[0];
        b1 = bytes[1];
        b2 = bytes[2];
    }
    int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);
    if (chunkLength > bufferSize) {
        throw new IllegalArgumentException("Buffer size too small. size = " + bufferSize + " needed = " + chunkLength);
    }
    int consumedLength = chunkLength + OutStream.HEADER_SIZE;
    long cbEndOffset = cbStartOffset + consumedLength;
    boolean isUncompressed = ((b0 & 0x01) == 1);
    if (isTracingEnabled) {
        LOG.trace("Found CB at " + cbStartOffset + ", chunk length " + chunkLength + ", total " + consumedLength + ", " + (isUncompressed ? "not " : "") + "compressed");
    }
    trace.logOrcCb(cbStartOffset, chunkLength, isUncompressed);
    if (compressed.remaining() >= chunkLength) {
        // Simple case - CB fits entirely in the disk range.
        slice = compressed.slice();
        slice.limit(chunkLength);
        return addOneCompressionBlockByteBuffer(slice, isUncompressed, cbStartOffset, cbEndOffset, chunkLength, current, toDecompress, cacheBuffers, false);
    }
    if (current.getEnd() < cbEndOffset && !current.hasContiguousNext()) {
        badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, current, 0, isTracingEnabled, trace));
        // This is impossible to read from this chunk.
        return null;
    }
    // TODO: we could remove extra copy for isUncompressed case by copying directly to cache.
    // We need to consolidate 2 or more buffers into one to decompress.
    ByteBuffer copy = allocateBuffer(chunkLength, compressed.isDirect());
    // We will always release copies at the end.
    toReleaseCopies.add(copy);
    int remaining = chunkLength - compressed.remaining();
    int originalPos = compressed.position();
    copy.put(compressed);
    if (isTracingEnabled) {
        LOG.trace("Removing partial CB " + current + " from ranges after copying its contents");
    }
    trace.logPartialCb(current);
    DiskRangeList next = current.next;
    current.removeSelf();
    if (originalPos == 0 && toRelease.remove(compressed)) {
        releaseBuffer(compressed, true);
    }
    int extraChunkCount = 0;
    while (true) {
        if (!(next instanceof BufferChunk)) {
            throw new IOException("Trying to extend compressed block into uncompressed block " + next);
        }
        compressed = next.getData();
        ++extraChunkCount;
        if (compressed.remaining() >= remaining) {
            // This is the last range for this compression block. Yay!
            slice = compressed.slice();
            slice.limit(remaining);
            copy.put(slice);
            ProcCacheChunk cc = addOneCompressionBlockByteBuffer(copy, isUncompressed, cbStartOffset, cbEndOffset, remaining, (BufferChunk) next, toDecompress, cacheBuffers, true);
            if (compressed.remaining() <= 0 && toRelease.remove(compressed)) {
                // We copied the entire buffer.
                releaseBuffer(compressed, true);
            }
            // else there's more data to process; will be handled in next call.
            return cc;
        }
        remaining -= compressed.remaining();
        // TODO: move into the if below; account for release call
        copy.put(compressed);
        if (toRelease.remove(compressed)) {
            // We copied the entire buffer.
            releaseBuffer(compressed, true);
        }
        DiskRangeList tmp = next;
        next = next.hasContiguousNext() ? next.next : null;
        if (next != null) {
            if (isTracingEnabled) {
                LOG.trace("Removing partial CB " + tmp + " from ranges after copying its contents");
            }
            trace.logPartialCb(tmp);
            tmp.removeSelf();
        } else {
            badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, tmp, extraChunkCount, isTracingEnabled, trace));
            // This is impossible to read from this chunk.
            return null;
        }
    }
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), IOException (java.io.IOException), BufferChunk (org.apache.orc.impl.BufferChunk), ByteBuffer (java.nio.ByteBuffer)
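
The 3-byte header decoded above packs (chunkLength << 1) | isUncompressed in little-endian byte order, so (b2 << 15) | (b1 << 7) | (b0 >> 1) equals ((b2 << 16) | (b1 << 8) | b0) >> 1. A self-contained check with made-up header bytes:

public class OrcCbHeaderSketch {
    public static void main(String[] args) {
        // Header for a 100,000-byte compressed chunk:
        // value = (100000 << 1) | 0 = 200000 = 0x030D40, stored little-endian.
        int b0 = 0x40, b1 = 0x0D, b2 = 0x03;
        int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);
        boolean isUncompressed = (b0 & 0x01) == 1;
        System.out.println(chunkLength + ", uncompressed: " + isUncompressed);
        // Prints: 100000, uncompressed: false
    }
}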

Aggregations

DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList): 29 uses
IOException (java.io.IOException): 11 uses
BufferChunk (org.apache.orc.impl.BufferChunk): 11 uses
MemoryBuffer (org.apache.hadoop.hive.common.io.encoded.MemoryBuffer): 10 uses
ByteBuffer (java.nio.ByteBuffer): 9 uses
DiskRange (org.apache.hadoop.hive.common.io.DiskRange): 6 uses
MutateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper): 6 uses
CreateHelper (org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper): 5 uses
OrcProto (org.apache.orc.OrcProto): 5 uses
ColumnStreamData (org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData): 3 uses
Stream (org.apache.orc.OrcProto.Stream): 3 uses
OutStream (org.apache.orc.impl.OutStream): 3 uses
CodedInputStream (com.google.protobuf.CodedInputStream): 2 uses
InputStream (java.io.InputStream): 2 uses
ArrayList (java.util.ArrayList): 2 uses
IdentityHashMap (java.util.IdentityHashMap): 2 uses
Map (java.util.Map): 2 uses
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 2 uses
ConcurrentSkipListMap (java.util.concurrent.ConcurrentSkipListMap): 2 uses
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 2 uses