Example 1 with BufferChunk

Use of org.apache.orc.impl.BufferChunk in project hive by apache.

Class StreamUtils, method createDiskRangeInfo.

/**
   * Converts stream buffers to disk ranges.
   * @param streamBuffer - stream buffer
   * @return - disk range info containing the stream's buffers as disk ranges
   */
public static DiskRangeInfo createDiskRangeInfo(ColumnStreamData streamBuffer) {
    DiskRangeInfo diskRangeInfo = new DiskRangeInfo(streamBuffer.getIndexBaseOffset());
    // See ctor comment.
    long offset = streamBuffer.getIndexBaseOffset();
    // TODO: we should get rid of this
    for (MemoryBuffer memoryBuffer : streamBuffer.getCacheBuffers()) {
        ByteBuffer buffer = memoryBuffer.getByteBufferDup();
        diskRangeInfo.addDiskRange(new BufferChunk(buffer, offset));
        offset += buffer.remaining();
    }
    return diskRangeInfo;
}
Also used: MemoryBuffer (org.apache.hadoop.hive.common.io.encoded.MemoryBuffer), DiskRangeInfo (org.apache.hadoop.hive.common.DiskRangeInfo), BufferChunk (org.apache.orc.impl.BufferChunk), ByteBuffer (java.nio.ByteBuffer)
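
For orientation, the only bookkeeping in this method is the running offset: each BufferChunk starts where the previous one ended, and each range's length is its buffer's remaining() bytes. Below is a minimal, self-contained sketch of that offset arithmetic using plain java.nio.ByteBuffer; the Hive/ORC types (ColumnStreamData, DiskRangeInfo, BufferChunk) are deliberately left out, and the starting offset and buffer sizes are arbitrary example values.

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.List;

public class OffsetAccumulationSketch {
    public static void main(String[] args) {
        // Stand-ins for streamBuffer.getCacheBuffers(); lengths chosen arbitrarily.
        List<ByteBuffer> cacheBuffers = Arrays.asList(
            ByteBuffer.wrap(new byte[32]), ByteBuffer.wrap(new byte[64]));
        // Stand-in for streamBuffer.getIndexBaseOffset().
        long offset = 100;
        for (ByteBuffer buffer : cacheBuffers) {
            // Each range starts where the previous one ended; remaining() is its length.
            System.out.println("range [" + offset + ", " + (offset + buffer.remaining()) + ")");
            offset += buffer.remaining();
        }
    }
}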

Example 2 with BufferChunk

Use of org.apache.orc.impl.BufferChunk in project hive by apache.

Class EncodedReaderImpl, method addOneCompressionBuffer.

/**
   * Reads one compression block from the source; handles compression blocks read from
   * multiple ranges (usually, that would only happen with zcr).
   * Adds results to cacheBuffers, toDecompress and toRelease (see the parameter descriptions below).
   * @param current BufferChunk where compression block starts.
   * @param cacheBuffers The result buffer list to which the pre-allocated target cache buffer is added.
   * @param toDecompress The list of work to decompress - pairs of compressed buffers and the
   *                     target buffers (same as the ones added to cacheBuffers).
   * @param toRelease The list of buffers to release to zcr because they are no longer in use.
   * @param badEstimates The list of bad estimates that cannot be decompressed.
   * @return The resulting cache chunk.
   */
private ProcCacheChunk addOneCompressionBuffer(BufferChunk current, List<MemoryBuffer> cacheBuffers, List<ProcCacheChunk> toDecompress, List<ByteBuffer> toRelease, List<IncompleteCb> badEstimates) throws IOException {
    ByteBuffer slice = null;
    ByteBuffer compressed = current.getChunk();
    long cbStartOffset = current.getOffset();
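    // ORC compression chunk header: 3 little-endian bytes; the low bit of the first byte
    // marks an "original" (uncompressed) chunk, the remaining 23 bits are the chunk length.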
    int b0 = compressed.get() & 0xff;
    int b1 = compressed.get() & 0xff;
    int b2 = compressed.get() & 0xff;
    int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);
    if (chunkLength > bufferSize) {
        throw new IllegalArgumentException("Buffer size too small. size = " + bufferSize + " needed = " + chunkLength);
    }
    int consumedLength = chunkLength + OutStream.HEADER_SIZE;
    long cbEndOffset = cbStartOffset + consumedLength;
    boolean isUncompressed = ((b0 & 0x01) == 1);
    if (isTracingEnabled) {
        LOG.trace("Found CB at " + cbStartOffset + ", chunk length " + chunkLength + ", total " + consumedLength + ", " + (isUncompressed ? "not " : "") + "compressed");
    }
    if (compressed.remaining() >= chunkLength) {
        // Simple case - CB fits entirely in the disk range.
        slice = compressed.slice();
        slice.limit(chunkLength);
        ProcCacheChunk cc = addOneCompressionBlockByteBuffer(slice, isUncompressed, cbStartOffset, cbEndOffset, chunkLength, current, toDecompress, cacheBuffers);
        if (compressed.remaining() <= 0 && dataReader.isTrackingDiskRanges()) {
            toRelease.add(compressed);
        }
        return cc;
    }
    if (current.getEnd() < cbEndOffset && !current.hasContiguousNext()) {
        badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, current, 0));
        // This CB cannot be read from this chunk.
        return null;
    }
    // TODO: we could remove extra copy for isUncompressed case by copying directly to cache.
    // We need to consolidate 2 or more buffers into one to decompress.
    ByteBuffer copy = allocateBuffer(chunkLength, compressed.isDirect());
    int remaining = chunkLength - compressed.remaining();
    int originalPos = compressed.position();
    copy.put(compressed);
    if (isTracingEnabled) {
        LOG.trace("Removing partial CB " + current + " from ranges after copying its contents");
    }
    DiskRangeList next = current.next;
    current.removeSelf();
    if (dataReader.isTrackingDiskRanges()) {
        if (originalPos == 0) {
            // We copied the entire buffer.
            dataReader.releaseBuffer(compressed);
        } else {
            // There might be slices depending on this buffer.
            toRelease.add(compressed);
        }
    }
    int extraChunkCount = 0;
    while (true) {
        if (!(next instanceof BufferChunk)) {
            throw new IOException("Trying to extend compressed block into uncompressed block " + next);
        }
        compressed = next.getData();
        ++extraChunkCount;
        if (compressed.remaining() >= remaining) {
            // This is the last range for this compression block. Yay!
            slice = compressed.slice();
            slice.limit(remaining);
            copy.put(slice);
            ProcCacheChunk cc = addOneCompressionBlockByteBuffer(copy, isUncompressed, cbStartOffset, cbEndOffset, remaining, (BufferChunk) next, toDecompress, cacheBuffers);
            if (compressed.remaining() <= 0 && dataReader.isTrackingDiskRanges()) {
                // We copied the entire buffer.
                dataReader.releaseBuffer(compressed);
            }
            return cc;
        }
        remaining -= compressed.remaining();
        copy.put(compressed);
        if (dataReader.isTrackingDiskRanges()) {
            // We copied the entire buffer.
            dataReader.releaseBuffer(compressed);
        }
        DiskRangeList tmp = next;
        next = next.hasContiguousNext() ? next.next : null;
        if (next != null) {
            if (isTracingEnabled) {
                LOG.trace("Removing partial CB " + tmp + " from ranges after copying its contents");
            }
            tmp.removeSelf();
        } else {
            badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, tmp, extraChunkCount));
            // This CB cannot be read from this chunk.
            return null;
        }
    }
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), IOException (java.io.IOException), BufferChunk (org.apache.orc.impl.BufferChunk), ByteBuffer (java.nio.ByteBuffer)
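
The header decode at the top of the method is easy to miss among the buffer handling. Below is a minimal, self-contained sketch of the same 3-byte chunk-header arithmetic; decodeHeader is a hypothetical helper written for illustration, not part of the ORC API, and the 100,000-byte length is an arbitrary example.

import java.nio.ByteBuffer;

public class ChunkHeaderSketch {
    // Decode the 3-byte ORC compression chunk header, mirroring addOneCompressionBuffer.
    static void decodeHeader(ByteBuffer buf) {
        int b0 = buf.get() & 0xff;
        int b1 = buf.get() & 0xff;
        int b2 = buf.get() & 0xff;
        boolean isUncompressed = (b0 & 0x01) == 1;             // low bit: "original" chunk
        int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);  // remaining 23 bits, little-endian
        System.out.println("chunkLength=" + chunkLength + ", uncompressed=" + isUncompressed);
    }

    public static void main(String[] args) {
        // Build a header for an uncompressed chunk of 100,000 bytes and decode it back.
        int header = (100_000 << 1) | 1;
        ByteBuffer buf = ByteBuffer.wrap(new byte[] {
            (byte) header, (byte) (header >> 8), (byte) (header >> 16) });
        decodeHeader(buf);
    }
}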

Example 3 with BufferChunk

Use of org.apache.orc.impl.BufferChunk in project hive by apache.

Class EncodedReaderImpl, method prepareRangesForCompressedRead.

private CacheChunk prepareRangesForCompressedRead(long cOffset, long endCOffset, long streamOffset, long unlockUntilCOffset, DiskRangeList current, ColumnStreamData columnStreamData, List<ByteBuffer> toRelease, List<ProcCacheChunk> toDecompress, List<IncompleteCb> badEstimates) throws IOException {
    if (cOffset > current.getOffset()) {
        // Target compression block is in the middle of the range; slice the range in two.
        current = current.split(cOffset).next;
    }
    long currentOffset = cOffset;
    CacheChunk lastUncompressed = null;
    while (true) {
        DiskRangeList next = null;
        if (current instanceof CacheChunk) {
            // 2a. This is a decoded compression buffer, add as is.
            CacheChunk cc = (CacheChunk) current;
            if (isTracingEnabled) {
                LOG.trace("Locking " + cc.getBuffer() + " due to reuse");
            }
            cacheWrapper.reuseBuffer(cc.getBuffer());
            columnStreamData.getCacheBuffers().add(cc.getBuffer());
            currentOffset = cc.getEnd();
            if (isTracingEnabled) {
                LOG.trace("Adding an already-uncompressed buffer " + cc.getBuffer());
            }
            ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, cc);
            lastUncompressed = cc;
            next = current.next;
            if (next != null && (endCOffset >= 0 && currentOffset < endCOffset) && next.getOffset() >= endCOffset) {
                throw new IOException("Expected data at " + currentOffset + " (reading until " + endCOffset + "), but the next buffer starts at " + next.getOffset());
            }
        } else if (current instanceof IncompleteCb) {
            // 2b. This is a known incomplete CB caused by ORC CB end boundaries being estimates.
            if (isTracingEnabled) {
                LOG.trace("Cannot read " + current);
            }
            next = null;
            currentOffset = -1;
        } else {
            // 2c. This is a compressed buffer; it may span several disk ranges, so we might need to combine them.
            if (!(current instanceof BufferChunk)) {
                String msg = "Found an unexpected " + current.getClass().getSimpleName() + ": " + current + " while looking at " + currentOffset;
                LOG.error(msg);
                throw new RuntimeException(msg);
            }
            BufferChunk bc = (BufferChunk) current;
            ProcCacheChunk newCached = addOneCompressionBuffer(bc, columnStreamData.getCacheBuffers(), toDecompress, toRelease, badEstimates);
            lastUncompressed = (newCached == null) ? lastUncompressed : newCached;
            next = (newCached != null) ? newCached.next : null;
            currentOffset = (next != null) ? next.getOffset() : -1;
        }
        if (next == null || (endCOffset >= 0 && currentOffset >= endCOffset)) {
            break;
        }
        current = next;
    }
    return lastUncompressed;
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), IOException (java.io.IOException), BufferChunk (org.apache.orc.impl.BufferChunk)

Example 4 with BufferChunk

Use of org.apache.orc.impl.BufferChunk in project hive by apache.

Class EncodedReaderImpl, method copyAndReplaceUncompressedChunks.

private static void copyAndReplaceUncompressedChunks(UncompressedCacheChunk candidateCached, ByteBuffer dest, CacheChunk tcc) {
    int startPos = dest.position(), startLim = dest.limit();
    DiskRangeList next = null;
    for (int i = 0; i < candidateCached.getCount(); ++i) {
        BufferChunk chunk = (i == 0) ? candidateCached.getChunk() : (BufferChunk) next;
        dest.put(chunk.getData());
        next = chunk.next;
        if (i == 0) {
            chunk.replaceSelfWith(tcc);
        } else {
            chunk.removeSelf();
        }
    }
    int newPos = dest.position();
    if (newPos > startLim) {
        throw new AssertionError("After copying, buffer [" + startPos + ", " + startLim + ") became [" + newPos + ", " + dest.limit() + ")");
    }
    dest.position(startPos);
    dest.limit(newPos);
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), BufferChunk (org.apache.orc.impl.BufferChunk)
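
The only subtle part here is the position/limit bookkeeping: after the copies, dest is rewound so that its [position, limit) window frames exactly the bytes that were just written. A minimal, self-contained sketch of that pattern with plain ByteBuffers (the Hive chunk types are omitted):

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

public class CopyWindowSketch {
    public static void main(String[] args) {
        ByteBuffer dest = ByteBuffer.allocate(64);
        ByteBuffer[] sources = {
            ByteBuffer.wrap("hello ".getBytes(StandardCharsets.UTF_8)),
            ByteBuffer.wrap("world".getBytes(StandardCharsets.UTF_8)) };

        int startPos = dest.position();
        for (ByteBuffer src : sources) {
            dest.put(src); // advances dest.position() by src.remaining()
        }
        int newPos = dest.position();

        // Frame exactly the copied region, as copyAndReplaceUncompressedChunks does.
        dest.position(startPos);
        dest.limit(newPos);
        System.out.println("window holds " + dest.remaining() + " bytes"); // prints 11
    }
}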

Example 5 with BufferChunk

Use of org.apache.orc.impl.BufferChunk in project hive by apache.

Class EncodedReaderImpl, method preReadUncompressedStream.

/**
   * To achieve some sort of consistent cache boundaries, we will cache streams deterministically,
   * in segments starting at the stream start and extending for either the stream size or some
   * fixed size. If we are not reading the entire segment's worth of data, then we will not cache
   * the partial RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too
   * much PITA to handle just for this case.
   * We could avoid the copy in the non-zcr case and manage a buffer that was not allocated by our
   * allocator. The uncompressed case is not mainline, though, so let's not complicate it.
   */
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start, long streamOffset, long streamEnd) throws IOException {
    if (streamOffset == streamEnd)
        return null;
    List<UncompressedCacheChunk> toCache = null;
    List<ByteBuffer> toRelease = null;
    // 1. Find our bearings in the stream.
    DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
    if (isTracingEnabled) {
        LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
    }
    if (streamOffset > current.getOffset()) {
        // Target compression block is in the middle of the range; slice the range in two.
        current = current.split(streamOffset).next;
    }
    // Account for maximum cache buffer size.
    long streamLen = streamEnd - streamOffset;
    int partSize = determineUncompressedPartSize(), partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
    CacheChunk lastUncompressed = null;
    MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
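    // 2. Go through the parts and the disk ranges covering them: reuse data that is already
    //    cached, collect candidates to cache, and copy everything else to non-cached buffers.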
    for (int i = 0; i < partCount; ++i) {
        long partOffset = streamOffset + (i * partSize), partEnd = Math.min(partOffset + partSize, streamEnd);
        // We have 0 bytes of data for this part, for now.
        long hasEntirePartTo = partOffset;
        if (current == null) {
            // We have no data from this point on (could be unneeded), skip.
            break;
        }
        assert partOffset <= current.getOffset();
        if (partOffset == current.getOffset() && current instanceof CacheChunk) {
            // We assume cache chunks would always match the way we read, so check and skip it.
            assert current.getOffset() == partOffset && current.getEnd() == partEnd;
            lastUncompressed = (CacheChunk) current;
            current = current.next;
            continue;
        }
        if (current.getOffset() >= partEnd) {
            // We have no data at all for this part of the stream (could be unneeded), skip.
            continue;
        }
        if (toRelease == null && dataReader.isTrackingDiskRanges()) {
            toRelease = new ArrayList<ByteBuffer>();
        }
        // We have some disk buffers... see if we have entire part, etc.
        // We will cache if we have the entire part.
        UncompressedCacheChunk candidateCached = null;
        DiskRangeList next = current;
        while (true) {
            boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
            if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
                // We are missing a section at the end of the part... copy the start to non-cached.
                lastUncompressed = copyAndReplaceCandidateToNonCached(candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
                candidateCached = null;
            }
            current = next;
            // Done with this part.
            if (noMoreDataForPart)
                break;
            boolean wasSplit = false;
            if (current.getEnd() > partEnd) {
                // If the current buffer contains multiple parts, split it.
                current = current.split(partEnd);
                wasSplit = true;
            }
            if (isTracingEnabled) {
                LOG.trace("Processing uncompressed file data at [" + current.getOffset() + ", " + current.getEnd() + ")");
            }
            BufferChunk curBc = (BufferChunk) current;
            if (!wasSplit && toRelease != null) {
                // TODO: is it valid to give zcr the modified 2nd part?
                toRelease.add(curBc.getChunk());
            }
            // Track if we still have the entire part.
            long hadEntirePartTo = hasEntirePartTo;
            // We have data until the end of current block if we had it until the beginning.
            hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
            if (hasEntirePartTo == -1) {
                // There is a gap before this chunk; we could still cache the data with gaps, but it's probably not needed.
                if (candidateCached != null) {
                    assert hadEntirePartTo != -1;
                    copyAndReplaceCandidateToNonCached(candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
                    candidateCached = null;
                }
                lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
                // There may be more data after the gap.
                next = lastUncompressed.next;
            } else {
                // So far we have all the data from the beginning of the part.
                if (candidateCached == null) {
                    candidateCached = new UncompressedCacheChunk(curBc);
                } else {
                    candidateCached.addChunk(curBc);
                }
                next = current.next;
            }
        }
        if (candidateCached != null) {
            if (toCache == null) {
                toCache = new ArrayList<>(partCount - i);
            }
            toCache.add(candidateCached);
        }
    }
    // Nothing to copy and cache.
    if (toCache == null)
        return lastUncompressed;
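    // 3. Allocate the target cache buffers and prepare the cache keys.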
    MemoryBuffer[] targetBuffers = toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
    targetBuffers[0] = null;
    DiskRange[] cacheKeys = new DiskRange[toCache.size()];
    int ix = 0;
    for (UncompressedCacheChunk chunk : toCache) {
        // Relies on the fact that cache does not actually store these.
        cacheKeys[ix] = chunk;
        ++ix;
    }
    cacheWrapper.getAllocator().allocateMultiple(targetBuffers, (int) (partCount == 1 ? streamLen : partSize));
    // 4. Now copy the data into cache buffers.
    ix = 0;
    for (UncompressedCacheChunk candidateCached : toCache) {
        candidateCached.setBuffer(targetBuffers[ix]);
        ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
        copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached);
        candidateCached.clear();
        lastUncompressed = candidateCached;
        ++ix;
    }
    // 5. Release original compressed buffers to zero-copy reader if needed.
    if (toRelease != null) {
        assert dataReader.isTrackingDiskRanges();
        for (ByteBuffer buf : toRelease) {
            dataReader.releaseBuffer(buf);
        }
    }
    // 6. Finally, put uncompressed data to cache.
    if (fileKey != null) {
        long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
        processCacheCollisions(collisionMask, toCache, targetBuffers, null);
    }
    return lastUncompressed;
}
Also used: DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList), BufferChunk (org.apache.orc.impl.BufferChunk), ByteBuffer (java.nio.ByteBuffer), MemoryBuffer (org.apache.hadoop.hive.common.io.encoded.MemoryBuffer), DiskRange (org.apache.hadoop.hive.common.io.DiskRange)
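
The part-boundary arithmetic near the top of the method determines what can be cached. Below is a minimal, self-contained sketch of just that segmentation; PART_SIZE stands in for determineUncompressedPartSize() (not shown in this example, so 4 MB is an assumed value), and the stream offsets are arbitrary illustrative numbers.

public class StreamPartitionSketch {
    // Stand-in for determineUncompressedPartSize(); 4 MB is an assumed example value.
    static final int PART_SIZE = 4 * 1024 * 1024;

    public static void main(String[] args) {
        long streamOffset = 1_000, streamEnd = 10_500_000;
        long streamLen = streamEnd - streamOffset;
        int partCount = (int) (streamLen / PART_SIZE) + (((streamLen % PART_SIZE) != 0) ? 1 : 0);

        for (int i = 0; i < partCount; ++i) {
            // Same boundary math as preReadUncompressedStream, with an explicit long cast.
            long partOffset = streamOffset + ((long) i * PART_SIZE);
            long partEnd = Math.min(partOffset + PART_SIZE, streamEnd);
            System.out.println("part " + i + ": [" + partOffset + ", " + partEnd + ")");
        }
    }
}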

Aggregations

BufferChunk (org.apache.orc.impl.BufferChunk): 5 usages
DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList): 4 usages
ByteBuffer (java.nio.ByteBuffer): 3 usages
IOException (java.io.IOException): 2 usages
MemoryBuffer (org.apache.hadoop.hive.common.io.encoded.MemoryBuffer): 2 usages
DiskRangeInfo (org.apache.hadoop.hive.common.DiskRangeInfo): 1 usage
DiskRange (org.apache.hadoop.hive.common.io.DiskRange): 1 usage