Search in sources :

Example 11 with MemoryBuffer

Use of MemoryBuffer in project hive by apache.

the class OrcEncodedDataReader method returnData.

/**
 * Walks every column index of the batch (0 .. getTotalColCount() - 1), fetches the column's
 * stream data array and calls decRef() on each non-null entry. When the locking logger has
 * trace enabled, each cache buffer of the entry is logged as being unlocked.
 *
 * NOTE(review): this listing is a truncated excerpt. The statements guarded by the two
 * brace-less ifs, everything after the trace loop, and all closing braces are missing;
 * confirm the real control flow against the full OrcEncodedDataReader source.
 *
 * @param ecb the encoded column batch whose column stream data is being returned
 */
public void returnData(OrcEncodedColumnBatch ecb) {
    for (int colIx = 0; colIx < ecb.getTotalColCount(); ++colIx) {
        // NOTE(review): guarded statement missing from the excerpt (presumably skips columns without data) — confirm.
        if (!ecb.hasData(colIx))
        ColumnStreamData[] datas = ecb.getColumnData(colIx);
        for (ColumnStreamData data : datas) {
            // decRef() is only invoked for non-null entries (short-circuit ||).
            // NOTE(review): guarded statement missing here too (presumably skips entries still referenced) — confirm.
            if (data == null || data.decRef() != 0)
            if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
                for (MemoryBuffer buf : data.getCacheBuffers()) {
                    LlapIoImpl.LOCKING_LOGGER.trace("Unlocking {} at the end of processing", buf);
    // We can offer ECB even with some streams not discarded; reset() will clear the arrays.
Also used : MemoryBuffer, ColumnStreamData

Example 12 with MemoryBuffer

Use of MemoryBuffer in project hive by apache.

the class TestLowLevelCacheImpl method verifyCacheGet.

/**
 * Test helper that drives a cache lookup from a mixed varargs list and checks the result.
 * Leading Integer arguments are consumed in pairs and added to a CreateHelper range list via
 * addOrMerge; the first non-Integer argument triggers cache.getFileData() on that list.
 * Each non-Integer argument is then compared against the current node of the returned list:
 * a MemoryBuffer argument must be the buffer of a CacheChunk node.
 *
 * NOTE(review): this listing is a truncated excerpt. Closing braces, the body of the final
 * else branch and the right-hand side of {@code iter =;} are missing; confirm against the
 * full TestLowLevelCacheImpl source.
 *
 * @param cache  the cache under test
 * @param fileId file key passed to getFileData()
 * @param stuff  Integer pairs describing the requested ranges, followed by the expected results
 */
private void verifyCacheGet(LowLevelCacheImpl cache, long fileId, Object... stuff) {
    CreateHelper list = new CreateHelper();
    DiskRangeList iter = null;
    // intCount: 0 = waiting for the first int of a pair, 1 = first int stored in lastInt,
    // -1 = the integer section is over and the cache has been queried.
    int intCount = 0, lastInt = -1;
    int resultCount = stuff.length;
    for (Object obj : stuff) {
        if (obj instanceof Integer) {
            // Integers are only allowed before the first non-Integer argument.
            assertTrue(intCount >= 0);
            if (intCount == 0) {
                lastInt = (Integer) obj;
                intCount = 1;
            } else {
                list.addOrMerge(lastInt, (Integer) obj, true, true);
                intCount = 0;
        } else if (intCount >= 0) {
            // First non-Integer argument: the integers must have formed complete pairs.
            assertTrue(intCount == 0);
            intCount = -1;
            iter = cache.getFileData(fileId, list.get(), 0, testFactory, null, null);
            // NOTE(review): resultCount is never adjusted in the visible excerpt; an adjustment
            // for the consumed integers may have been dropped — confirm.
            assertEquals(resultCount, iter.listSize());
        assertTrue(iter != null);
        if (obj instanceof MemoryBuffer) {
            assertTrue(iter instanceof CacheChunk);
            assertSame(obj, ((CacheChunk) iter).getBuffer());
        } else {
        // NOTE(review): right-hand side lost in extraction (presumably advances to the next node) — confirm.
        iter =;
Also used : CreateHelper, AtomicInteger (java.util.concurrent.atomic.AtomicInteger), MemoryBuffer, DiskRangeList, CacheChunk

Example 13 with MemoryBuffer

Use of MemoryBuffer in project hive by apache.

the class EncodedReaderImpl method preReadUncompressedStream.

  /**
   * To achieve some sort of consistent cache boundaries, we will cache streams deterministically;
   * in segments starting w/stream start, and going for either stream size or some fixed size.
   * If we are not reading the entire segment's worth of data, then we will not cache the partial
   * RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much PITA
   * to handle just for this case.
   * We could avoid copy in non-zcr case and manage the buffer that was not allocated by our
   * allocator. Uncompressed case is not mainline though so let's not complicate it.
   */
// NOTE(review): this listing is a truncated excerpt. Closing braces, the bodies of several
// brace-less/empty ifs, loop-exit statements, the ix increments and the right-hand sides of
// `current =;` / `next =;` are missing; consult the full EncodedReaderImpl source before
// relying on the control flow shown here.
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start, long streamOffset, long streamEnd) throws IOException {
    // Empty stream: nothing to pre-read.
    if (streamOffset == streamEnd)
        return null;
    List<UncompressedCacheChunk> toCache = null;
    List<ByteBuffer> toRelease = null;
    // 1. Find our bearings in the stream.
    DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
    if (isTracingEnabled) {
        LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
    if (streamOffset > current.getOffset()) {
        // Target compression block is in the middle of the range; slice the range in two.
        current = current.split(streamOffset).next;
    // Account for maximum cache buffer size.
    long streamLen = streamEnd - streamOffset;
    // partCount is streamLen / partSize rounded up.
    int partSize = determineUncompressedPartSize(), partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
    CacheChunk lastUncompressed = null;
    // One-element array handed to the copy helpers and reused below when only one chunk is cached.
    MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
    for (int i = 0; i < partCount; ++i) {
        // Each part covers [partOffset, partEnd); the last part is clipped to streamEnd.
        // NOTE(review): (i * partSize) is evaluated in int arithmetic before widening to long — confirm it cannot overflow.
        long partOffset = streamOffset + (i * partSize), partEnd = Math.min(partOffset + partSize, streamEnd);
        // We have 0 bytes of data for this part, for now.
        long hasEntirePartTo = partOffset;
        if (current == null) {
            // We have no data from this point on (could be unneeded), skip.
        assert partOffset <= current.getOffset();
        if (partOffset == current.getOffset() && current instanceof CacheChunk) {
            // We assume cache chunks would always match the way we read, so check and skip it.
            assert current.getOffset() == partOffset && current.getEnd() == partEnd;
            lastUncompressed = (CacheChunk) current;
            current =;
        if (current.getOffset() >= partEnd) {
            // We have no data at all for this part of the stream (could be unneeded), skip.
        // The release list is only created when the data reader tracks disk ranges.
        if (toRelease == null && dataReader.isTrackingDiskRanges()) {
            toRelease = new ArrayList<ByteBuffer>();
        // We have some disk buffers... see if we have entire part, etc.
        // We will cache if we have the entire part.
        UncompressedCacheChunk candidateCached = null;
        DiskRangeList next = current;
        while (true) {
            boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
            if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
                // We are missing a section at the end of the part... copy the start to non-cached.
                lastUncompressed = copyAndReplaceCandidateToNonCached(candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
                candidateCached = null;
            current = next;
            // Done with this part.
            if (noMoreDataForPart)
            boolean wasSplit = false;
            if (current.getEnd() > partEnd) {
                // If the current buffer contains multiple parts, split it.
                current = current.split(partEnd);
                wasSplit = true;
            if (isTracingEnabled) {
                LOG.trace("Processing uncompressed file data at [" + current.getOffset() + ", " + current.getEnd() + ")");
            BufferChunk curBc = (BufferChunk) current;
            if (!wasSplit && toRelease != null) {
                // TODO: is it valid to give zcr the modified 2nd part?
            // Track if we still have the entire part.
            long hadEntirePartTo = hasEntirePartTo;
            // We have data until the end of current block if we had it until the beginning.
            // -1 marks that a gap was found, i.e. the part is no longer contiguous from its start.
            hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
            if (hasEntirePartTo == -1) {
                // with gaps, but it's probably not needed.
                if (candidateCached != null) {
                    assert hadEntirePartTo != -1;
                    copyAndReplaceCandidateToNonCached(candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
                    candidateCached = null;
                lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
                // There may be more data after the gap.
                next =;
            } else {
                // So far we have all the data from the beginning of the part.
                if (candidateCached == null) {
                    candidateCached = new UncompressedCacheChunk(curBc);
                } else {
                next =;
        if (candidateCached != null) {
            if (toCache == null) {
                // Sized for the parts that remain, including the current one.
                toCache = new ArrayList<>(partCount - i);
    // Nothing to copy and cache.
    if (toCache == null)
        return lastUncompressed;
    // Reuse the one-element array when exactly one chunk is to be cached; slot 0 is cleared
    // because singleAlloc may have been used by the copy helpers above.
    MemoryBuffer[] targetBuffers = toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
    targetBuffers[0] = null;
    DiskRange[] cacheKeys = new DiskRange[toCache.size()];
    int ix = 0;
    for (UncompressedCacheChunk chunk : toCache) {
        // Relies on the fact that cache does not actually store these.
        // NOTE(review): ix is never advanced in the visible excerpt (increment presumably elided) — confirm.
        cacheKeys[ix] = chunk;
    // A single-part stream is allocated at its exact length; otherwise every buffer is partSize.
    cacheWrapper.getAllocator().allocateMultiple(targetBuffers, (int) (partCount == 1 ? streamLen : partSize));
    // 4. Now copy the data into cache buffers.
    ix = 0;
    for (UncompressedCacheChunk candidateCached : toCache) {
        ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
        copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached);
        lastUncompressed = candidateCached;
    // 5. Release original compressed buffers to zero-copy reader if needed.
    if (toRelease != null) {
        assert dataReader.isTrackingDiskRanges();
        for (ByteBuffer buf : toRelease) {
    // 6. Finally, put uncompressed data to cache.
    if (fileKey != null) {
        long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
        processCacheCollisions(collisionMask, toCache, targetBuffers, null);
    return lastUncompressed;
Also used : DiskRangeList, BufferChunk (org.apache.orc.impl.BufferChunk), ByteBuffer (java.nio.ByteBuffer), MemoryBuffer, DiskRange

Example 14 with MemoryBuffer

Use of MemoryBuffer in project hive by apache.

the class EncodedReaderImpl method readEncodedStream.

  /**
   * Uncompresses part of the stream. RGs can overlap, so we cannot just go and decompress
   * and remove what we have returned. We will keep iterator as a "hint" point.
   * @param baseOffset Absolute offset of boundaries and ranges relative to file, for cache keys.
   * @param start Ordered ranges containing file data. Helpful if they point close to cOffset.
   * @param cOffset Start offset to decompress.
   * @param endCOffset End offset to decompress; estimate, partial CBs will be ignored.
   * @param csd Stream data, to add the results.
   * @param unlockUntilCOffset The offset until which the buffers can be unlocked in cache, as
   *                           they will not be used in future calls (see the class comment in
   *                           EncodedReaderImpl about refcounts).
   * @return Last buffer cached during decompression. Cache buffers are never removed from
   *         the master list, so they are safe to keep as iterators for various streams.
   */
// NOTE(review): this listing is a truncated excerpt. Closing braces, the body of the first
// else branch, the tail of the LOG.error(...) call, the ix increment and the bodies of some
// loops are missing; consult the full EncodedReaderImpl source before relying on the control
// flow shown here.
public DiskRangeList readEncodedStream(long baseOffset, DiskRangeList start, long cOffset, long endCOffset, ColumnStreamData csd, long unlockUntilCOffset, long streamOffset) throws IOException {
    // Make sure the stream data has a buffer list to collect results into.
    if (csd.getCacheBuffers() == null) {
        csd.setCacheBuffers(new ArrayList<MemoryBuffer>());
    } else {
    // Empty range: nothing to read.
    if (cOffset == endCOffset)
        return null;
    // A null codec means the data is treated as uncompressed.
    boolean isCompressed = codec != null;
    // These work lists are only populated on the compressed path; they stay null otherwise.
    List<ProcCacheChunk> toDecompress = null;
    List<ByteBuffer> toRelease = null;
    List<IncompleteCb> badEstimates = null;
    if (isCompressed) {
        toRelease = !dataReader.isTrackingDiskRanges() ? null : new ArrayList<ByteBuffer>();
        toDecompress = new ArrayList<>();
        badEstimates = new ArrayList<>();
    // 1. Find our bearings in the stream. Normally, iter will already point either to where we
    // want to be, or just before. However, RGs can overlap due to encoding, so we may have
    // to return to a previous block.
    DiskRangeList current = findExactPosition(start, cOffset);
    if (isTracingEnabled) {
        LOG.trace("Starting read for [" + cOffset + "," + endCOffset + ") at " + current);
    CacheChunk lastUncompressed = null;
    // 2. Go thru the blocks; add stuff to results and prepare the decompression work (see below).
    try {
        lastUncompressed = isCompressed ? prepareRangesForCompressedRead(cOffset, endCOffset, streamOffset, unlockUntilCOffset, current, csd, toRelease, toDecompress, badEstimates) : prepareRangesForUncompressedRead(cOffset, endCOffset, streamOffset, unlockUntilCOffset, current, csd);
    } catch (Exception ex) {
        // Log the read parameters for diagnosis, then rethrow.
        LOG.error("Failed " + (isCompressed ? "" : "un") + "compressed read; cOffset " + cOffset + ", endCOffset " + endCOffset + ", streamOffset " + streamOffset + ", unlockUntilCOffset " + unlockUntilCOffset + "; ranges passed in " + RecordReaderUtils.stringifyDiskRanges(start) + "; ranges passed to prepare " + // Don't log exception here.
        // IOExceptions propagate unchanged; any other exception is wrapped in an IOException.
        throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
    // 2.5. Remember the bad estimates for future reference.
    if (badEstimates != null && !badEstimates.isEmpty()) {
        // Relies on the fact that cache does not actually store these.
        DiskRange[] cacheKeys = badEstimates.toArray(new DiskRange[badEstimates.size()]);
        // NOTE(review): fileKey is used here without the null check that guards step 6 — confirm that is intended.
        long[] result = cacheWrapper.putFileData(fileKey, cacheKeys, null, baseOffset);
        // We don't expect conflicts from bad estimates.
        assert result == null;
    // Nothing to do.
    if (toDecompress == null || toDecompress.isEmpty())
        return lastUncompressed;
    // 3. Allocate the buffers, prepare cache keys.
    // At this point, we have read all the CBs we need to read. cacheBuffers contains some cache
    // data and some unallocated membufs for decompression. toDecompress contains all the work we
    // need to do, and each item points to one of the membufs in cacheBuffers as target. The iter
    // has also been adjusted to point to these buffers instead of compressed data for the ranges.
    MemoryBuffer[] targetBuffers = new MemoryBuffer[toDecompress.size()];
    DiskRange[] cacheKeys = new DiskRange[toDecompress.size()];
    int ix = 0;
    for (ProcCacheChunk chunk : toDecompress) {
        // Relies on the fact that cache does not actually store these.
        // NOTE(review): ix is never advanced in the visible excerpt (increment presumably elided) — confirm.
        cacheKeys[ix] = chunk;
        targetBuffers[ix] = chunk.getBuffer();
    cacheWrapper.getAllocator().allocateMultiple(targetBuffers, bufferSize);
    // 4. Now decompress (or copy) the data into cache buffers.
    for (ProcCacheChunk chunk : toDecompress) {
        ByteBuffer dest = chunk.getBuffer().getByteBufferRaw();
        if (chunk.isOriginalDataCompressed) {
            decompressChunk(chunk.originalData, codec, dest);
        } else {
            copyUncompressedChunk(chunk.originalData, dest);
        // Drop the reference to the source data once it has been written into dest.
        chunk.originalData = null;
        if (isTracingEnabled) {
            LOG.trace("Locking " + chunk.getBuffer() + " due to reuse (after decompression)");
    // 5. Release original compressed buffers to zero-copy reader if needed.
    if (toRelease != null) {
        assert dataReader.isTrackingDiskRanges();
        for (ByteBuffer buffer : toRelease) {
    // 6. Finally, put uncompressed data to cache.
    if (fileKey != null) {
        long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
        processCacheCollisions(collisionMask, toDecompress, targetBuffers, csd.getCacheBuffers());
    //    Release initial refcounts.
    for (ProcCacheChunk chunk : toDecompress) {
        ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, chunk);
    return lastUncompressed;
Also used : DiskRangeList, ArrayList (java.util.ArrayList), IOException, ByteBuffer (java.nio.ByteBuffer), IOException, MemoryBuffer, DiskRange

Example 15 with MemoryBuffer

Use of MemoryBuffer in project hive by apache.

the class EncodedReaderImpl method preReadUncompressedStream.

/**
 * To achieve some sort of consistent cache boundaries, we will cache streams deterministically;
 * in segments starting w/stream start, and going for either stream size or some fixed size.
 * If we are not reading the entire segment's worth of data, then we will not cache the partial
 * RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much PITA
 * to handle just for this case.
 * We could avoid copy in non-zcr case and manage the buffer that was not allocated by our
 * allocator. Uncompressed case is not mainline though so let's not complicate it.
 * @param kind the stream kind; passed to the trace log when the pre-read of the stream starts.
 */
// NOTE(review): this listing is a truncated excerpt. Closing braces, the bodies of several
// brace-less/empty ifs, loop-exit statements, the ix increments and the right-hand sides of
// `current =;` / `next =;` are missing; consult the full EncodedReaderImpl source before
// relying on the control flow shown here.
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start, long streamOffset, long streamEnd, Kind kind) throws IOException {
    // Empty stream: nothing to pre-read.
    if (streamOffset == streamEnd)
        return null;
    List<UncompressedCacheChunk> toCache = null;
    // 1. Find our bearings in the stream.
    DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
    if (isTracingEnabled) {
        LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
    // Record the start of this stream in the trace, independently of the LOG trace above.
    trace.logStartStream(kind, streamOffset, streamEnd, streamOffset);
    if (streamOffset > current.getOffset()) {
        // Target compression block is in the middle of the range; slice the range in two.
        current = current.split(streamOffset).next;
    // Account for maximum cache buffer size.
    long streamLen = streamEnd - streamOffset;
    // partCount is streamLen / partSize rounded up.
    int partSize = determineUncompressedPartSize(), partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
    CacheChunk lastUncompressed = null;
    // One-element array handed to the copy helpers and reused below when only one chunk is cached.
    MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
    for (int i = 0; i < partCount; ++i) {
        // Each part covers [partOffset, partEnd); the last part is clipped to streamEnd.
        // NOTE(review): (i * partSize) is evaluated in int arithmetic before widening to long — confirm it cannot overflow.
        long partOffset = streamOffset + (i * partSize), partEnd = Math.min(partOffset + partSize, streamEnd);
        // We have 0 bytes of data for this part, for now.
        long hasEntirePartTo = partOffset;
        if (current == null) {
            // We have no data from this point on (could be unneeded), skip.
        assert partOffset <= current.getOffset();
        if (partOffset == current.getOffset() && current instanceof CacheChunk) {
            // We assume cache chunks would always match the way we read, so check and skip it.
            assert current.getOffset() == partOffset && current.getEnd() == partEnd;
            lastUncompressed = (CacheChunk) current;
            current =;
        if (current.getOffset() >= partEnd) {
            // We have no data at all for this part of the stream (could be unneeded), skip.
        // We have some disk buffers... see if we have entire part, etc.
        // We will cache if we have the entire part.
        UncompressedCacheChunk candidateCached = null;
        DiskRangeList next = current;
        while (true) {
            boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
            if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
                // We are missing a section at the end of the part... copy the start to non-cached.
                lastUncompressed = copyAndReplaceCandidateToNonCached(candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
                candidateCached = null;
            current = next;
            // Done with this part.
            if (noMoreDataForPart)
            if (current.getEnd() > partEnd) {
                // If the current buffer contains multiple parts, split it.
                current = current.split(partEnd);
            if (isTracingEnabled) {
                LOG.trace("Processing uncompressed file data at [" + current.getOffset() + ", " + current.getEnd() + ")");
            trace.logUncompressedData(current.getOffset(), current.getEnd());
            BufferChunk curBc = (BufferChunk) current;
            // Track if we still have the entire part.
            long hadEntirePartTo = hasEntirePartTo;
            // We have data until the end of current block if we had it until the beginning.
            // -1 marks that a gap was found, i.e. the part is no longer contiguous from its start.
            hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
            if (hasEntirePartTo == -1) {
                // with gaps, but it's probably not needed.
                if (candidateCached != null) {
                    assert hadEntirePartTo != -1;
                    copyAndReplaceCandidateToNonCached(candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
                    candidateCached = null;
                lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
                // There may be more data after the gap.
                next =;
            } else {
                // So far we have all the data from the beginning of the part.
                if (candidateCached == null) {
                    candidateCached = new UncompressedCacheChunk(curBc);
                } else {
                next =;
        if (candidateCached != null) {
            if (toCache == null) {
                // Sized for the parts that remain, including the current one.
                toCache = new ArrayList<>(partCount - i);
    // Nothing to copy and cache.
    if (toCache == null)
        return lastUncompressed;
    // Reuse the one-element array when exactly one chunk is to be cached; slot 0 is cleared
    // because singleAlloc may have been used by the copy helpers above.
    MemoryBuffer[] targetBuffers = toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
    targetBuffers[0] = null;
    DiskRange[] cacheKeys = new DiskRange[toCache.size()];
    int ix = 0;
    for (UncompressedCacheChunk chunk : toCache) {
        // Relies on the fact that cache does not actually store these.
        // NOTE(review): ix is never advanced in the visible excerpt (increment presumably elided) — confirm.
        cacheKeys[ix] = chunk;
    // A single-part stream is allocated at its exact length; otherwise every buffer is partSize.
    cacheWrapper.getAllocator().allocateMultiple(targetBuffers, (int) (partCount == 1 ? streamLen : partSize), cacheWrapper.getDataBufferFactory());
    // 4. Now copy the data into cache buffers.
    ix = 0;
    for (UncompressedCacheChunk candidateCached : toCache) {
        ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
        copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached, true);
        lastUncompressed = candidateCached;
    // 5. Put uncompressed data to cache.
    if (fileKey != null) {
        long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset, tag);
        processCacheCollisions(collisionMask, toCache, targetBuffers, null);
    return lastUncompressed;
Also used : DiskRangeList, BufferChunk (org.apache.orc.impl.BufferChunk), ByteBuffer (java.nio.ByteBuffer), MemoryBuffer, DiskRange


MemoryBuffer ( ByteBuffer (java.nio.ByteBuffer)12 DiskRangeList ( DiskRange ( IOException ( CreateHelper ( ColumnStreamData ( OrcProto (org.apache.orc.OrcProto)4 BufferChunk (org.apache.orc.impl.BufferChunk)4 CodedInputStream ( InputStream ( ArrayList (java.util.ArrayList)2 IdentityHashMap (java.util.IdentityHashMap)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 MutateHelper ( LlapBufferOrBuffers ( CacheChunk ( Stream (org.apache.orc.OrcProto.Stream)2 Kind (org.apache.orc.OrcProto.Stream.Kind)2 InStream (org.apache.orc.impl.InStream)2