Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class OrcEncodedDataReader, method returnData.
@Override
public void returnData(OrcEncodedColumnBatch ecb) {
  for (int colIx = 0; colIx < ecb.getTotalColCount(); ++colIx) {
    if (!ecb.hasData(colIx)) continue;
    ColumnStreamData[] datas = ecb.getColumnData(colIx);
    for (ColumnStreamData data : datas) {
      if (data == null || data.decRef() != 0) continue;
      if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
        for (MemoryBuffer buf : data.getCacheBuffers()) {
          LlapIoImpl.LOCKING_LOGGER.trace("Unlocking {} at the end of processing", buf);
        }
      }
      bufferManager.decRefBuffers(data.getCacheBuffers());
      CSD_POOL.offer(data);
    }
  }
  // We can offer ECB even with some streams not discarded; reset() will clear the arrays.
  ECB_POOL.offer(ecb);
}
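The method above combines reference counting with object pooling: each ColumnStreamData goes back to its pool, and its cache buffers are unlocked, only when the caller that drops the refcount to zero returns it. Below is a minimal standalone sketch of that lifecycle; RefCountedStream and SimplePool are illustrative stand-ins, not Hive classes.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.concurrent.atomic.AtomicInteger;

final class RefCountedStream {
  private final AtomicInteger refCount = new AtomicInteger(1);

  /** Returns the new count; the caller that sees 0 owns the cleanup. */
  int decRef() {
    return refCount.decrementAndGet();
  }

  void incRef() {
    refCount.incrementAndGet();
  }
}

final class SimplePool<T> {
  private final Deque<T> pool = new ArrayDeque<>();

  synchronized void offer(T item) {
    pool.push(item);
  }

  synchronized T take() {
    return pool.isEmpty() ? null : pool.pop();
  }
}

public class ReturnDataSketch {
  public static void main(String[] args) {
    SimplePool<RefCountedStream> pool = new SimplePool<>();
    RefCountedStream data = new RefCountedStream();
    data.incRef(); // a second consumer still holds the stream

    // First return: refcount drops to 1, so nothing is released yet.
    if (data.decRef() == 0) {
      pool.offer(data);
    }
    // Second return: refcount hits 0, so the buffers would be unlocked and
    // the object goes back to the pool, mirroring returnData() above.
    if (data.decRef() == 0) {
      pool.offer(data);
      System.out.println("released and pooled");
    }
  }
}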
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class TestLowLevelCacheImpl, method verifyCacheGet.
private void verifyCacheGet(LowLevelCacheImpl cache, long fileId, Object... stuff) {
  CreateHelper list = new CreateHelper();
  DiskRangeList iter = null;
  int intCount = 0, lastInt = -1;
  int resultCount = stuff.length;
  for (Object obj : stuff) {
    if (obj instanceof Integer) {
      --resultCount;
      assertTrue(intCount >= 0);
      if (intCount == 0) {
        lastInt = (Integer) obj;
        intCount = 1;
      } else {
        list.addOrMerge(lastInt, (Integer) obj, true, true);
        intCount = 0;
      }
      continue;
    } else if (intCount >= 0) {
      assertTrue(intCount == 0);
      intCount = -1;
      iter = cache.getFileData(fileId, list.get(), 0, testFactory, null, null);
      assertEquals(resultCount, iter.listSize());
    }
    assertTrue(iter != null);
    if (obj instanceof MemoryBuffer) {
      assertTrue(iter instanceof CacheChunk);
      assertSame(obj, ((CacheChunk) iter).getBuffer());
    } else {
      assertTrue(iter.equals(obj));
    }
    iter = iter.next;
  }
}
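The varargs of this helper encode both the request and the expectation: leading integers are consumed in (start, end) pairs and merged into the requested range list; once the first non-integer appears, the helper issues getFileData and then walks the returned DiskRangeList, matching each remaining argument in order (a MemoryBuffer must be the buffer of a CacheChunk, anything else is compared with equals). A small self-contained sketch of that argument convention follows, with strings standing in for buffers; all names here are illustrative, not test code from Hive.

import java.util.ArrayList;
import java.util.List;

public class VerifyCacheGetArgsSketch {
  public static void main(String[] args) {
    // Stand-ins for fake MemoryBuffers that the test would expect back.
    Object fakeBuffer1 = "fakeBuffer1", fakeBuffer2 = "fakeBuffer2";
    // Request ranges [0, 100) and [200, 300); expect two chunks in that order.
    Object[] stuff = { 0, 100, 200, 300, fakeBuffer1, fakeBuffer2 };

    List<long[]> requestedRanges = new ArrayList<>();
    List<Object> expectedResults = new ArrayList<>();
    Integer pendingStart = null;
    boolean inIntPrefix = true;
    for (Object obj : stuff) {
      if (inIntPrefix && obj instanceof Integer) {
        if (pendingStart == null) {
          pendingStart = (Integer) obj; // first half of a (start, end) pair
        } else {
          requestedRanges.add(new long[] { pendingStart, (Integer) obj });
          pendingStart = null;
        }
      } else {
        inIntPrefix = false;
        expectedResults.add(obj); // one expected result per returned list node
      }
    }
    System.out.println("Requested " + requestedRanges.size() + " ranges");
    System.out.println("Expected results in order: " + expectedResults);
  }
}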
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class EncodedReaderImpl, method preReadUncompressedStream.
/**
 * To achieve some sort of consistent cache boundaries, we will cache streams deterministically;
 * in segments starting w/stream start, and going for either stream size or some fixed size.
 * If we are not reading the entire segment's worth of data, then we will not cache the partial
 * RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much PITA
 * to handle just for this case.
 * We could avoid copy in non-zcr case and manage the buffer that was not allocated by our
 * allocator. Uncompressed case is not mainline though so let's not complicate it.
 */
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start,
    long streamOffset, long streamEnd) throws IOException {
  if (streamOffset == streamEnd) return null;
  List<UncompressedCacheChunk> toCache = null;
  List<ByteBuffer> toRelease = null;
  // 1. Find our bearings in the stream.
  DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
  if (isTracingEnabled) {
    LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
  }
  if (streamOffset > current.getOffset()) {
    // Target compression block is in the middle of the range; slice the range in two.
    current = current.split(streamOffset).next;
  }
  // Account for maximum cache buffer size.
  long streamLen = streamEnd - streamOffset;
  int partSize = determineUncompressedPartSize(),
      partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
  CacheChunk lastUncompressed = null;
  MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
  for (int i = 0; i < partCount; ++i) {
    long partOffset = streamOffset + (i * partSize),
        partEnd = Math.min(partOffset + partSize, streamEnd);
    // We have 0 bytes of data for this part, for now.
    long hasEntirePartTo = partOffset;
    if (current == null) {
      // We have no data from this point on (could be unneeded), skip.
      break;
    }
    assert partOffset <= current.getOffset();
    if (partOffset == current.getOffset() && current instanceof CacheChunk) {
      // We assume cache chunks would always match the way we read, so check and skip it.
      assert current.getOffset() == partOffset && current.getEnd() == partEnd;
      lastUncompressed = (CacheChunk) current;
      current = current.next;
      continue;
    }
    if (current.getOffset() >= partEnd) {
      // We have no data at all for this part of the stream (could be unneeded), skip.
      continue;
    }
    if (toRelease == null && dataReader.isTrackingDiskRanges()) {
      toRelease = new ArrayList<ByteBuffer>();
    }
    // We have some disk buffers... see if we have entire part, etc.
    // We will cache if we have the entire part.
    UncompressedCacheChunk candidateCached = null;
    DiskRangeList next = current;
    while (true) {
      boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
      if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
        // We are missing a section at the end of the part... copy the start to non-cached.
        lastUncompressed = copyAndReplaceCandidateToNonCached(
            candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
        candidateCached = null;
      }
      current = next;
      // Done with this part.
      if (noMoreDataForPart) break;
      boolean wasSplit = false;
      if (current.getEnd() > partEnd) {
        // If the current buffer contains multiple parts, split it.
        current = current.split(partEnd);
        wasSplit = true;
      }
      if (isTracingEnabled) {
        LOG.trace("Processing uncompressed file data at ["
            + current.getOffset() + ", " + current.getEnd() + ")");
      }
      BufferChunk curBc = (BufferChunk) current;
      if (!wasSplit && toRelease != null) {
        // TODO: is it valid to give zcr the modified 2nd part?
        toRelease.add(curBc.getChunk());
      }
      // Track if we still have the entire part.
      long hadEntirePartTo = hasEntirePartTo;
      // We have data until the end of current block if we had it until the beginning.
      hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
      if (hasEntirePartTo == -1) {
        // with gaps, but it's probably not needed.
        if (candidateCached != null) {
          assert hadEntirePartTo != -1;
          copyAndReplaceCandidateToNonCached(
              candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
          candidateCached = null;
        }
        lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
        // There may be more data after the gap.
        next = lastUncompressed.next;
      } else {
        // So far we have all the data from the beginning of the part.
        if (candidateCached == null) {
          candidateCached = new UncompressedCacheChunk(curBc);
        } else {
          candidateCached.addChunk(curBc);
        }
        next = current.next;
      }
    }
    if (candidateCached != null) {
      if (toCache == null) {
        toCache = new ArrayList<>(partCount - i);
      }
      toCache.add(candidateCached);
    }
  }
  // Nothing to copy and cache.
  if (toCache == null) return lastUncompressed;
  MemoryBuffer[] targetBuffers =
      toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
  targetBuffers[0] = null;
  DiskRange[] cacheKeys = new DiskRange[toCache.size()];
  int ix = 0;
  for (UncompressedCacheChunk chunk : toCache) {
    // Relies on the fact that cache does not actually store these.
    cacheKeys[ix] = chunk;
    ++ix;
  }
  cacheWrapper.getAllocator().allocateMultiple(
      targetBuffers, (int) (partCount == 1 ? streamLen : partSize));
  // 4. Now copy the data into cache buffers.
  ix = 0;
  for (UncompressedCacheChunk candidateCached : toCache) {
    candidateCached.setBuffer(targetBuffers[ix]);
    ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
    copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached);
    candidateCached.clear();
    lastUncompressed = candidateCached;
    ++ix;
  }
  // 5. Release original compressed buffers to zero-copy reader if needed.
  if (toRelease != null) {
    assert dataReader.isTrackingDiskRanges();
    for (ByteBuffer buf : toRelease) {
      dataReader.releaseBuffer(buf);
    }
  }
  // 6. Finally, put uncompressed data to cache.
  if (fileKey != null) {
    long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
    processCacheCollisions(collisionMask, toCache, targetBuffers, null);
  }
  return lastUncompressed;
}
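The segmentation above is a plain ceiling division: the stream is cut into partCount parts of at most partSize bytes starting at the stream offset, and the final part may be shorter. A small standalone sketch of that arithmetic with concrete numbers follows; the 4 MB part size is only an assumption for illustration, since the real value comes from determineUncompressedPartSize().

public class UncompressedPartSketch {
  public static void main(String[] args) {
    long streamOffset = 10_000_000L, streamEnd = 19_500_000L;
    int partSize = 4 * 1024 * 1024; // assumed part size, for illustration only

    long streamLen = streamEnd - streamOffset;
    int partCount = (int) (streamLen / partSize) + ((streamLen % partSize != 0) ? 1 : 0);

    // streamLen = 9,500,000 and partSize = 4,194,304, so partCount = 3:
    // two full parts plus a shorter trailing part.
    for (int i = 0; i < partCount; ++i) {
      long partOffset = streamOffset + (long) i * partSize;
      long partEnd = Math.min(partOffset + partSize, streamEnd);
      System.out.println("part " + i + ": [" + partOffset + ", " + partEnd + ")");
    }
  }
}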
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class EncodedReaderImpl, method readEncodedStream.
/**
 * Uncompresses part of the stream. RGs can overlap, so we cannot just go and decompress
 * and remove what we have returned. We will keep iterator as a "hint" point.
 * @param baseOffset Absolute offset of boundaries and ranges relative to file, for cache keys.
 * @param start Ordered ranges containing file data. Helpful if they point close to cOffset.
 * @param cOffset Start offset to decompress.
 * @param endCOffset End offset to decompress; estimate, partial CBs will be ignored.
 * @param csd Stream data, to add the results.
 * @param unlockUntilCOffset The offset until which the buffers can be unlocked in cache, as
 *                           they will not be used in future calls (see the class comment in
 *                           EncodedReaderImpl about refcounts).
 * @return Last buffer cached during decompression. Cache buffers are never removed from
 *         the master list, so they are safe to keep as iterators for various streams.
 */
public DiskRangeList readEncodedStream(long baseOffset, DiskRangeList start, long cOffset,
    long endCOffset, ColumnStreamData csd, long unlockUntilCOffset, long streamOffset)
    throws IOException {
  if (csd.getCacheBuffers() == null) {
    csd.setCacheBuffers(new ArrayList<MemoryBuffer>());
  } else {
    csd.getCacheBuffers().clear();
  }
  if (cOffset == endCOffset) return null;
  boolean isCompressed = codec != null;
  List<ProcCacheChunk> toDecompress = null;
  List<ByteBuffer> toRelease = null;
  List<IncompleteCb> badEstimates = null;
  if (isCompressed) {
    toRelease = !dataReader.isTrackingDiskRanges() ? null : new ArrayList<ByteBuffer>();
    toDecompress = new ArrayList<>();
    badEstimates = new ArrayList<>();
  }
  // 1. Find our bearings in the stream. Normally, iter will already point either to where we
  //    want to be, or just before. However, RGs can overlap due to encoding, so we may have
  //    to return to a previous block.
  DiskRangeList current = findExactPosition(start, cOffset);
  if (isTracingEnabled) {
    LOG.trace("Starting read for [" + cOffset + "," + endCOffset + ") at " + current);
  }
  CacheChunk lastUncompressed = null;
  // 2. Go thru the blocks; add stuff to results and prepare the decompression work (see below).
  try {
    lastUncompressed = isCompressed
        ? prepareRangesForCompressedRead(cOffset, endCOffset, streamOffset, unlockUntilCOffset,
            current, csd, toRelease, toDecompress, badEstimates)
        : prepareRangesForUncompressedRead(cOffset, endCOffset, streamOffset, unlockUntilCOffset,
            current, csd);
  } catch (Exception ex) {
    // Don't log exception here.
    LOG.error("Failed " + (isCompressed ? "" : "un") + "compressed read; cOffset " + cOffset
        + ", endCOffset " + endCOffset + ", streamOffset " + streamOffset
        + ", unlockUntilCOffset " + unlockUntilCOffset
        + "; ranges passed in " + RecordReaderUtils.stringifyDiskRanges(start)
        + "; ranges passed to prepare " + RecordReaderUtils.stringifyDiskRanges(current));
    throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
  }
  // 2.5. Remember the bad estimates for future reference.
  if (badEstimates != null && !badEstimates.isEmpty()) {
    // Relies on the fact that cache does not actually store these.
    DiskRange[] cacheKeys = badEstimates.toArray(new DiskRange[badEstimates.size()]);
    long[] result = cacheWrapper.putFileData(fileKey, cacheKeys, null, baseOffset);
    // We don't expect conflicts from bad estimates.
    assert result == null;
  }
  // Nothing to do.
  if (toDecompress == null || toDecompress.isEmpty()) return lastUncompressed;
  // 3. Allocate the buffers, prepare cache keys.
  //    At this point, we have read all the CBs we need to read. cacheBuffers contains some cache
  //    data and some unallocated membufs for decompression. toDecompress contains all the work we
  //    need to do, and each item points to one of the membufs in cacheBuffers as target. The iter
  //    has also been adjusted to point to these buffers instead of compressed data for the ranges.
  MemoryBuffer[] targetBuffers = new MemoryBuffer[toDecompress.size()];
  DiskRange[] cacheKeys = new DiskRange[toDecompress.size()];
  int ix = 0;
  for (ProcCacheChunk chunk : toDecompress) {
    // Relies on the fact that cache does not actually store these.
    cacheKeys[ix] = chunk;
    targetBuffers[ix] = chunk.getBuffer();
    ++ix;
  }
  cacheWrapper.getAllocator().allocateMultiple(targetBuffers, bufferSize);
  // 4. Now decompress (or copy) the data into cache buffers.
  for (ProcCacheChunk chunk : toDecompress) {
    ByteBuffer dest = chunk.getBuffer().getByteBufferRaw();
    if (chunk.isOriginalDataCompressed) {
      decompressChunk(chunk.originalData, codec, dest);
    } else {
      copyUncompressedChunk(chunk.originalData, dest);
    }
    chunk.originalData = null;
    if (isTracingEnabled) {
      LOG.trace("Locking " + chunk.getBuffer() + " due to reuse (after decompression)");
    }
    cacheWrapper.reuseBuffer(chunk.getBuffer());
  }
  // 5. Release original compressed buffers to zero-copy reader if needed.
  if (toRelease != null) {
    assert dataReader.isTrackingDiskRanges();
    for (ByteBuffer buffer : toRelease) {
      dataReader.releaseBuffer(buffer);
    }
  }
  // 6. Finally, put uncompressed data to cache.
  if (fileKey != null) {
    long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
    processCacheCollisions(collisionMask, toDecompress, targetBuffers, csd.getCacheBuffers());
  }
  // Release initial refcounts.
  for (ProcCacheChunk chunk : toDecompress) {
    ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, chunk);
  }
  return lastUncompressed;
}
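Steps 3 and 4 above follow a batch-allocation shape: all target buffers are obtained from the allocator in a single allocateMultiple() call, and only afterwards is each compression block decompressed (or copied) into its own buffer. Below is a minimal standalone sketch of that shape using plain ByteBuffers in place of Hive's allocator and codec; all names here are illustrative, not Hive APIs.

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

public class BatchAllocateSketch {
  /** Stand-in for the allocator: fills every slot of the array in one call. */
  static void allocateMultiple(ByteBuffer[] target, int bufferSize) {
    for (int i = 0; i < target.length; ++i) {
      target[i] = ByteBuffer.allocateDirect(bufferSize);
    }
  }

  public static void main(String[] args) {
    int bufferSize = 256 * 1024; // assumed buffer size, for illustration only
    List<ByteBuffer> toDecompress = new ArrayList<>();
    toDecompress.add(ByteBuffer.wrap("compressed block 1".getBytes()));
    toDecompress.add(ByteBuffer.wrap("compressed block 2".getBytes()));

    // 3. One allocation call for all targets (mirrors allocateMultiple above).
    ByteBuffer[] targetBuffers = new ByteBuffer[toDecompress.size()];
    allocateMultiple(targetBuffers, bufferSize);

    // 4. Fill each target from its source; the real code decompresses here.
    for (int i = 0; i < targetBuffers.length; ++i) {
      targetBuffers[i].put(toDecompress.get(i).duplicate());
      targetBuffers[i].flip();
    }
    System.out.println("Filled " + targetBuffers.length + " buffers of " + bufferSize + " bytes");
  }
}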
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class EncodedReaderImpl, method preReadUncompressedStream (variant with IO tracing).
/**
 * To achieve some sort of consistent cache boundaries, we will cache streams deterministically;
 * in segments starting w/stream start, and going for either stream size or some fixed size.
 * If we are not reading the entire segment's worth of data, then we will not cache the partial
 * RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much PITA
 * to handle just for this case.
 * We could avoid copy in non-zcr case and manage the buffer that was not allocated by our
 * allocator. Uncompressed case is not mainline though so let's not complicate it.
 * @param kind The stream kind, used for IO tracing.
 */
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start,
    long streamOffset, long streamEnd, Kind kind) throws IOException {
  if (streamOffset == streamEnd) return null;
  List<UncompressedCacheChunk> toCache = null;
  // 1. Find our bearings in the stream.
  DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
  if (isTracingEnabled) {
    LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
  }
  trace.logStartStream(kind, streamOffset, streamEnd, streamOffset);
  trace.logStartRead(current);
  if (streamOffset > current.getOffset()) {
    // Target compression block is in the middle of the range; slice the range in two.
    current = current.split(streamOffset).next;
  }
  // Account for maximum cache buffer size.
  long streamLen = streamEnd - streamOffset;
  int partSize = determineUncompressedPartSize(),
      partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
  CacheChunk lastUncompressed = null;
  MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
  for (int i = 0; i < partCount; ++i) {
    long partOffset = streamOffset + (i * partSize),
        partEnd = Math.min(partOffset + partSize, streamEnd);
    // We have 0 bytes of data for this part, for now.
    long hasEntirePartTo = partOffset;
    if (current == null) {
      // We have no data from this point on (could be unneeded), skip.
      break;
    }
    assert partOffset <= current.getOffset();
    if (partOffset == current.getOffset() && current instanceof CacheChunk) {
      // We assume cache chunks would always match the way we read, so check and skip it.
      assert current.getOffset() == partOffset && current.getEnd() == partEnd;
      lastUncompressed = (CacheChunk) current;
      current = current.next;
      continue;
    }
    if (current.getOffset() >= partEnd) {
      // We have no data at all for this part of the stream (could be unneeded), skip.
      continue;
    }
    // We have some disk buffers... see if we have entire part, etc.
    // We will cache if we have the entire part.
    UncompressedCacheChunk candidateCached = null;
    DiskRangeList next = current;
    while (true) {
      boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
      if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
        // We are missing a section at the end of the part... copy the start to non-cached.
        lastUncompressed = copyAndReplaceCandidateToNonCached(
            candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
        candidateCached = null;
      }
      current = next;
      // Done with this part.
      if (noMoreDataForPart) break;
      if (current.getEnd() > partEnd) {
        // If the current buffer contains multiple parts, split it.
        current = current.split(partEnd);
      }
      if (isTracingEnabled) {
        LOG.trace("Processing uncompressed file data at ["
            + current.getOffset() + ", " + current.getEnd() + ")");
      }
      trace.logUncompressedData(current.getOffset(), current.getEnd());
      BufferChunk curBc = (BufferChunk) current;
      // Track if we still have the entire part.
      long hadEntirePartTo = hasEntirePartTo;
      // We have data until the end of current block if we had it until the beginning.
      hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
      if (hasEntirePartTo == -1) {
        // with gaps, but it's probably not needed.
        if (candidateCached != null) {
          assert hadEntirePartTo != -1;
          copyAndReplaceCandidateToNonCached(
              candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
          candidateCached = null;
        }
        lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
        // There may be more data after the gap.
        next = lastUncompressed.next;
      } else {
        // So far we have all the data from the beginning of the part.
        if (candidateCached == null) {
          candidateCached = new UncompressedCacheChunk(curBc);
        } else {
          candidateCached.addChunk(curBc);
        }
        next = current.next;
      }
    }
    if (candidateCached != null) {
      if (toCache == null) {
        toCache = new ArrayList<>(partCount - i);
      }
      toCache.add(candidateCached);
    }
  }
  // Nothing to copy and cache.
  if (toCache == null) return lastUncompressed;
  MemoryBuffer[] targetBuffers =
      toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
  targetBuffers[0] = null;
  DiskRange[] cacheKeys = new DiskRange[toCache.size()];
  int ix = 0;
  for (UncompressedCacheChunk chunk : toCache) {
    // Relies on the fact that cache does not actually store these.
    cacheKeys[ix] = chunk;
    ++ix;
  }
  cacheWrapper.getAllocator().allocateMultiple(targetBuffers,
      (int) (partCount == 1 ? streamLen : partSize), cacheWrapper.getDataBufferFactory());
  // 4. Now copy the data into cache buffers.
  ix = 0;
  for (UncompressedCacheChunk candidateCached : toCache) {
    candidateCached.setBuffer(targetBuffers[ix]);
    ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
    copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached, true);
    candidateCached.clear();
    lastUncompressed = candidateCached;
    ++ix;
  }
  // 5. Put uncompressed data to cache.
  if (fileKey != null) {
    long[] collisionMask =
        cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset, tag);
    processCacheCollisions(collisionMask, toCache, targetBuffers, null);
  }
  return lastUncompressed;
}