Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class OrcEncodedDataReader, method returnData.
@Override
public void returnData(OrcEncodedColumnBatch ecb) {
  for (int colIx = 0; colIx < ecb.getTotalColCount(); ++colIx) {
    if (!ecb.hasData(colIx)) continue;
    ColumnStreamData[] datas = ecb.getColumnData(colIx);
    for (ColumnStreamData data : datas) {
      if (data == null || data.decRef() != 0) continue;
      if (LlapIoImpl.LOCKING_LOGGER.isTraceEnabled()) {
        for (MemoryBuffer buf : data.getCacheBuffers()) {
          LlapIoImpl.LOCKING_LOGGER.trace("Unlocking {} at the end of processing", buf);
        }
      }
      bufferManager.decRefBuffers(data.getCacheBuffers());
      CSD_POOL.offer(data);
    }
  }
  // We can offer ECB even with some streams not discarded; reset() will clear the arrays.
  ECB_POOL.offer(ecb);
}
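The method above combines reference counting with object pooling: each ColumnStreamData goes back to its pool, and its cache buffers are unlocked, only when the caller that drops the refcount to zero returns it. Below is a minimal standalone sketch of that lifecycle; RefCountedStream and SimplePool are illustrative stand-ins, not Hive classes.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.concurrent.atomic.AtomicInteger;

final class RefCountedStream {
  private final AtomicInteger refCount = new AtomicInteger(1);

  /** Returns the new count; the caller that sees 0 owns the cleanup. */
  int decRef() {
    return refCount.decrementAndGet();
  }

  void incRef() {
    refCount.incrementAndGet();
  }
}

final class SimplePool<T> {
  private final Deque<T> pool = new ArrayDeque<>();

  synchronized void offer(T item) {
    pool.push(item);
  }

  synchronized T take() {
    return pool.isEmpty() ? null : pool.pop();
  }
}

public class ReturnDataSketch {
  public static void main(String[] args) {
    SimplePool<RefCountedStream> pool = new SimplePool<>();
    RefCountedStream data = new RefCountedStream();
    data.incRef(); // a second consumer still holds the stream

    // First return: refcount drops to 1, so nothing is released yet.
    if (data.decRef() == 0) {
      pool.offer(data);
    }
    // Second return: refcount hits 0, so the buffers would be unlocked and
    // the object goes back to the pool, mirroring returnData() above.
    if (data.decRef() == 0) {
      pool.offer(data);
      System.out.println("released and pooled");
    }
  }
}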
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class TestLowLevelCacheImpl, method verifyCacheGet.
private void verifyCacheGet(LowLevelCacheImpl cache, long fileId, Object... stuff) {
  CreateHelper list = new CreateHelper();
  DiskRangeList iter = null;
  int intCount = 0, lastInt = -1;
  int resultCount = stuff.length;
  for (Object obj : stuff) {
    if (obj instanceof Integer) {
      --resultCount;
      assertTrue(intCount >= 0);
      if (intCount == 0) {
        lastInt = (Integer) obj;
        intCount = 1;
      } else {
        list.addOrMerge(lastInt, (Integer) obj, true, true);
        intCount = 0;
      }
      continue;
    } else if (intCount >= 0) {
      assertTrue(intCount == 0);
      intCount = -1;
      iter = cache.getFileData(fileId, list.get(), 0, testFactory, null, null);
      assertEquals(resultCount, iter.listSize());
    }
    assertTrue(iter != null);
    if (obj instanceof MemoryBuffer) {
      assertTrue(iter instanceof CacheChunk);
      assertSame(obj, ((CacheChunk) iter).getBuffer());
    } else {
      assertTrue(iter.equals(obj));
    }
    iter = iter.next;
  }
}
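The varargs of this helper encode both the request and the expectation: leading integers are consumed in (start, end) pairs and merged into the requested range list; once the first non-integer appears, the helper issues getFileData and then walks the returned DiskRangeList, matching each remaining argument in order (a MemoryBuffer must be the buffer of a CacheChunk, anything else is compared with equals). A small self-contained sketch of that argument convention follows, with strings standing in for buffers; all names here are illustrative, not test code from Hive.

import java.util.ArrayList;
import java.util.List;

public class VerifyCacheGetArgsSketch {
  public static void main(String[] args) {
    // Stand-ins for fake MemoryBuffers that the test would expect back.
    Object fakeBuffer1 = "fakeBuffer1", fakeBuffer2 = "fakeBuffer2";
    // Request ranges [0, 100) and [200, 300); expect two chunks in that order.
    Object[] stuff = { 0, 100, 200, 300, fakeBuffer1, fakeBuffer2 };

    List<long[]> requestedRanges = new ArrayList<>();
    List<Object> expectedResults = new ArrayList<>();
    Integer pendingStart = null;
    boolean inIntPrefix = true;
    for (Object obj : stuff) {
      if (inIntPrefix && obj instanceof Integer) {
        if (pendingStart == null) {
          pendingStart = (Integer) obj; // first half of a (start, end) pair
        } else {
          requestedRanges.add(new long[] { pendingStart, (Integer) obj });
          pendingStart = null;
        }
      } else {
        inIntPrefix = false;
        expectedResults.add(obj); // one expected result per returned list node
      }
    }
    System.out.println("Requested " + requestedRanges.size() + " ranges");
    System.out.println("Expected results in order: " + expectedResults);
  }
}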
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class EncodedReaderImpl, method preReadUncompressedStream.
/**
 * To achieve some sort of consistent cache boundaries, we will cache streams deterministically;
 * in segments starting w/stream start, and going for either stream size or some fixed size.
 * If we are not reading the entire segment's worth of data, then we will not cache the partial
 * RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much PITA
 * to handle just for this case.
 * We could avoid copy in non-zcr case and manage the buffer that was not allocated by our
 * allocator. Uncompressed case is not mainline though so let's not complicate it.
 */
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start,
    long streamOffset, long streamEnd) throws IOException {
  if (streamOffset == streamEnd) return null;
  List<UncompressedCacheChunk> toCache = null;
  List<ByteBuffer> toRelease = null;
  // 1. Find our bearings in the stream.
  DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
  if (isTracingEnabled) {
    LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
  }
  if (streamOffset > current.getOffset()) {
    // Target compression block is in the middle of the range; slice the range in two.
    current = current.split(streamOffset).next;
  }
  // Account for maximum cache buffer size.
  long streamLen = streamEnd - streamOffset;
  int partSize = determineUncompressedPartSize(),
      partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
  CacheChunk lastUncompressed = null;
  MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
  for (int i = 0; i < partCount; ++i) {
    long partOffset = streamOffset + (i * partSize),
        partEnd = Math.min(partOffset + partSize, streamEnd);
    // We have 0 bytes of data for this part, for now.
    long hasEntirePartTo = partOffset;
    if (current == null) {
      // We have no data from this point on (could be unneeded), skip.
      break;
    }
    assert partOffset <= current.getOffset();
    if (partOffset == current.getOffset() && current instanceof CacheChunk) {
      // We assume cache chunks would always match the way we read, so check and skip it.
      assert current.getOffset() == partOffset && current.getEnd() == partEnd;
      lastUncompressed = (CacheChunk) current;
      current = current.next;
      continue;
    }
    if (current.getOffset() >= partEnd) {
      // We have no data at all for this part of the stream (could be unneeded), skip.
      continue;
    }
    if (toRelease == null && dataReader.isTrackingDiskRanges()) {
      toRelease = new ArrayList<ByteBuffer>();
    }
    // We have some disk buffers... see if we have entire part, etc.
    // We will cache if we have the entire part.
    UncompressedCacheChunk candidateCached = null;
    DiskRangeList next = current;
    while (true) {
      boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
      if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
        // We are missing a section at the end of the part... copy the start to non-cached.
        lastUncompressed = copyAndReplaceCandidateToNonCached(
            candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
        candidateCached = null;
      }
      current = next;
      // Done with this part.
      if (noMoreDataForPart) break;
      boolean wasSplit = false;
      if (current.getEnd() > partEnd) {
        // If the current buffer contains multiple parts, split it.
        current = current.split(partEnd);
        wasSplit = true;
      }
      if (isTracingEnabled) {
        LOG.trace("Processing uncompressed file data at ["
            + current.getOffset() + ", " + current.getEnd() + ")");
      }
      BufferChunk curBc = (BufferChunk) current;
      if (!wasSplit && toRelease != null) {
        // TODO: is it valid to give zcr the modified 2nd part?
        toRelease.add(curBc.getChunk());
      }
      // Track if we still have the entire part.
      long hadEntirePartTo = hasEntirePartTo;
      // We have data until the end of current block if we had it until the beginning.
      hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
      if (hasEntirePartTo == -1) {
        // with gaps, but it's probably not needed.
        if (candidateCached != null) {
          assert hadEntirePartTo != -1;
          copyAndReplaceCandidateToNonCached(
              candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
          candidateCached = null;
        }
        lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
        // There may be more data after the gap.
        next = lastUncompressed.next;
      } else {
        // So far we have all the data from the beginning of the part.
        if (candidateCached == null) {
          candidateCached = new UncompressedCacheChunk(curBc);
        } else {
          candidateCached.addChunk(curBc);
        }
        next = current.next;
      }
    }
    if (candidateCached != null) {
      if (toCache == null) {
        toCache = new ArrayList<>(partCount - i);
      }
      toCache.add(candidateCached);
    }
  }
  // Nothing to copy and cache.
  if (toCache == null) return lastUncompressed;
  MemoryBuffer[] targetBuffers =
      toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
  targetBuffers[0] = null;
  DiskRange[] cacheKeys = new DiskRange[toCache.size()];
  int ix = 0;
  for (UncompressedCacheChunk chunk : toCache) {
    // Relies on the fact that cache does not actually store these.
    cacheKeys[ix] = chunk;
    ++ix;
  }
  cacheWrapper.getAllocator().allocateMultiple(
      targetBuffers, (int) (partCount == 1 ? streamLen : partSize));
  // 4. Now copy the data into cache buffers.
  ix = 0;
  for (UncompressedCacheChunk candidateCached : toCache) {
    candidateCached.setBuffer(targetBuffers[ix]);
    ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
    copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached);
    candidateCached.clear();
    lastUncompressed = candidateCached;
    ++ix;
  }
  // 5. Release original compressed buffers to zero-copy reader if needed.
  if (toRelease != null) {
    assert dataReader.isTrackingDiskRanges();
    for (ByteBuffer buf : toRelease) {
      dataReader.releaseBuffer(buf);
    }
  }
  // 6. Finally, put uncompressed data to cache.
  if (fileKey != null) {
    long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
    processCacheCollisions(collisionMask, toCache, targetBuffers, null);
  }
  return lastUncompressed;
}
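The segmentation above is a plain ceiling division: the stream is cut into partCount parts of at most partSize bytes starting at the stream offset, and the final part may be shorter. A small standalone sketch of that arithmetic with concrete numbers follows; the 4 MB part size is only an assumption for illustration, since the real value comes from determineUncompressedPartSize().

public class UncompressedPartSketch {
  public static void main(String[] args) {
    long streamOffset = 10_000_000L, streamEnd = 19_500_000L;
    int partSize = 4 * 1024 * 1024; // assumed part size, for illustration only

    long streamLen = streamEnd - streamOffset;
    int partCount = (int) (streamLen / partSize) + ((streamLen % partSize != 0) ? 1 : 0);

    // streamLen = 9,500,000 and partSize = 4,194,304, so partCount = 3:
    // two full parts plus a shorter trailing part.
    for (int i = 0; i < partCount; ++i) {
      long partOffset = streamOffset + (long) i * partSize;
      long partEnd = Math.min(partOffset + partSize, streamEnd);
      System.out.println("part " + i + ": [" + partOffset + ", " + partEnd + ")");
    }
  }
}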
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class EncodedReaderImpl, method readEncodedStream.
/**
 * Uncompresses part of the stream. RGs can overlap, so we cannot just go and decompress
 * and remove what we have returned. We will keep iterator as a "hint" point.
 * @param baseOffset Absolute offset of boundaries and ranges relative to file, for cache keys.
 * @param start Ordered ranges containing file data. Helpful if they point close to cOffset.
 * @param cOffset Start offset to decompress.
 * @param endCOffset End offset to decompress; estimate, partial CBs will be ignored.
 * @param csd Stream data, to add the results.
 * @param unlockUntilCOffset The offset until which the buffers can be unlocked in cache, as
 *                           they will not be used in future calls (see the class comment in
 *                           EncodedReaderImpl about refcounts).
 * @return Last buffer cached during decompression. Cache buffers are never removed from
 *         the master list, so they are safe to keep as iterators for various streams.
 */
public DiskRangeList readEncodedStream(long baseOffset, DiskRangeList start, long cOffset,
    long endCOffset, ColumnStreamData csd, long unlockUntilCOffset, long streamOffset)
    throws IOException {
  if (csd.getCacheBuffers() == null) {
    csd.setCacheBuffers(new ArrayList<MemoryBuffer>());
  } else {
    csd.getCacheBuffers().clear();
  }
  if (cOffset == endCOffset) return null;
  boolean isCompressed = codec != null;
  List<ProcCacheChunk> toDecompress = null;
  List<ByteBuffer> toRelease = null;
  List<IncompleteCb> badEstimates = null;
  if (isCompressed) {
    toRelease = !dataReader.isTrackingDiskRanges() ? null : new ArrayList<ByteBuffer>();
    toDecompress = new ArrayList<>();
    badEstimates = new ArrayList<>();
  }
  // 1. Find our bearings in the stream. Normally, iter will already point either to where we
  //    want to be, or just before. However, RGs can overlap due to encoding, so we may have
  //    to return to a previous block.
  DiskRangeList current = findExactPosition(start, cOffset);
  if (isTracingEnabled) {
    LOG.trace("Starting read for [" + cOffset + "," + endCOffset + ") at " + current);
  }
  CacheChunk lastUncompressed = null;
  // 2. Go thru the blocks; add stuff to results and prepare the decompression work (see below).
  try {
    lastUncompressed = isCompressed
        ? prepareRangesForCompressedRead(cOffset, endCOffset, streamOffset, unlockUntilCOffset,
            current, csd, toRelease, toDecompress, badEstimates)
        : prepareRangesForUncompressedRead(cOffset, endCOffset, streamOffset, unlockUntilCOffset,
            current, csd);
  } catch (Exception ex) {
    // Don't log exception here.
    LOG.error("Failed " + (isCompressed ? "" : "un") + "compressed read; cOffset " + cOffset
        + ", endCOffset " + endCOffset + ", streamOffset " + streamOffset
        + ", unlockUntilCOffset " + unlockUntilCOffset
        + "; ranges passed in " + RecordReaderUtils.stringifyDiskRanges(start)
        + "; ranges passed to prepare " + RecordReaderUtils.stringifyDiskRanges(current));
    throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
  }
  // 2.5. Remember the bad estimates for future reference.
  if (badEstimates != null && !badEstimates.isEmpty()) {
    // Relies on the fact that cache does not actually store these.
    DiskRange[] cacheKeys = badEstimates.toArray(new DiskRange[badEstimates.size()]);
    long[] result = cacheWrapper.putFileData(fileKey, cacheKeys, null, baseOffset);
    // We don't expect conflicts from bad estimates.
    assert result == null;
  }
  // Nothing to do.
  if (toDecompress == null || toDecompress.isEmpty()) return lastUncompressed;
  // 3. Allocate the buffers, prepare cache keys.
  //    At this point, we have read all the CBs we need to read. cacheBuffers contains some cache
  //    data and some unallocated membufs for decompression. toDecompress contains all the work we
  //    need to do, and each item points to one of the membufs in cacheBuffers as target. The iter
  //    has also been adjusted to point to these buffers instead of compressed data for the ranges.
  MemoryBuffer[] targetBuffers = new MemoryBuffer[toDecompress.size()];
  DiskRange[] cacheKeys = new DiskRange[toDecompress.size()];
  int ix = 0;
  for (ProcCacheChunk chunk : toDecompress) {
    // Relies on the fact that cache does not actually store these.
    cacheKeys[ix] = chunk;
    targetBuffers[ix] = chunk.getBuffer();
    ++ix;
  }
  cacheWrapper.getAllocator().allocateMultiple(targetBuffers, bufferSize);
  // 4. Now decompress (or copy) the data into cache buffers.
  for (ProcCacheChunk chunk : toDecompress) {
    ByteBuffer dest = chunk.getBuffer().getByteBufferRaw();
    if (chunk.isOriginalDataCompressed) {
      decompressChunk(chunk.originalData, codec, dest);
    } else {
      copyUncompressedChunk(chunk.originalData, dest);
    }
    chunk.originalData = null;
    if (isTracingEnabled) {
      LOG.trace("Locking " + chunk.getBuffer() + " due to reuse (after decompression)");
    }
    cacheWrapper.reuseBuffer(chunk.getBuffer());
  }
  // 5. Release original compressed buffers to zero-copy reader if needed.
  if (toRelease != null) {
    assert dataReader.isTrackingDiskRanges();
    for (ByteBuffer buffer : toRelease) {
      dataReader.releaseBuffer(buffer);
    }
  }
  // 6. Finally, put uncompressed data to cache.
  if (fileKey != null) {
    long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
    processCacheCollisions(collisionMask, toDecompress, targetBuffers, csd.getCacheBuffers());
  }
  // Release initial refcounts.
  for (ProcCacheChunk chunk : toDecompress) {
    ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, chunk);
  }
  return lastUncompressed;
}
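Steps 3 and 4 above follow a batch-allocation shape: all target buffers are obtained from the allocator in a single allocateMultiple() call, and only afterwards is each compression block decompressed (or copied) into its own buffer. Below is a minimal standalone sketch of that shape using plain ByteBuffers in place of Hive's allocator and codec; all names here are illustrative, not Hive APIs.

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

public class BatchAllocateSketch {
  /** Stand-in for the allocator: fills every slot of the array in one call. */
  static void allocateMultiple(ByteBuffer[] target, int bufferSize) {
    for (int i = 0; i < target.length; ++i) {
      target[i] = ByteBuffer.allocateDirect(bufferSize);
    }
  }

  public static void main(String[] args) {
    int bufferSize = 256 * 1024; // assumed buffer size, for illustration only
    List<ByteBuffer> toDecompress = new ArrayList<>();
    toDecompress.add(ByteBuffer.wrap("compressed block 1".getBytes()));
    toDecompress.add(ByteBuffer.wrap("compressed block 2".getBytes()));

    // 3. One allocation call for all targets (mirrors allocateMultiple above).
    ByteBuffer[] targetBuffers = new ByteBuffer[toDecompress.size()];
    allocateMultiple(targetBuffers, bufferSize);

    // 4. Fill each target from its source; the real code decompresses here.
    for (int i = 0; i < targetBuffers.length; ++i) {
      targetBuffers[i].put(toDecompress.get(i).duplicate());
      targetBuffers[i].flip();
    }
    System.out.println("Filled " + targetBuffers.length + " buffers of " + bufferSize + " bytes");
  }
}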
Use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
The class EncodedReaderImpl, method preReadUncompressedStream (variant with IO tracing).
/**
 * To achieve some sort of consistent cache boundaries, we will cache streams deterministically;
 * in segments starting w/stream start, and going for either stream size or some fixed size.
 * If we are not reading the entire segment's worth of data, then we will not cache the partial
 * RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much PITA
 * to handle just for this case.
 * We could avoid copy in non-zcr case and manage the buffer that was not allocated by our
 * allocator. Uncompressed case is not mainline though so let's not complicate it.
 * @param kind The stream kind, used for IO tracing.
 */
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start,
    long streamOffset, long streamEnd, Kind kind) throws IOException {
  if (streamOffset == streamEnd) return null;
  List<UncompressedCacheChunk> toCache = null;
  // 1. Find our bearings in the stream.
  DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
  if (isTracingEnabled) {
    LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
  }
  trace.logStartStream(kind, streamOffset, streamEnd, streamOffset);
  trace.logStartRead(current);
  if (streamOffset > current.getOffset()) {
    // Target compression block is in the middle of the range; slice the range in two.
    current = current.split(streamOffset).next;
  }
  // Account for maximum cache buffer size.
  long streamLen = streamEnd - streamOffset;
  int partSize = determineUncompressedPartSize(),
      partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
  CacheChunk lastUncompressed = null;
  MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
  for (int i = 0; i < partCount; ++i) {
    long partOffset = streamOffset + (i * partSize),
        partEnd = Math.min(partOffset + partSize, streamEnd);
    // We have 0 bytes of data for this part, for now.
    long hasEntirePartTo = partOffset;
    if (current == null) {
      // We have no data from this point on (could be unneeded), skip.
      break;
    }
    assert partOffset <= current.getOffset();
    if (partOffset == current.getOffset() && current instanceof CacheChunk) {
      // We assume cache chunks would always match the way we read, so check and skip it.
      assert current.getOffset() == partOffset && current.getEnd() == partEnd;
      lastUncompressed = (CacheChunk) current;
      current = current.next;
      continue;
    }
    if (current.getOffset() >= partEnd) {
      // We have no data at all for this part of the stream (could be unneeded), skip.
      continue;
    }
    // We have some disk buffers... see if we have entire part, etc.
    // We will cache if we have the entire part.
    UncompressedCacheChunk candidateCached = null;
    DiskRangeList next = current;
    while (true) {
      boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
      if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
        // We are missing a section at the end of the part... copy the start to non-cached.
        lastUncompressed = copyAndReplaceCandidateToNonCached(
            candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
        candidateCached = null;
      }
      current = next;
      // Done with this part.
      if (noMoreDataForPart) break;
      if (current.getEnd() > partEnd) {
        // If the current buffer contains multiple parts, split it.
        current = current.split(partEnd);
      }
      if (isTracingEnabled) {
        LOG.trace("Processing uncompressed file data at ["
            + current.getOffset() + ", " + current.getEnd() + ")");
      }
      trace.logUncompressedData(current.getOffset(), current.getEnd());
      BufferChunk curBc = (BufferChunk) current;
      // Track if we still have the entire part.
      long hadEntirePartTo = hasEntirePartTo;
      // We have data until the end of current block if we had it until the beginning.
      hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
      if (hasEntirePartTo == -1) {
        // with gaps, but it's probably not needed.
        if (candidateCached != null) {
          assert hadEntirePartTo != -1;
          copyAndReplaceCandidateToNonCached(
              candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
          candidateCached = null;
        }
        lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
        // There may be more data after the gap.
        next = lastUncompressed.next;
      } else {
        // So far we have all the data from the beginning of the part.
        if (candidateCached == null) {
          candidateCached = new UncompressedCacheChunk(curBc);
        } else {
          candidateCached.addChunk(curBc);
        }
        next = current.next;
      }
    }
    if (candidateCached != null) {
      if (toCache == null) {
        toCache = new ArrayList<>(partCount - i);
      }
      toCache.add(candidateCached);
    }
  }
  // Nothing to copy and cache.
  if (toCache == null) return lastUncompressed;
  MemoryBuffer[] targetBuffers =
      toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
  targetBuffers[0] = null;
  DiskRange[] cacheKeys = new DiskRange[toCache.size()];
  int ix = 0;
  for (UncompressedCacheChunk chunk : toCache) {
    // Relies on the fact that cache does not actually store these.
    cacheKeys[ix] = chunk;
    ++ix;
  }
  cacheWrapper.getAllocator().allocateMultiple(targetBuffers,
      (int) (partCount == 1 ? streamLen : partSize), cacheWrapper.getDataBufferFactory());
  // 4. Now copy the data into cache buffers.
  ix = 0;
  for (UncompressedCacheChunk candidateCached : toCache) {
    candidateCached.setBuffer(targetBuffers[ix]);
    ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
    copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached, true);
    candidateCached.clear();
    lastUncompressed = candidateCached;
    ++ix;
  }
  // 5. Put uncompressed data to cache.
  if (fileKey != null) {
    long[] collisionMask =
        cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset, tag);
    processCacheCollisions(collisionMask, toCache, targetBuffers, null);
  }
  return lastUncompressed;
}