Use of org.apache.orc.impl.BufferChunk in project hive by apache.
The class StreamUtils, method createDiskRangeInfo.
/**
 * Converts stream buffers to disk ranges.
 * @param streamBuffer - stream buffer whose cache buffers are converted
 * @return disk range info holding the resulting disk ranges and their total length
 */
public static DiskRangeInfo createDiskRangeInfo(ColumnStreamData streamBuffer) {
  DiskRangeInfo diskRangeInfo = new DiskRangeInfo(streamBuffer.getIndexBaseOffset());
  // See ctor comment.
  long offset = streamBuffer.getIndexBaseOffset();
  // TODO: we should get rid of this
  for (MemoryBuffer memoryBuffer : streamBuffer.getCacheBuffers()) {
    ByteBuffer buffer = memoryBuffer.getByteBufferDup();
    diskRangeInfo.addDiskRange(new BufferChunk(buffer, offset));
    offset += buffer.remaining();
  }
  return diskRangeInfo;
}
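The offset bookkeeping above can be illustrated in isolation: each cache buffer becomes one contiguous disk range, and the next range starts where the previous one ends. The sketch below uses plain java.nio.ByteBuffer and a hypothetical Range record instead of the Hive/ORC types, so the names and types here are assumptions for illustration, not the real API.

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

// Minimal sketch of the offset accumulation in createDiskRangeInfo.
// "Range" is a hypothetical stand-in for org.apache.orc.impl.BufferChunk.
public class RangeSketch {
  record Range(long offset, long end) {}

  static List<Range> toRanges(long baseOffset, List<ByteBuffer> buffers) {
    List<Range> ranges = new ArrayList<>();
    long offset = baseOffset;
    for (ByteBuffer buf : buffers) {
      // Each buffer covers [offset, offset + remaining) in the file.
      ranges.add(new Range(offset, offset + buf.remaining()));
      offset += buf.remaining();
    }
    return ranges;
  }

  public static void main(String[] args) {
    List<ByteBuffer> bufs = List.of(ByteBuffer.allocate(100), ByteBuffer.allocate(50));
    // Prints [Range[offset=1000, end=1100], Range[offset=1100, end=1150]]
    System.out.println(toRanges(1000, bufs));
  }
}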
Use of org.apache.orc.impl.BufferChunk in project hive by apache.
The class EncodedReaderImpl, method addOneCompressionBuffer.
/**
 * Reads one compression block from the source; handles compression blocks read from
 * multiple ranges (usually, that would only happen with zcr).
 * Adds stuff to cacheBuffers, toDecompress and toRelease (see below what each does).
 * @param current BufferChunk where the compression block starts.
 * @param cacheBuffers The result list to which the pre-allocated target cache buffers are added.
 * @param toDecompress The list of work to decompress - pairs of compressed buffers and the
 *                     target buffers (same as the ones added to cacheBuffers).
 * @param toRelease The list of buffers to release to zcr because they are no longer in use.
 * @param badEstimates The list of bad estimates that cannot be decompressed.
 * @return The resulting cache chunk.
 */
private ProcCacheChunk addOneCompressionBuffer(BufferChunk current, List<MemoryBuffer> cacheBuffers,
    List<ProcCacheChunk> toDecompress, List<ByteBuffer> toRelease,
    List<IncompleteCb> badEstimates) throws IOException {
  ByteBuffer slice = null;
  ByteBuffer compressed = current.getChunk();
  long cbStartOffset = current.getOffset();
  int b0 = compressed.get() & 0xff;
  int b1 = compressed.get() & 0xff;
  int b2 = compressed.get() & 0xff;
  int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >> 1);
  if (chunkLength > bufferSize) {
    throw new IllegalArgumentException("Buffer size too small. size = " + bufferSize
        + " needed = " + chunkLength);
  }
  int consumedLength = chunkLength + OutStream.HEADER_SIZE;
  long cbEndOffset = cbStartOffset + consumedLength;
  boolean isUncompressed = ((b0 & 0x01) == 1);
  if (isTracingEnabled) {
    LOG.trace("Found CB at " + cbStartOffset + ", chunk length " + chunkLength + ", total "
        + consumedLength + ", " + (isUncompressed ? "not " : "") + "compressed");
  }
  if (compressed.remaining() >= chunkLength) {
    // Simple case - CB fits entirely in the disk range.
    slice = compressed.slice();
    slice.limit(chunkLength);
    ProcCacheChunk cc = addOneCompressionBlockByteBuffer(slice, isUncompressed, cbStartOffset,
        cbEndOffset, chunkLength, current, toDecompress, cacheBuffers);
    if (compressed.remaining() <= 0 && dataReader.isTrackingDiskRanges()) {
      toRelease.add(compressed);
    }
    return cc;
  }
  if (current.getEnd() < cbEndOffset && !current.hasContiguousNext()) {
    badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, current, 0));
    return null; // This CB cannot be read from this chunk.
  }
  // TODO: we could remove extra copy for isUncompressed case by copying directly to cache.
  // We need to consolidate 2 or more buffers into one to decompress.
  ByteBuffer copy = allocateBuffer(chunkLength, compressed.isDirect());
  int remaining = chunkLength - compressed.remaining();
  int originalPos = compressed.position();
  copy.put(compressed);
  if (isTracingEnabled) {
    LOG.trace("Removing partial CB " + current + " from ranges after copying its contents");
  }
  DiskRangeList next = current.next;
  current.removeSelf();
  if (dataReader.isTrackingDiskRanges()) {
    if (originalPos == 0) {
      // We copied the entire buffer.
      dataReader.releaseBuffer(compressed);
    } else {
      // There might be slices depending on this buffer.
      toRelease.add(compressed);
    }
  }
  int extraChunkCount = 0;
  while (true) {
    if (!(next instanceof BufferChunk)) {
      throw new IOException("Trying to extend compressed block into uncompressed block " + next);
    }
    compressed = next.getData();
    ++extraChunkCount;
    if (compressed.remaining() >= remaining) {
      // This is the last range for this compression block. Yay!
      slice = compressed.slice();
      slice.limit(remaining);
      copy.put(slice);
      ProcCacheChunk cc = addOneCompressionBlockByteBuffer(copy, isUncompressed, cbStartOffset,
          cbEndOffset, remaining, (BufferChunk) next, toDecompress, cacheBuffers);
      if (compressed.remaining() <= 0 && dataReader.isTrackingDiskRanges()) {
        // We copied the entire buffer.
        dataReader.releaseBuffer(compressed);
      }
      return cc;
    }
    remaining -= compressed.remaining();
    copy.put(compressed);
    if (dataReader.isTrackingDiskRanges()) {
      // We copied the entire buffer.
      dataReader.releaseBuffer(compressed);
    }
    DiskRangeList tmp = next;
    next = next.hasContiguousNext() ? next.next : null;
    if (next != null) {
      if (isTracingEnabled) {
        LOG.trace("Removing partial CB " + tmp + " from ranges after copying its contents");
      }
      tmp.removeSelf();
    } else {
      badEstimates.add(addIncompleteCompressionBuffer(cbStartOffset, tmp, extraChunkCount));
      return null; // This CB cannot be completed from the available chunks.
    }
  }
}
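The three bytes read at the top of addOneCompressionBuffer are the ORC compression chunk header: a 3-byte little-endian value whose low bit marks an uncompressed ("original") chunk and whose remaining 23 bits are the chunk length, which is exactly what the (b2 << 15) | (b1 << 7) | (b0 >> 1) expression extracts. Below is a standalone decode of that header; the class and helper names are illustrative, not part of Hive or ORC.

import java.nio.ByteBuffer;

// Standalone sketch of the 3-byte ORC compression chunk header decode used above.
// The header is a little-endian 24-bit value: bit 0 is the "original" (uncompressed)
// flag, bits 1..23 are the chunk length. Names here are illustrative only.
public class OrcChunkHeader {
  static int chunkLength(int b0, int b1, int b2) {
    // Equivalent to ((b2 << 16) | (b1 << 8) | b0) >> 1.
    return (b2 << 15) | (b1 << 7) | (b0 >> 1);
  }

  static boolean isOriginal(int b0) {
    return (b0 & 0x01) == 1;
  }

  public static void main(String[] args) {
    // A 64 KiB compressed chunk: length 65536, original flag clear.
    // Header value = 65536 << 1 = 0x020000, stored little-endian as 00 00 02.
    ByteBuffer header = ByteBuffer.wrap(new byte[] {0x00, 0x00, 0x02});
    int b0 = header.get() & 0xff, b1 = header.get() & 0xff, b2 = header.get() & 0xff;
    System.out.println(chunkLength(b0, b1, b2));  // 65536
    System.out.println(isOriginal(b0));           // false
  }
}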
Use of org.apache.orc.impl.BufferChunk in project hive by apache.
The class EncodedReaderImpl, method prepareRangesForCompressedRead.
private CacheChunk prepareRangesForCompressedRead(long cOffset, long endCOffset, long streamOffset,
    long unlockUntilCOffset, DiskRangeList current, ColumnStreamData columnStreamData,
    List<ByteBuffer> toRelease, List<ProcCacheChunk> toDecompress,
    List<IncompleteCb> badEstimates) throws IOException {
  if (cOffset > current.getOffset()) {
    // Target compression block is in the middle of the range; slice the range in two.
    current = current.split(cOffset).next;
  }
  long currentOffset = cOffset;
  CacheChunk lastUncompressed = null;
  while (true) {
    DiskRangeList next = null;
    if (current instanceof CacheChunk) {
      // 2a. This is a decoded compression buffer, add as is.
      CacheChunk cc = (CacheChunk) current;
      if (isTracingEnabled) {
        LOG.trace("Locking " + cc.getBuffer() + " due to reuse");
      }
      cacheWrapper.reuseBuffer(cc.getBuffer());
      columnStreamData.getCacheBuffers().add(cc.getBuffer());
      currentOffset = cc.getEnd();
      if (isTracingEnabled) {
        LOG.trace("Adding an already-uncompressed buffer " + cc.getBuffer());
      }
      ponderReleaseInitialRefcount(unlockUntilCOffset, streamOffset, cc);
      lastUncompressed = cc;
      next = current.next;
      if (next != null && (endCOffset >= 0 && currentOffset < endCOffset)
          && next.getOffset() >= endCOffset) {
        throw new IOException("Expected data at " + currentOffset + " (reading until "
            + endCOffset + "), but the next buffer starts at " + next.getOffset());
      }
    } else if (current instanceof IncompleteCb) {
      // 2b. This is a known incomplete CB caused by ORC CB end boundaries being estimates.
      if (isTracingEnabled) {
        LOG.trace("Cannot read " + current);
      }
      next = null;
      currentOffset = -1;
    } else {
      // This is a compressed buffer; to uncompress it, we may need to combine
      // several disk ranges into one.
      if (!(current instanceof BufferChunk)) {
        String msg = "Found an unexpected " + current.getClass().getSimpleName() + ": "
            + current + " while looking at " + currentOffset;
        LOG.error(msg);
        throw new RuntimeException(msg);
      }
      BufferChunk bc = (BufferChunk) current;
      ProcCacheChunk newCached = addOneCompressionBuffer(bc, columnStreamData.getCacheBuffers(),
          toDecompress, toRelease, badEstimates);
      lastUncompressed = (newCached == null) ? lastUncompressed : newCached;
      next = (newCached != null) ? newCached.next : null;
      currentOffset = (next != null) ? next.getOffset() : -1;
    }
    if (next == null || (endCOffset >= 0 && currentOffset >= endCOffset)) {
      break;
    }
    current = next;
  }
  return lastUncompressed;
}
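prepareRangesForCompressedRead walks a linked list of disk ranges and dispatches on the node type: already-decoded cache chunks are reused, known-incomplete CBs stop the walk, and raw buffer chunks are handed to addOneCompressionBuffer. The shape of that walk, reduced to a toy linked list with made-up node types (nothing below is the Hive or ORC API), looks like this:

// Toy sketch of the range walk in prepareRangesForCompressedRead: iterate a linked
// list of ranges, dispatch on node type, stop at the end offset. All types here are
// simplified stand-ins, not the Hive/ORC classes.
public class RangeWalkSketch {
  static class Range {
    final long offset, end;
    Range next;
    Range(long offset, long end) { this.offset = offset; this.end = end; }
  }
  static class Cached extends Range { Cached(long o, long e) { super(o, e); } }
  static class Raw extends Range { Raw(long o, long e) { super(o, e); } }

  static void walk(Range current, long endOffset) {
    while (current != null && current.offset < endOffset) {
      if (current instanceof Cached) {
        System.out.println("reuse cached [" + current.offset + ", " + current.end + ")");
      } else if (current instanceof Raw) {
        System.out.println("decompress raw [" + current.offset + ", " + current.end + ")");
      } else {
        throw new IllegalStateException("Unexpected range type: " + current);
      }
      current = current.next;
    }
  }

  public static void main(String[] args) {
    Range a = new Cached(0, 100), b = new Raw(100, 250);
    a.next = b;
    walk(a, 250);  // reuses the cached range, then "decompresses" the raw one
  }
}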
Use of org.apache.orc.impl.BufferChunk in project hive by apache.
The class EncodedReaderImpl, method copyAndReplaceUncompressedChunks.
private static void copyAndReplaceUncompressedChunks(
    UncompressedCacheChunk candidateCached, ByteBuffer dest, CacheChunk tcc) {
  int startPos = dest.position(), startLim = dest.limit();
  DiskRangeList next = null;
  for (int i = 0; i < candidateCached.getCount(); ++i) {
    BufferChunk chunk = (i == 0) ? candidateCached.getChunk() : (BufferChunk) next;
    dest.put(chunk.getData());
    next = chunk.next;
    if (i == 0) {
      chunk.replaceSelfWith(tcc);
    } else {
      chunk.removeSelf();
    }
  }
  int newPos = dest.position();
  if (newPos > startLim) {
    throw new AssertionError("After copying, buffer [" + startPos + ", " + startLim
        + ") became [" + newPos + ", " + dest.limit() + ")");
  }
  dest.position(startPos);
  dest.limit(newPos);
}
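The last three lines above rely on a common java.nio pattern: after copying into dest, the buffer's limit is moved back to the write position and the position is reset to where the copy began, so dest then covers exactly the bytes just written. A minimal, self-contained illustration of that pattern follows; the class name and example data are mine, not from Hive.

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

// Sketch of the position/limit bookkeeping used after the copy loop above:
// copy several sources into dest, then shrink dest's window to the copied bytes.
public class CopyWindowSketch {
  public static void main(String[] args) {
    ByteBuffer dest = ByteBuffer.allocate(64);
    int startPos = dest.position();          // 0 here, but kept general

    dest.put("hello ".getBytes(StandardCharsets.UTF_8));
    dest.put("world".getBytes(StandardCharsets.UTF_8));

    int newPos = dest.position();            // end of the copied data
    dest.position(startPos);                 // rewind to where the copy began
    dest.limit(newPos);                      // cap the buffer at the copied length

    // dest now reads back exactly the 11 copied bytes.
    byte[] out = new byte[dest.remaining()];
    dest.get(out);
    System.out.println(new String(out, StandardCharsets.UTF_8));  // "hello world"
  }
}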
Use of org.apache.orc.impl.BufferChunk in project hive by apache.
The class EncodedReaderImpl, method preReadUncompressedStream.
/**
 * To achieve some sort of consistent cache boundaries, we will cache streams deterministically,
 * in segments starting with the stream start and going for either the stream size or some fixed
 * size. If we are not reading the entire segment's worth of data, then we will not cache the
 * partial RGs; the breakage of cache assumptions (no interleaving blocks, etc.) is way too much
 * PITA to handle just for this case.
 * We could avoid the copy in the non-zcr case and manage the buffer that was not allocated by
 * our allocator. The uncompressed case is not mainline though, so let's not complicate it.
 */
private DiskRangeList preReadUncompressedStream(long baseOffset, DiskRangeList start,
    long streamOffset, long streamEnd) throws IOException {
  if (streamOffset == streamEnd) return null;
  List<UncompressedCacheChunk> toCache = null;
  List<ByteBuffer> toRelease = null;
  // 1. Find our bearings in the stream.
  DiskRangeList current = findIntersectingPosition(start, streamOffset, streamEnd);
  if (isTracingEnabled) {
    LOG.trace("Starting pre-read for [" + streamOffset + "," + streamEnd + ") at " + current);
  }
  if (streamOffset > current.getOffset()) {
    // Target compression block is in the middle of the range; slice the range in two.
    current = current.split(streamOffset).next;
  }
  // Account for maximum cache buffer size.
  long streamLen = streamEnd - streamOffset;
  int partSize = determineUncompressedPartSize(),
      partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
  CacheChunk lastUncompressed = null;
  MemoryBuffer[] singleAlloc = new MemoryBuffer[1];
  for (int i = 0; i < partCount; ++i) {
    long partOffset = streamOffset + (i * partSize),
        partEnd = Math.min(partOffset + partSize, streamEnd);
    // We have 0 bytes of data for this part, for now.
    long hasEntirePartTo = partOffset;
    if (current == null) {
      // We have no data from this point on (could be unneeded), skip.
      break;
    }
    assert partOffset <= current.getOffset();
    if (partOffset == current.getOffset() && current instanceof CacheChunk) {
      // We assume cache chunks would always match the way we read, so check and skip it.
      assert current.getOffset() == partOffset && current.getEnd() == partEnd;
      lastUncompressed = (CacheChunk) current;
      current = current.next;
      continue;
    }
    if (current.getOffset() >= partEnd) {
      // We have no data at all for this part of the stream (could be unneeded), skip.
      continue;
    }
    if (toRelease == null && dataReader.isTrackingDiskRanges()) {
      toRelease = new ArrayList<ByteBuffer>();
    }
    // We have some disk buffers... see if we have the entire part, etc.
    // We will cache if we have the entire part.
    UncompressedCacheChunk candidateCached = null;
    DiskRangeList next = current;
    while (true) {
      boolean noMoreDataForPart = (next == null || next.getOffset() >= partEnd);
      if (noMoreDataForPart && hasEntirePartTo < partEnd && candidateCached != null) {
        // We are missing a section at the end of the part... copy the start to non-cached.
        lastUncompressed = copyAndReplaceCandidateToNonCached(
            candidateCached, partOffset, hasEntirePartTo, cacheWrapper, singleAlloc);
        candidateCached = null;
      }
      current = next;
      if (noMoreDataForPart) break; // Done with this part.
      boolean wasSplit = false;
      if (current.getEnd() > partEnd) {
        // If the current buffer contains multiple parts, split it.
        current = current.split(partEnd);
        wasSplit = true;
      }
      if (isTracingEnabled) {
        LOG.trace("Processing uncompressed file data at ["
            + current.getOffset() + ", " + current.getEnd() + ")");
      }
      BufferChunk curBc = (BufferChunk) current;
      if (!wasSplit && toRelease != null) {
        // TODO: is it valid to give zcr the modified 2nd part?
        toRelease.add(curBc.getChunk());
      }
      // Track if we still have the entire part.
      long hadEntirePartTo = hasEntirePartTo;
      // We have data until the end of the current block if we had it until the beginning.
      hasEntirePartTo = (hasEntirePartTo == current.getOffset()) ? current.getEnd() : -1;
      if (hasEntirePartTo == -1) {
        // We could try to cache the part even with gaps, but it's probably not needed.
        if (candidateCached != null) {
          assert hadEntirePartTo != -1;
          copyAndReplaceCandidateToNonCached(
              candidateCached, partOffset, hadEntirePartTo, cacheWrapper, singleAlloc);
          candidateCached = null;
        }
        lastUncompressed = copyAndReplaceUncompressedToNonCached(curBc, cacheWrapper, singleAlloc);
        // There may be more data after the gap.
        next = lastUncompressed.next;
      } else {
        // So far we have all the data from the beginning of the part.
        if (candidateCached == null) {
          candidateCached = new UncompressedCacheChunk(curBc);
        } else {
          candidateCached.addChunk(curBc);
        }
        next = current.next;
      }
    }
    if (candidateCached != null) {
      if (toCache == null) {
        toCache = new ArrayList<>(partCount - i);
      }
      toCache.add(candidateCached);
    }
  }
  // Nothing to copy and cache.
  if (toCache == null) return lastUncompressed;
  MemoryBuffer[] targetBuffers =
      toCache.size() == 1 ? singleAlloc : new MemoryBuffer[toCache.size()];
  targetBuffers[0] = null;
  DiskRange[] cacheKeys = new DiskRange[toCache.size()];
  int ix = 0;
  for (UncompressedCacheChunk chunk : toCache) {
    // Relies on the fact that the cache does not actually store these.
    cacheKeys[ix] = chunk;
    ++ix;
  }
  cacheWrapper.getAllocator().allocateMultiple(
      targetBuffers, (int) (partCount == 1 ? streamLen : partSize));
  // 4. Now copy the data into cache buffers.
  ix = 0;
  for (UncompressedCacheChunk candidateCached : toCache) {
    candidateCached.setBuffer(targetBuffers[ix]);
    ByteBuffer dest = candidateCached.getBuffer().getByteBufferRaw();
    copyAndReplaceUncompressedChunks(candidateCached, dest, candidateCached);
    candidateCached.clear();
    lastUncompressed = candidateCached;
    ++ix;
  }
  // 5. Release the original compressed buffers to the zero-copy reader if needed.
  if (toRelease != null) {
    assert dataReader.isTrackingDiskRanges();
    for (ByteBuffer buf : toRelease) {
      dataReader.releaseBuffer(buf);
    }
  }
  // 6. Finally, put the uncompressed data into the cache.
  if (fileKey != null) {
    long[] collisionMask = cacheWrapper.putFileData(fileKey, cacheKeys, targetBuffers, baseOffset);
    processCacheCollisions(collisionMask, toCache, targetBuffers, null);
  }
  return lastUncompressed;
}
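The javadoc above describes caching the stream in fixed-size parts starting at the stream start. The part boundaries follow the arithmetic at the top of the method: ceil-divide the stream length by the part size, then clamp the last part at the stream end. A standalone sketch of just that arithmetic (class and record names are mine, not Hive's):

// Standalone sketch of the part-boundary arithmetic used by preReadUncompressedStream:
// split [streamOffset, streamEnd) into fixed-size parts, clamping the last one.
public class StreamPartsSketch {
  record Part(long offset, long end) {}

  static Part[] parts(long streamOffset, long streamEnd, int partSize) {
    long streamLen = streamEnd - streamOffset;
    int partCount = (int) (streamLen / partSize) + (((streamLen % partSize) != 0) ? 1 : 0);
    Part[] result = new Part[partCount];
    for (int i = 0; i < partCount; ++i) {
      long partOffset = streamOffset + (i * (long) partSize);
      long partEnd = Math.min(partOffset + partSize, streamEnd);
      result[i] = new Part(partOffset, partEnd);
    }
    return result;
  }

  public static void main(String[] args) {
    // A 2500-byte stream at offset 1000 with 1024-byte parts yields three parts,
    // the last one truncated at the stream end: [1000,2024), [2024,3048), [3048,3500).
    for (Part p : parts(1000, 3500, 1024)) {
      System.out.println("[" + p.offset() + ", " + p.end() + ")");
    }
  }
}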