use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
the class EncodedReaderImpl method copyAndReplaceCandidateToNonCached.
/**
 * Converts the leading part of a cache candidate into a non-cached chunk.
 *
 * <p>Called when the part that was expected to be cacheable as a whole turns out to be
 * incomplete. A buffer of {@code candidateEnd - partOffset} bytes is requested through the
 * cache wrapper's allocator, wrapped into a new {@link CacheChunk} spanning
 * {@code [partOffset, candidateEnd)}, and handed to {@code copyAndReplaceUncompressedChunks}
 * together with {@code candidateCached}.
 *
 * @param candidateCached the chunk that was a candidate for caching
 * @param partOffset start offset of the part being converted
 * @param candidateEnd end offset of the candidate
 * @param cacheWrapper supplies the allocator and buffer factory; also notified via
 *     {@code reuseBuffer}
 * @param singleAlloc scratch array; slot 0 is cleared and then receives the allocated buffer
 * @return the newly created non-cached chunk
 */
private CacheChunk copyAndReplaceCandidateToNonCached(UncompressedCacheChunk candidateCached, long partOffset, long candidateEnd, DataCache cacheWrapper, MemoryBuffer[] singleAlloc) {
  // The whole part was expected to be cacheable but is not, so its start becomes
  // non-cached. This is the first gap, hence everything before it must be contiguous.
  singleAlloc[0] = null;
  trace.logPartialUncompressedData(partOffset, candidateEnd, true);

  final int partLength = (int) (candidateEnd - partOffset);
  cacheWrapper.getAllocator().allocateMultiple(singleAlloc, partLength, cacheWrapper.getDataBufferFactory());

  final MemoryBuffer allocated = singleAlloc[0];
  cacheWrapper.reuseBuffer(allocated);

  final ByteBuffer target = allocated.getByteBufferRaw();
  final CacheChunk nonCachedChunk = new CacheChunk(allocated, partOffset, candidateEnd);
  copyAndReplaceUncompressedChunks(candidateCached, target, nonCachedChunk, false);
  return nonCachedChunk;
}
use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
the class EncodedReaderImpl method readIndexStreams.
/**
 * Reads the index-area streams of one stripe and parses them into {@code index}.
 *
 * <p>Steps, as implemented below: plan the disk ranges with {@code planIndexReading}; create a
 * {@code ReadContext} for every column that is physically included or is a SARG column; attach
 * the matching index streams to those contexts; fetch the ranges via
 * {@code getDataFromCacheAndDisk}; run {@code preReadUncompressedStreams}; then, per stream,
 * read it with {@code readEncodedStream} and parse it as a protobuf message. ROW_INDEX streams
 * are stored into {@code index.getRowGroupIndex()[colIx]}; BLOOM_FILTER and BLOOM_FILTER_UTF8
 * streams into {@code index.getBloomFilterIndex()[colIx]}.
 *
 * @param index receives the parsed row indexes and bloom filter indexes
 * @param stripe the stripe whose index streams are read; its offset is the base for all ranges
 * @param streams the stripe's streams, iterated in order to derive each stream's offset
 * @param physicalFileIncludes per-column flags; an included column gets its ROW_INDEX read
 * @param sargColumns per-column SARG flags, or {@code null}; a flagged column gets all of its
 *     index-area streams read
 * @throws IOException if reading or parsing a stream fails (non-IO exceptions are wrapped), or
 *     if the final cleanup fails when no earlier error occurred
 */
@Override
public void readIndexStreams(OrcIndex index, StripeInformation stripe, List<OrcProto.Stream> streams, boolean[] physicalFileIncludes, boolean[] sargColumns) throws IOException {
long stripeOffset = stripe.getOffset();
// 1. Figure out what has to be read; a null plan means there is nothing to do.
DiskRangeList indexRanges = planIndexReading(fileSchema, streams, true, physicalFileIncludes, sargColumns, version, index.getBloomFilterKinds());
if (indexRanges == null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Nothing to read for stripe [" + stripe + "]");
}
return;
}
// One context per column that is either included or referenced by a SARG; other slots stay null.
ReadContext[] colCtxs = new ReadContext[physicalFileIncludes.length];
int colRgIx = -1;
for (int i = 0; i < physicalFileIncludes.length; ++i) {
if (!physicalFileIncludes[i] && (sargColumns == null || !sargColumns[i]))
continue;
colCtxs[i] = new ReadContext(i, ++colRgIx);
if (isTracingEnabled) {
LOG.trace("Creating context: " + colCtxs[i].toString());
}
// Bogus encoding: DIRECT is passed only as a placeholder for the trace record.
trace.logColumnRead(i, colRgIx, ColumnEncoding.Kind.DIRECT);
}
// Running stream offset relative to the stripe start; advanced for every stream, read or not.
long offset = 0;
for (OrcProto.Stream stream : streams) {
long length = stream.getLength();
int colIx = stream.getColumn();
OrcProto.Stream.Kind streamKind = stream.getKind();
// See planIndexReading - only read non-row-index streams if involved in SARGs.
if ((StreamName.getArea(streamKind) == StreamName.Area.INDEX) && ((sargColumns != null && sargColumns[colIx]) || (physicalFileIncludes[colIx] && streamKind == Kind.ROW_INDEX))) {
trace.logAddStream(colIx, streamKind, offset, length, -1, true);
colCtxs[colIx].addStream(offset, stream, -1);
if (isTracingEnabled) {
LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
}
}
offset += length;
}
boolean hasFileId = this.fileKey != null;
// 2. Now, read all of the ranges from cache or disk.
IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
MutateHelper toRead = getDataFromCacheAndDisk(indexRanges, stripeOffset, hasFileId, toRelease);
// 3. For uncompressed case, we need some special processing before read.
// NOTE(review): steps 2 and 3 run outside the try/finally below, so if step 3 throws, the
// cleanup in that finally block is not executed - confirm the callees clean up themselves.
DiskRangeList iter = preReadUncompressedStreams(stripeOffset, colCtxs, toRead, toRelease);
// 4. Decompress the data.
// hasError stays true until the whole loop completes; the finally block uses it to decide
// whether a cleanup failure should be thrown or only logged.
boolean hasError = true;
try {
for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
ReadContext ctx = colCtxs[colIx];
// This column is not included.
if (ctx == null)
continue;
for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
StreamContext sctx = ctx.streams[streamIx];
try {
if (isTracingEnabled) {
LOG.trace("Getting index stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length);
}
ColumnStreamData csd = POOLS.csdPool.take();
long endCOffset = sctx.offset + sctx.length;
// The whole stream is read at once: [sctx.offset, endCOffset), unlocking up to endCOffset.
DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, endCOffset, csd, endCOffset, sctx.offset, toRelease);
// Advance the shared list position so the next stream starts from the last range seen.
if (lastCached != null) {
iter = lastCached;
}
if (isTracingEnabled) {
traceLogBuffersUsedToParse(csd);
}
// Parse the protobuf message directly from the stream's buffers.
CodedInputStream cis = CodedInputStream.newInstance(new IndexStream(csd.getCacheBuffers(), sctx.length));
cis.setSizeLimit(InStream.PROTOBUF_MESSAGE_MAX_LIMIT);
switch(sctx.kind) {
case ROW_INDEX:
OrcProto.RowIndex tmp = index.getRowGroupIndex()[colIx] = OrcProto.RowIndex.parseFrom(cis);
if (isTracingEnabled) {
LOG.trace("Index is " + tmp.toString().replace('\n', ' '));
}
break;
case BLOOM_FILTER:
case BLOOM_FILTER_UTF8:
index.getBloomFilterIndex()[colIx] = OrcProto.BloomFilterIndex.parseFrom(cis);
break;
default:
throw new AssertionError("Unexpected index stream type " + sctx.kind);
}
// We are done with the buffers; unlike data blocks, we are also the consumer. Release.
// NOTE(review): this release is only reached on the success path; if parsing throws, these
// buffers are not released here - confirm another path covers them.
for (MemoryBuffer buf : csd.getCacheBuffers()) {
if (buf == null)
continue;
cacheWrapper.releaseBuffer(buf);
}
} catch (Exception ex) {
// Log with full context, then surface as IOException (wrapping anything that is not one).
DiskRangeList drl = toRead == null ? null : toRead.next;
LOG.error("Error getting stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
}
}
}
if (isTracingEnabled) {
LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
}
hasError = false;
} finally {
// Release the unreleased buffers. See class comment about refcounts.
// NOTE(review): toRead is dereferenced here without the null check used in the catch above.
try {
releaseInitialRefcounts(toRead.next);
releaseBuffers(toRelease.keySet(), true);
} catch (Throwable t) {
// A cleanup failure must not mask the original error: rethrow only if nothing failed before.
if (!hasError)
throw new IOException(t);
LOG.error("Error during the cleanup after another error; ignoring", t);
}
}
}
use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
the class EncodedReaderImpl method readEncodedColumns.
/**
 * Reads the data-area streams of the included columns of one stripe and passes them to
 * {@code consumer}, one {@code OrcEncodedColumnBatch} per row group (RG).
 *
 * <p>Steps, as implemented below: build a {@code ColumnReadContext} for each included column
 * (column 0 is skipped); walk the stream list to register data streams and collect the disk
 * ranges to read; fetch the ranges via {@code getDataFromCacheAndDisk}; run
 * {@code preReadUncompressedStreams}; then for every non-filtered RG read each stream with
 * {@code readEncodedStream}, attach it to the batch and call {@code consumer.consumeData}.
 * If there is nothing to read but an included column has a non-data stream and {@code rgs} is
 * {@code null}, a single batch with {@code OrcEncodedColumnBatch.ALL_RGS} is emitted instead.
 *
 * @param stripeIx index of the stripe, recorded in every emitted batch
 * @param stripe the stripe to read; supplies the base offset and the row count
 * @param indexes per-column row indexes, used to locate RG boundaries inside each stream
 * @param encodings per-column encodings
 * @param streamList the stripe's streams, iterated in order to derive each stream's offset
 * @param physicalFileIncludes per-column flags selecting the columns to read
 * @param rgs per-RG flags selecting the row groups to read, or {@code null} to read all of them
 * @param consumer receives each prepared batch
 * @throws IOException if reading fails (non-IO exceptions are wrapped), if the thread is
 *     interrupted while handing a batch to the consumer, or if the final cleanup fails when no
 *     earlier error occurred
 */
@Override
public void readEncodedColumns(int stripeIx, StripeInformation stripe, OrcProto.RowIndex[] indexes, List<OrcProto.ColumnEncoding> encodings, List<OrcProto.Stream> streamList, boolean[] physicalFileIncludes, boolean[] rgs, Consumer<OrcEncodedColumnBatch> consumer) throws IOException {
// Note: for now we don't have to setError here, caller will setError if we throw.
// We are also not supposed to call setDone, since we are only part of the operation.
long stripeOffset = stripe.getOffset();
// 1. Figure out what we have to read.
// Stream offset in relation to the stripe.
long offset = 0;
// 1.1. Figure out which columns have a present stream
boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
if (isTracingEnabled) {
LOG.trace("The following columns have PRESENT streams: " + arrayToString(hasNull));
}
// We assume stream list is sorted by column and that non-data
// streams do not interleave data streams for the same column.
// 1.2. With that in mind, determine disk ranges to read/get from cache (not by stream).
ColumnReadContext[] colCtxs = new ColumnReadContext[physicalFileIncludes.length];
int colRgIx = -1;
// Don't create context for the 0-s column.
for (int i = 1; i < physicalFileIncludes.length; ++i) {
if (!physicalFileIncludes[i])
continue;
ColumnEncoding enc = encodings.get(i);
colCtxs[i] = new ColumnReadContext(i, enc, indexes[i], ++colRgIx);
if (isTracingEnabled) {
LOG.trace("Creating context: " + colCtxs[i].toString());
}
trace.logColumnRead(i, colRgIx, enc.getKind());
}
CreateHelper listToRead = new CreateHelper();
boolean hasIndexOnlyCols = false;
for (OrcProto.Stream stream : streamList) {
long length = stream.getLength();
int colIx = stream.getColumn();
OrcProto.Stream.Kind streamKind = stream.getKind();
// Skip streams of excluded columns and all non-data streams, but keep the offset advancing.
if (!physicalFileIncludes[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) {
// We have a stream for included column, but in future it might have no data streams.
// It's more like "has at least one column included that has an index stream".
hasIndexOnlyCols = hasIndexOnlyCols || physicalFileIncludes[colIx];
if (isTracingEnabled) {
LOG.trace("Skipping stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
}
trace.logSkipStream(colIx, streamKind, offset, length);
offset += length;
continue;
}
ColumnReadContext ctx = colCtxs[colIx];
// NOTE(review): a data stream for column 0 would reach this point with a null context,
// because the loop above starts at 1 - presumably column 0 never has data streams; confirm.
assert ctx != null;
int indexIx = RecordReaderUtils.getIndexPosition(ctx.encoding.getKind(), types.get(colIx).getKind(), streamKind, isCompressed, hasNull[colIx]);
ctx.addStream(offset, stream, indexIx);
if (isTracingEnabled) {
LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length + ", index position " + indexIx);
}
// Without RG filtering, and for dictionary streams regardless, the whole stream is read;
// otherwise only the parts needed by the selected RGs are added.
if (rgs == null || RecordReaderUtils.isDictionary(streamKind, encodings.get(colIx))) {
trace.logAddStream(colIx, streamKind, offset, length, indexIx, true);
RecordReaderUtils.addEntireStreamToRanges(offset, length, listToRead, true);
if (isTracingEnabled) {
LOG.trace("Will read whole stream " + streamKind + "; added to " + listToRead.getTail());
}
} else {
trace.logAddStream(colIx, streamKind, offset, length, indexIx, false);
RecordReaderUtils.addRgFilteredStreamToRanges(stream, rgs, isCompressed, indexes[colIx], encodings.get(colIx), types.get(colIx), bufferSize, hasNull[colIx], offset, length, listToRead, true);
}
offset += length;
}
boolean hasFileId = this.fileKey != null;
// No ranges were collected: either emit a single empty all-RG batch, or just warn and return.
if (listToRead.get() == null) {
// TODO: there may be a bug here. Could there be partial RG filtering on index-only column?
if (hasIndexOnlyCols && (rgs == null)) {
OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, physicalFileIncludes.length);
try {
consumer.consumeData(ecb);
} catch (InterruptedException e) {
// NOTE(review): the interrupt status is not restored before wrapping - confirm intended.
LOG.error("IO thread interrupted while queueing data");
throw new IOException(e);
}
} else {
LOG.warn("Nothing to read for stripe [" + stripe + "]");
}
return;
}
// 2. Now, read all of the ranges from cache or disk.
IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
MutateHelper toRead = getDataFromCacheAndDisk(listToRead.get(), stripeOffset, hasFileId, toRelease);
// 3. For uncompressed case, we need some special processing before read.
// Basically, we are trying to create artificial, consistent ranges to cache, as there are
// no CBs in an uncompressed file. At the end of this processing, the list would contain
// either cache buffers, or buffers allocated by us and not cached (if we are only reading
// parts of the data for some ranges and don't want to cache it). Both are represented by
// CacheChunks, so the list is just CacheChunk-s from that point on.
DiskRangeList iter = preReadUncompressedStreams(stripeOffset, colCtxs, toRead, toRelease);
// 4. Finally, decompress data, map per RG, and return to caller.
// We go by RG and not by column because that is how data is processed.
// hasError stays true until every RG has been handed over; the outer finally uses it to
// decide whether a cleanup failure should be thrown or only logged.
boolean hasError = true;
try {
// Number of row groups in the stripe, rounding the last partial group up.
int rgCount = (int) Math.ceil((double) stripe.getNumberOfRows() / rowIndexStride);
for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
if (rgs != null && !rgs[rgIx]) {
// RG filtered.
continue;
}
boolean isLastRg = rgIx == rgCount - 1;
// Create the batch we will use to return data for this RG.
OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
trace.logStartRg(rgIx);
// Set to false only after all columns were added; otherwise the batch's refcounts are
// released in the finally block below.
boolean hasErrorForEcb = true;
try {
ecb.init(fileKey, stripeIx, rgIx, physicalFileIncludes.length);
for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
ColumnReadContext ctx = colCtxs[colIx];
// This column is not included.
if (ctx == null)
continue;
if (isTracingEnabled) {
LOG.trace("ctx: {} rgIx: {} isLastRg: {} rgCount: {}", ctx, rgIx, isLastRg, rgCount);
}
// Index entries for this RG and the next one; the next one bounds where this RG ends.
OrcProto.RowIndexEntry index = ctx.rowIndex.getEntry(rgIx), nextIndex = isLastRg ? null : ctx.rowIndex.getEntry(rgIx + 1);
ecb.initOrcColumn(ctx.colIx);
trace.logStartCol(ctx.colIx);
for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
StreamContext sctx = ctx.streams[streamIx];
ColumnStreamData cb = null;
try {
if (RecordReaderUtils.isDictionary(sctx.kind, ctx.encoding)) {
// This stream is for entire stripe and needed for every RG; uncompress once and reuse.
if (isTracingEnabled) {
LOG.trace("Getting stripe-level stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length);
}
trace.logStartStripeStream(sctx.kind);
if (sctx.stripeLevelStream == null) {
sctx.stripeLevelStream = POOLS.csdPool.take();
// We will be using this for each RG while also sending RGs to processing.
// To avoid buffers being unlocked, run refcount one ahead; so each RG
// processing will decref once, and the last one will unlock the buffers.
sctx.stripeLevelStream.incRef();
// For stripe-level streams we don't need the extra refcount on the block.
// See class comment about refcounts.
long unlockUntilCOffset = sctx.offset + sctx.length;
DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, sctx.offset + sctx.length, sctx.stripeLevelStream, unlockUntilCOffset, sctx.offset, toRelease);
if (lastCached != null) {
iter = lastCached;
}
}
// One reference per RG batch that receives this stripe-level stream.
sctx.stripeLevelStream.incRef();
cb = sctx.stripeLevelStream;
} else {
// This stream can be separated by RG using index. Let's do that.
// Offset to where this RG begins.
long cOffset = sctx.offset + index.getPositions(sctx.streamIndexOffset);
// Offset relative to the beginning of the stream of where this RG ends.
long nextCOffsetRel = isLastRg ? sctx.length : nextIndex.getPositions(sctx.streamIndexOffset);
// Offset before which this RG is guaranteed to end. Can only be estimated.
// We estimate the same way for compressed and uncompressed for now.
long endCOffset = sctx.offset + RecordReaderUtils.estimateRgEndOffset(isCompressed, isLastRg, nextCOffsetRel, sctx.length, bufferSize);
// As we read, we can unlock initial refcounts for the buffers that end before
// the data that we need for this RG.
long unlockUntilCOffset = sctx.offset + nextCOffsetRel;
cb = createRgColumnStreamData(rgIx, isLastRg, ctx.colIx, sctx, cOffset, endCOffset, isCompressed, unlockUntilCOffset);
// The first read of a stream starts from the shared position; later RGs resume from the
// position remembered on the stream context.
boolean isStartOfStream = sctx.bufferIter == null;
DiskRangeList lastCached = readEncodedStream(stripeOffset, (isStartOfStream ? iter : sctx.bufferIter), cOffset, endCOffset, cb, unlockUntilCOffset, sctx.offset, toRelease);
if (lastCached != null) {
sctx.bufferIter = iter = lastCached;
}
}
ecb.setStreamData(ctx.colIx, sctx.kind.getNumber(), cb);
} catch (Exception ex) {
// Log with full context, then surface as IOException (wrapping anything that is not one).
DiskRangeList drl = toRead == null ? null : toRead.next;
LOG.error("Error getting stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
}
}
}
hasErrorForEcb = false;
} finally {
if (hasErrorForEcb) {
releaseEcbRefCountsOnError(ecb);
}
}
try {
consumer.consumeData(ecb);
// After this, the non-initial refcounts are the responsibility of the consumer.
} catch (InterruptedException e) {
// The batch was not handed over, so its refcounts are released here.
// NOTE(review): the interrupt status is not restored before wrapping - confirm intended.
LOG.error("IO thread interrupted while queueing data");
releaseEcbRefCountsOnError(ecb);
throw new IOException(e);
}
}
if (isTracingEnabled) {
LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
}
trace.logRanges(fileKey, stripeOffset, toRead.next, RangesSrc.PREREAD);
hasError = false;
} finally {
try {
// Release the unreleased stripe-level buffers. See class comment about refcounts.
for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
ColumnReadContext ctx = colCtxs[colIx];
// This column is not included.
if (ctx == null)
continue;
for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
StreamContext sctx = ctx.streams[streamIx];
if (sctx == null || sctx.stripeLevelStream == null)
continue;
// Drop the "one ahead" reference; buffers are released only when the count reaches zero.
if (0 != sctx.stripeLevelStream.decRef())
continue;
// essentially the "consumer" refcount being released here.
for (MemoryBuffer buf : sctx.stripeLevelStream.getCacheBuffers()) {
if (LOG.isTraceEnabled()) {
LOG.trace("Unlocking {} at the end of processing", buf);
}
cacheWrapper.releaseBuffer(buf);
}
}
}
releaseInitialRefcounts(toRead.next);
// Release buffers as we are done with all the streams... also see toRelease comment.
releaseBuffers(toRelease.keySet(), true);
} catch (Throwable t) {
// A cleanup failure must not mask the original error: rethrow only if nothing failed before.
if (!hasError)
throw new IOException(t);
LOG.error("Error during the cleanup after another error; ignoring", t);
}
}
}
use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
the class EncodedReaderImpl method processCacheCollisions.
/**
 * Resolves chunks whose buffers collided with buffers that were cached concurrently.
 *
 * <p>{@code collisionMask} is read as a bitmap with 64 chunks per {@code long}: bit
 * {@code i & 63} of element {@code i >>> 6} set means that {@code targetBuffers[i]} holds a
 * pre-existing buffer rather than the one prepared for {@code toDecompress.get(i)}. For each
 * such chunk the collision is traced and delegated to {@code CacheChunk.handleCacheCollision}.
 *
 * @param collisionMask collision bitmap, or {@code null} when there were no collisions
 * @param toDecompress the chunks that were submitted, in the same order as {@code targetBuffers}
 * @param targetBuffers buffers after the cache put; collided slots hold the replacement buffer
 * @param cacheBuffers passed through to {@code handleCacheCollision}
 */
private void processCacheCollisions(long[] collisionMask, List<? extends CacheChunk> toDecompress, MemoryBuffer[] targetBuffers, List<MemoryBuffer> cacheBuffers) {
  if (collisionMask == null) {
    // Nothing collided.
    return;
  }
  assert collisionMask.length >= (toDecompress.size() >>> 6);
  // Some elements were cached in parallel by someone else; walk the bitmap and fix them up.
  long pendingBits = -1;
  for (int chunkIx = 0; chunkIx < toDecompress.size(); ++chunkIx, pendingBits >>= 1) {
    if ((chunkIx & 63) == 0) {
      // Crossed into the next 64-chunk window; load its mask word.
      pendingBits = collisionMask[chunkIx >>> 6];
    }
    if ((pendingBits & 1) != 1) {
      continue;
    }
    // The cache found an older buffer for this key and placed it into the array instead of ours.
    CacheChunk collidedChunk = toDecompress.get(chunkIx);
    MemoryBuffer winningBuffer = targetBuffers[chunkIx];
    if (isTracingEnabled) {
      LOG.trace("Discarding data due to cache collision: " + collidedChunk.getBuffer() + " replaced with " + winningBuffer);
    }
    trace.logCacheCollision(collidedChunk, winningBuffer);
    assert collidedChunk.getBuffer() != winningBuffer : chunkIx + " was not replaced in the results " + "even though mask is [" + Long.toBinaryString(pendingBits) + "]";
    collidedChunk.handleCacheCollision(cacheWrapper, winningBuffer, cacheBuffers);
  }
}
use of org.apache.hadoop.hive.common.io.encoded.MemoryBuffer in project hive by apache.
the class EncodedReaderImpl method traceLogBuffersUsedToParse.
/**
 * Writes one trace-level log line listing the buffers held by {@code csd}.
 *
 * <p>Each buffer is rendered as <code>{buffer, remaining}</code>, where {@code remaining} is
 * taken from a duplicate of the buffer's byte buffer. A {@code null} entry in the buffer list
 * is rendered as <code>{null}</code> instead of being dereferenced; other code in this class
 * that iterates the same list also guards against {@code null} entries, and diagnostics must
 * not fail the read.
 *
 * @param csd the stream data whose cache buffers are logged; its buffer list may be
 *     {@code null}, in which case only the prefix is logged
 */
private void traceLogBuffersUsedToParse(ColumnStreamData csd) {
  StringBuilder message = new StringBuilder("Buffers ");
  if (csd.getCacheBuffers() != null) {
    for (MemoryBuffer buf : csd.getCacheBuffers()) {
      if (buf == null) {
        // Tolerate holes in the list rather than throwing from a logging helper.
        message.append("{null}, ");
        continue;
      }
      ByteBuffer bb = buf.getByteBufferDup();
      message.append("{").append(buf).append(", ").append(bb.remaining()).append("}, ");
    }
  }
  LOG.trace(message.toString());
}
Aggregations