Use of org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper in the Apache Hive project.
Class LowLevelCacheImpl, method getFileData.
/**
 * Looks up cached data for the given disk ranges of one file.
 *
 * <p>When the file has no usable cache entry (none registered under {@code fileKey}, or its
 * refcount could not be taken), the whole request is counted as a miss and {@code ranges} is
 * returned untouched. Otherwise every range is passed to {@code getOverlappingRanges} against
 * the file's cache map, hit/miss byte counts are recorded, and the first element of the
 * resulting list is returned.
 *
 * @param fileKey key identifying the file in the cache
 * @param ranges head of the range list to look up; {@code null} yields {@code null}
 * @param baseOffset offset forwarded to {@code getOverlappingRanges} for every range
 * @param factory range factory forwarded to {@code getOverlappingRanges}
 * @param qfCounters optional counters for hit/miss bytes; may be {@code null}
 * @param gotAllData optional out-flag; may be {@code null}. It is set to {@code false} on a
 *        whole-file miss and to {@code true} before the per-range lookup (the lookup helper
 *        receives the flag and presumably clears it on a partial miss - confirm there)
 * @return the first element of the resulting range list, or {@code null} if {@code ranges}
 *         was {@code null}
 * @throws RuntimeException if the flag still claims that all data was found while at least one
 *         resulting range carries no data
 */
@Override
public DiskRangeList getFileData(Object fileKey, DiskRangeList ranges, long baseOffset, DiskRangeListFactory factory, LowLevelCacheCounters qfCounters, BooleanRef gotAllData) {
  if (ranges == null) {
    return null;
  }
  DiskRangeList prev = ranges.prev;
  FileCache<ConcurrentSkipListMap<Long, LlapDataBuffer>> subCache = cache.get(fileKey);
  if (subCache == null || !subCache.incRef()) {
    long totalMissed = ranges.getTotalLength();
    metrics.incrCacheRequestedBytes(totalMissed);
    if (qfCounters != null) {
      qfCounters.recordCacheMiss(totalMissed);
    }
    // Nothing was served from cache, so the flag must be cleared no matter whether the list
    // has a predecessor node; otherwise a caller-supplied "true" would survive a full miss.
    if (gotAllData != null) {
      gotAllData.value = false;
    }
    return ranges;
  }
  try {
    if (prev == null) {
      // Anchor node so that the (possibly replaced) head can be found via prev.next later.
      prev = new MutateHelper(ranges);
    }
    if (gotAllData != null) {
      gotAllData.value = true;
    }
    DiskRangeList current = ranges;
    while (current != null) {
      metrics.incrCacheRequestedBytes(current.getLength());
      // We assume ranges in "ranges" are non-overlapping; thus, we will save next in advance.
      DiskRangeList next = current.next;
      getOverlappingRanges(baseOffset, current, subCache.getCache(), factory, gotAllData);
      current = next;
    }
  } finally {
    subCache.decRef();
  }

  // gotAllData is optional; treat a missing flag as "no claim was made".
  final boolean claimsAllData = gotAllData != null && gotAllData.value;
  boolean isInvalid = false;
  if (qfCounters != null || claimsAllData) {
    long bytesHit = 0;
    long bytesMissed = 0;
    for (DiskRangeList current = prev.next; current != null; current = current.next) {
      // This assumes no ranges passed to cache to fetch have data beforehand.
      if (current.hasData()) {
        bytesHit += current.getLength();
      } else {
        bytesMissed += current.getLength();
        // A range without data contradicts a "got everything" claim.
        isInvalid |= claimsAllData;
      }
    }
    if (qfCounters != null) {
      qfCounters.recordCacheHit(bytesHit);
      qfCounters.recordCacheMiss(bytesMissed);
    }
  }

  if (isInvalid) {
    StringBuilder invalidMsg = new StringBuilder("Internal error - gotAllData=true but the resulting ranges are ").append(RecordReaderUtils.stringifyDiskRanges(prev.next));
    // Re-acquire the file cache only to dump its current contents into the error message.
    subCache = cache.get(fileKey);
    if (subCache != null && subCache.incRef()) {
      try {
        invalidMsg.append("; cache ranges (not necessarily consistent) are ");
        for (Map.Entry<Long, LlapDataBuffer> e : subCache.getCache().entrySet()) {
          long start = e.getKey(), end = start + e.getValue().declaredCachedLength;
          invalidMsg.append("[").append(start).append(", ").append(end).append("), ");
        }
      } finally {
        subCache.decRef();
      }
    } else {
      invalidMsg.append("; cache ranges can no longer be determined");
    }
    String s = invalidMsg.toString();
    LlapIoImpl.LOG.error(s);
    throw new RuntimeException(s);
  }
  return prev.next;
}
Use of org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper in the Apache Hive project.
Class EncodedReaderImpl, method preReadDataRanges.
/**
 * Reads the given disk ranges through the cache-or-disk path, applies the uncompressed
 * pre-processing step, and then runs every resulting range through {@code readEncodedStream},
 * releasing the buffers reported by the shared {@code ColumnStreamData} after each call.
 *
 * @param ranges head of the list of disk ranges to read
 * @throws IOException declared by this method; propagated from the helper calls that throw it
 */
@Override
public void preReadDataRanges(DiskRangeList ranges) throws IOException {
  final long baseOffset = 0L;
  final boolean hasFileId = (this.fileKey != null);

  // 2. Now, read all of the ranges from cache or disk.
  final IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
  final MutateHelper toRead = getDataFromCacheAndDisk(ranges, 0, hasFileId, toRelease);

  // 3. For uncompressed case, we need some special processing before read.
  preReadUncompressedStreams(baseOffset, toRead, toRelease);

  // 4. Decompress the data.
  final ColumnStreamData csd = POOLS.csdPool.take();
  try {
    csd.incRef();
    for (DiskRangeList range = toRead.next; range != null; ) {
      DiskRangeList last = readEncodedStream(baseOffset, range, range.getOffset(), range.getEnd(), csd, range.getOffset(), range.getEnd(), toRelease);
      for (MemoryBuffer buf : csd.getCacheBuffers()) {
        cacheWrapper.releaseBuffer(buf);
      }
      // Continue after the range returned by the read; a null result ends the walk.
      range = (last == null) ? null : last.next;
    }
  } finally {
    if (toRead != null) {
      releaseInitialRefcounts(toRead.next);
    }
    if (toRelease != null) {
      releaseBuffers(toRelease.keySet(), true);
      toRelease.clear();
    }
    if (csd != null) {
      csd.decRef();
      POOLS.csdPool.offer(csd);
    }
  }
}
Use of org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper in the Apache Hive project.
Class OrcFileEstimateErrors, method getIncompleteCbs.
/**
 * Walks the range list and replaces every data-less range whose recorded length (looked up in
 * {@code cache} under the range offset plus {@code baseOffset}) is at least the range's own
 * length with an {@code IncompleteCb} spanning the same offsets.
 *
 * @param ranges head of the range list to inspect
 * @param baseOffset added to each range offset to form the lookup key
 * @param factory not used by this method
 * @param gotAllData out-flag; set to {@code true} up front and cleared as soon as one data-less
 *        range has no recorded length, or a recorded length shorter than the range
 * @return the first element of the list after any replacements
 */
public DiskRangeList getIncompleteCbs(DiskRangeList ranges, long baseOffset, DiskRangeListFactory factory, BooleanRef gotAllData) {
  // Anchor in front of the list so the head can still be found if it gets replaced.
  DiskRangeList anchor = ranges.prev;
  if (anchor == null) {
    anchor = new MutateHelper(ranges);
  }

  // Assume by default that we would find everything.
  gotAllData.value = true;

  // We assume ranges in "ranges" are non-overlapping; thus, the successor is captured before
  // the node may be replaced.
  DiskRangeList following;
  for (DiskRangeList node = ranges; node != null; node = following) {
    following = node.next;
    if (node.hasData()) {
      continue;
    }
    Integer recorded = cache.get(Long.valueOf(node.getOffset() + baseOffset));
    boolean coversRange = recorded != null && recorded >= node.getLength();
    if (!coversRange) {
      gotAllData.value = false;
      continue;
    }
    // We could just remove here and handle the missing tail during read, but that can be
    // dangerous; let's explicitly add an incomplete CB.
    node.replaceSelfWith(new IncompleteCb(node.getOffset(), node.getEnd()));
  }
  return anchor.next;
}
Use of org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper in the Apache Hive project.
Class EncodedReaderImpl, method readEncodedColumns.
/**
 * Reads the data streams of the included columns of one stripe and hands them to the consumer
 * as one {@link OrcEncodedColumnBatch} per row group.
 *
 * <p>Steps, as numbered in the body: (1) walk {@code streamList}, tracking each stream's offset
 * relative to the stripe, and collect the disk ranges to fetch; (2) fetch those ranges via
 * {@code getDataFromCacheAndDisk}; (3) run {@code preReadUncompressedStreams}; (4) for every
 * row group not filtered out by {@code rgs}, fill a batch and pass it to
 * {@code consumer.consumeData}.
 *
 * <p>If step 1 produces no ranges, the method returns early: it either sends a single batch
 * initialized with {@code OrcEncodedColumnBatch.ALL_RGS} or only logs a warning.
 *
 * @param stripeIx index of the stripe; passed to each batch's {@code init}
 * @param stripe stripe descriptor; its offset and number of rows are used
 * @param indexes row indexes, indexed by column
 * @param encodings column encodings, indexed by column
 * @param streamList streams of the stripe; offsets are derived by summing lengths in list order
 * @param physicalFileIncludes per-column inclusion flags; element 0 never gets a read context
 * @param rgs per-row-group flags; when {@code null}, whole streams are read and no row group
 *        is skipped
 * @param consumer receives every prepared batch
 * @throws IOException on failure; also wraps an {@link InterruptedException} raised by the
 *         consumer and any non-IO exception raised while reading a stream
 */
@Override
public void readEncodedColumns(int stripeIx, StripeInformation stripe, OrcProto.RowIndex[] indexes, List<OrcProto.ColumnEncoding> encodings, List<OrcProto.Stream> streamList, boolean[] physicalFileIncludes, boolean[] rgs, Consumer<OrcEncodedColumnBatch> consumer) throws IOException {
// Note: for now we don't have to setError here, caller will setError if we throw.
// We are also not supposed to call setDone, since we are only part of the operation.
long stripeOffset = stripe.getOffset();
// 1. Figure out what we have to read.
// Stream offset in relation to the stripe.
long offset = 0;
// 1.1. Figure out which columns have a present stream
boolean[] hasNull = findPresentStreamsByColumn(streamList, types);
if (isTracingEnabled) {
LOG.trace("The following columns have PRESENT streams: " + arrayToString(hasNull));
}
// We assume stream list is sorted by column and that non-data
// streams do not interleave data streams for the same column.
// 1.2. With that in mind, determine disk ranges to read/get from cache (not by stream).
ColumnReadContext[] colCtxs = new ColumnReadContext[physicalFileIncludes.length];
int colRgIx = -1;
// Don't create context for the 0-s column.
for (int i = 1; i < physicalFileIncludes.length; ++i) {
if (!physicalFileIncludes[i])
continue;
ColumnEncoding enc = encodings.get(i);
colCtxs[i] = new ColumnReadContext(i, enc, indexes[i], ++colRgIx);
if (isTracingEnabled) {
LOG.trace("Creating context: " + colCtxs[i].toString());
}
trace.logColumnRead(i, colRgIx, enc.getKind());
}
CreateHelper listToRead = new CreateHelper();
boolean hasIndexOnlyCols = false, hasAnyNonData = false;
for (OrcProto.Stream stream : streamList) {
long length = stream.getLength();
int colIx = stream.getColumn();
OrcProto.Stream.Kind streamKind = stream.getKind();
boolean isIndexCol = StreamName.getArea(streamKind) != StreamName.Area.DATA;
hasAnyNonData = hasAnyNonData || isIndexCol;
// We have a stream for included column, but in future it might have no data streams.
// It's more like "has at least one column included that has an index stream".
hasIndexOnlyCols = hasIndexOnlyCols || (isIndexCol && physicalFileIncludes[colIx]);
if (!physicalFileIncludes[colIx] || isIndexCol) {
if (isTracingEnabled) {
LOG.trace("Skipping stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
}
trace.logSkipStream(colIx, streamKind, offset, length);
// Skipped streams still advance the running offset within the stripe.
offset += length;
continue;
}
ColumnReadContext ctx = colCtxs[colIx];
assert ctx != null;
int indexIx = RecordReaderUtils.getIndexPosition(ctx.encoding.getKind(), fileSchema.findSubtype(colIx).getCategory(), streamKind, isCompressed, hasNull[colIx]);
ctx.addStream(offset, stream, indexIx);
if (isTracingEnabled) {
LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length + ", index position " + indexIx);
}
// Without RG filtering, or for dictionary streams, the whole stream is added to the ranges.
if (rgs == null || RecordReaderUtils.isDictionary(streamKind, encodings.get(colIx))) {
trace.logAddStream(colIx, streamKind, offset, length, indexIx, true);
addEntireStreamToRanges(offset, length, listToRead, true);
if (isTracingEnabled) {
LOG.trace("Will read whole stream " + streamKind + "; added to " + listToRead.getTail());
}
} else {
trace.logAddStream(colIx, streamKind, offset, length, indexIx, false);
addRgFilteredStreamToRanges(stream, rgs, isCompressed, indexes[colIx], encodings.get(colIx), fileSchema.findSubtype(colIx).getCategory(), bufferSize, hasNull[colIx], offset, length, listToRead, true);
}
offset += length;
}
boolean hasFileId = this.fileKey != null;
if (listToRead.get() == null) {
// No data to read for this stripe. Check if we have some included index-only columns.
// For example, count(1) would have the root column, that has no data stream, included.
// It may also happen that we have a column included with no streams whatsoever. That
// should only be possible if the file has no index streams.
boolean hasAnyIncludes = false;
if (!hasIndexOnlyCols) {
for (int i = 0; i < physicalFileIncludes.length; ++i) {
if (!physicalFileIncludes[i])
continue;
hasAnyIncludes = true;
break;
}
}
boolean nonProjectionRead = hasIndexOnlyCols || (!hasAnyNonData && hasAnyIncludes);
// We should probably just disable filtering for such cases if they exist.
// Note: rgs is compared to READ_ALL_RGS by reference, not by array content.
if (nonProjectionRead && (rgs == SargApplier.READ_ALL_RGS)) {
OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, physicalFileIncludes.length);
try {
consumer.consumeData(ecb);
} catch (InterruptedException e) {
// NOTE(review): the thread's interrupt status is not restored before wrapping the
// exception - confirm that no caller depends on the flag.
LOG.error("IO thread interrupted while queueing data");
throw new IOException(e);
}
} else {
LOG.warn("Nothing to read for stripe [" + stripe + "]");
}
return;
}
// 2. Now, read all of the ranges from cache or disk.
IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
MutateHelper toRead = getDataFromCacheAndDisk(listToRead.get(), stripeOffset, hasFileId, toRelease);
// 3. For uncompressed case, we need some special processing before read.
// Basically, we are trying to create artificial, consistent ranges to cache, as there are
// no CBs in an uncompressed file. At the end of this processing, the list would contain
// either cache buffers, or buffers allocated by us and not cached (if we are only reading
// parts of the data for some ranges and don't want to cache it). Both are represented by
// CacheChunks, so the list is just CacheChunk-s from that point on.
DiskRangeList iter = preReadUncompressedStreams(stripeOffset, colCtxs, toRead, toRelease);
// 4. Finally, decompress data, map per RG, and return to caller.
// We go by RG and not by column because that is how data is processed.
// Stays true unless the try block runs to completion; the cleanup below uses it to decide
// whether a cleanup failure is rethrown or only logged.
boolean hasError = true;
try {
// A zero stride yields a single row group covering the stripe.
int rgCount = rowIndexStride == 0 ? 1 : (int) Math.ceil((double) stripe.getNumberOfRows() / rowIndexStride);
for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
if (rgs != null && !rgs[rgIx]) {
// RG filtered.
continue;
}
boolean isLastRg = rgIx == rgCount - 1;
// Create the batch we will use to return data for this RG.
OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
trace.logStartRg(rgIx);
// Cleared only after all columns of this RG were set up; otherwise the batch's refcounts
// are released in the finally block below.
boolean hasErrorForEcb = true;
try {
ecb.init(fileKey, stripeIx, rgIx, physicalFileIncludes.length);
for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
ColumnReadContext ctx = colCtxs[colIx];
// This column is not included
if (ctx == null)
continue;
OrcProto.RowIndexEntry index;
OrcProto.RowIndexEntry nextIndex;
// index is disabled
if (ctx.rowIndex == null) {
if (isTracingEnabled) {
LOG.trace("Row index is null. Likely reading a file with indexes disabled.");
}
index = null;
nextIndex = null;
} else {
index = ctx.rowIndex.getEntry(rgIx);
nextIndex = isLastRg ? null : ctx.rowIndex.getEntry(rgIx + 1);
}
if (isTracingEnabled) {
LOG.trace("ctx: {} rgIx: {} isLastRg: {} rgCount: {}", ctx, rgIx, isLastRg, rgCount);
}
ecb.initOrcColumn(ctx.colIx);
trace.logStartCol(ctx.colIx);
for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
StreamContext sctx = ctx.streams[streamIx];
ColumnStreamData cb = null;
try {
if (RecordReaderUtils.isDictionary(sctx.kind, ctx.encoding) || index == null) {
// This stream is for entire stripe and needed for every RG; uncompress once and reuse.
if (sctx.stripeLevelStream == null) {
if (isTracingEnabled) {
LOG.trace("Getting stripe-level stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length);
}
trace.logStartStripeStream(sctx.kind);
sctx.stripeLevelStream = POOLS.csdPool.take();
// We will be using this for each RG while also sending RGs to processing.
// To avoid buffers being unlocked, run refcount one ahead; so each RG
// processing will decref once, and the last one will unlock the buffers.
sctx.stripeLevelStream.incRef();
// For stripe-level streams we don't need the extra refcount on the block.
// See class comment about refcounts.
long unlockUntilCOffset = sctx.offset + sctx.length;
DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, sctx.offset + sctx.length, sctx.stripeLevelStream, unlockUntilCOffset, sctx.offset, toRelease);
if (lastCached != null) {
iter = lastCached;
}
}
sctx.stripeLevelStream.incRef();
cb = sctx.stripeLevelStream;
} else {
// This stream can be separated by RG using index. Let's do that.
// Offset to where this RG begins.
long cOffset = sctx.offset + index.getPositions(sctx.streamIndexOffset);
// Offset relative to the beginning of the stream of where this RG ends.
long nextCOffsetRel = isLastRg ? sctx.length : nextIndex.getPositions(sctx.streamIndexOffset);
// Offset before which this RG is guaranteed to end. Can only be estimated.
// We estimate the same way for compressed and uncompressed for now.
long endCOffset = sctx.offset + estimateRgEndOffset(isCompressed, isLastRg, nextCOffsetRel, sctx.length, bufferSize);
// As we read, we can unlock initial refcounts for the buffers that end before
// the data that we need for this RG.
long unlockUntilCOffset = sctx.offset + nextCOffsetRel;
cb = createRgColumnStreamData(rgIx, isLastRg, ctx.colIx, sctx, cOffset, endCOffset, isCompressed, unlockUntilCOffset);
boolean isStartOfStream = sctx.bufferIter == null;
DiskRangeList lastCached = readEncodedStream(stripeOffset, (isStartOfStream ? iter : sctx.bufferIter), cOffset, endCOffset, cb, unlockUntilCOffset, sctx.offset, toRelease);
if (lastCached != null) {
sctx.bufferIter = iter = lastCached;
}
}
} catch (Exception ex) {
DiskRangeList drl = toRead == null ? null : toRead.next;
LOG.error("Error getting stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
// IOExceptions pass through unchanged; anything else is wrapped.
throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
} finally {
// Otherwise, we won't release consumer refcounts for a partially read stream.
if (cb != null) {
ecb.setStreamData(ctx.colIx, sctx.kind.getNumber(), cb);
}
}
}
}
hasErrorForEcb = false;
} finally {
if (hasErrorForEcb) {
releaseEcbRefCountsOnError(ecb);
}
}
try {
consumer.consumeData(ecb);
// After this, the non-initial refcounts are the responsibility of the consumer.
} catch (InterruptedException e) {
// NOTE(review): as above, the interrupt status is not restored here - confirm intent.
LOG.error("IO thread interrupted while queueing data");
releaseEcbRefCountsOnError(ecb);
throw new IOException(e);
}
}
if (isTracingEnabled) {
LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
}
trace.logRanges(fileKey, stripeOffset, toRead.next, RangesSrc.PREREAD);
hasError = false;
} finally {
try {
// Release the unreleased stripe-level buffers. See class comment about refcounts.
for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
ColumnReadContext ctx = colCtxs[colIx];
// This column is not included.
if (ctx == null)
continue;
for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
StreamContext sctx = ctx.streams[streamIx];
if (sctx == null || sctx.stripeLevelStream == null)
continue;
// Buffers are only released once the refcount drops to zero.
if (0 != sctx.stripeLevelStream.decRef())
continue;
// essentially the "consumer" refcount being released here.
for (MemoryBuffer buf : sctx.stripeLevelStream.getCacheBuffers()) {
LOG.trace("Unlocking {} at the end of processing", buf);
cacheWrapper.releaseBuffer(buf);
}
}
}
releaseInitialRefcounts(toRead.next);
// Release buffers as we are done with all the streams... also see toRelease comment.
releaseBuffers(toRelease.keySet(), true);
} catch (Throwable t) {
// A cleanup failure is rethrown only when the main path succeeded; after an earlier error
// it is logged so that it does not replace the exception already propagating.
if (!hasError)
throw new IOException(t);
LOG.error("Error during the cleanup after another error; ignoring", t);
}
}
}
Use of org.apache.hadoop.hive.common.io.DiskRangeList.MutateHelper in the Apache Hive project.
Class EncodedReaderImpl, method readIndexStreams.
/**
 * Reads the index streams of one stripe and stores the parsed results in {@code index}:
 * {@code ROW_INDEX} streams go into the row-group index array, {@code BLOOM_FILTER} and
 * {@code BLOOM_FILTER_UTF8} streams into the bloom-filter index array, each at the column's
 * position.
 *
 * @param index destination for the parsed indexes; also supplies the bloom filter kinds used
 *        when planning the read
 * @param stripe stripe descriptor; its offset is used as the base for all reads
 * @param streams streams of the stripe; offsets are derived by summing lengths in list order
 * @param physicalFileIncludes per-column inclusion flags
 * @param sargColumns per-column flags for columns used in SARGs; may be {@code null}
 * @throws IOException on failure; non-IO exceptions raised while reading a stream are wrapped
 */
@Override
public void readIndexStreams(OrcIndex index, StripeInformation stripe, List<OrcProto.Stream> streams, boolean[] physicalFileIncludes, boolean[] sargColumns) throws IOException {
  final long stripeStart = stripe.getOffset();
  final DiskRangeList plannedRanges = planIndexReading(fileSchema, streams, true, physicalFileIncludes, sargColumns, version, index.getBloomFilterKinds());
  if (plannedRanges == null) {
    LOG.debug("Nothing to read for stripe [{}]", stripe);
    return;
  }

  // One context per column that is either included or referenced by a SARG.
  final int columnCount = physicalFileIncludes.length;
  final ReadContext[] contexts = new ReadContext[columnCount];
  int lastAssignedIx = -1;
  for (int col = 0; col < columnCount; ++col) {
    boolean needed = physicalFileIncludes[col] || (sargColumns != null && sargColumns[col]);
    if (!needed) {
      continue;
    }
    ReadContext created = new ReadContext(col, ++lastAssignedIx);
    contexts[col] = created;
    if (isTracingEnabled) {
      LOG.trace("Creating context: " + created.toString());
    }
    // Bogus encoding.
    trace.logColumnRead(col, lastAssignedIx, ColumnEncoding.Kind.DIRECT);
  }

  // Register the index streams to parse, tracking each stream's offset within the stripe.
  long streamStart = 0L;
  for (OrcProto.Stream stream : streams) {
    final long streamLength = stream.getLength();
    final int col = stream.getColumn();
    final OrcProto.Stream.Kind kind = stream.getKind();
    if (StreamName.getArea(kind) == StreamName.Area.INDEX) {
      // See planIndexReading - only read non-row-index streams if involved in SARGs.
      boolean usedBySarg = sargColumns != null && sargColumns[col];
      if (usedBySarg || (physicalFileIncludes[col] && kind == Kind.ROW_INDEX)) {
        trace.logAddStream(col, kind, streamStart, streamLength, -1, true);
        contexts[col].addStream(streamStart, stream, -1);
        if (isTracingEnabled) {
          LOG.trace("Adding stream for column " + col + ": " + kind + " at " + streamStart + ", " + streamLength);
        }
      }
    }
    streamStart += streamLength;
  }

  final boolean hasFileId = (this.fileKey != null);
  // 2. Now, read all of the ranges from cache or disk.
  final IdentityHashMap<ByteBuffer, Boolean> toRelease = new IdentityHashMap<>();
  final MutateHelper toRead = getDataFromCacheAndDisk(plannedRanges, stripeStart, hasFileId, toRelease);
  // 3. For uncompressed case, we need some special processing before read.
  DiskRangeList cursor = preReadUncompressedStreams(stripeStart, contexts, toRead, toRelease);

  // 4. Decompress the data.
  boolean failed = true;
  try {
    for (int colIx = 0; colIx < contexts.length; ++colIx) {
      final ReadContext ctx = contexts[colIx];
      if (ctx == null) {
        // This column is not included.
        continue;
      }
      for (int s = 0; s < ctx.streamCount; ++s) {
        final StreamContext sctx = ctx.streams[s];
        try {
          if (isTracingEnabled) {
            LOG.trace("Getting index stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length);
          }
          final ColumnStreamData csd = POOLS.csdPool.take();
          final long streamEnd = sctx.offset + sctx.length;
          final DiskRangeList lastCached = readEncodedStream(stripeStart, cursor, sctx.offset, streamEnd, csd, streamEnd, sctx.offset, toRelease);
          if (lastCached != null) {
            cursor = lastCached;
          }
          if (isTracingEnabled) {
            traceLogBuffersUsedToParse(csd);
          }
          final CodedInputStream cis = CodedInputStream.newInstance(new IndexStream(csd.getCacheBuffers(), sctx.length));
          cis.setSizeLimit(InStream.PROTOBUF_MESSAGE_MAX_LIMIT);
          switch (sctx.kind) {
            case ROW_INDEX: {
              OrcProto.RowIndex parsed = index.getRowGroupIndex()[colIx] = OrcProto.RowIndex.parseFrom(cis);
              if (isTracingEnabled) {
                LOG.trace("Index is " + parsed.toString().replace('\n', ' '));
              }
              break;
            }
            case BLOOM_FILTER:
            case BLOOM_FILTER_UTF8: {
              index.getBloomFilterIndex()[colIx] = OrcProto.BloomFilterIndex.parseFrom(cis);
              break;
            }
            default:
              throw new AssertionError("Unexpected index stream type " + sctx.kind);
          }
          // We are done with the buffers; unlike data blocks, we are also the consumer. Release.
          for (MemoryBuffer buf : csd.getCacheBuffers()) {
            if (buf != null) {
              cacheWrapper.releaseBuffer(buf);
            }
          }
        } catch (Exception ex) {
          DiskRangeList remaining = (toRead == null) ? null : toRead.next;
          LOG.error("Error getting stream " + sctx.kind + " for column " + ctx.colIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(remaining), ex);
          if (ex instanceof IOException) {
            throw (IOException) ex;
          }
          throw new IOException(ex);
        }
      }
    }
    if (isTracingEnabled) {
      LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
    }
    failed = false;
  } finally {
    // Release the unreleased buffers. See class comment about refcounts.
    try {
      if (toRead != null) {
        releaseInitialRefcounts(toRead.next);
      }
      releaseBuffers(toRelease.keySet(), true);
    } catch (Throwable t) {
      // Rethrow a cleanup failure only when the main path succeeded; otherwise just log it.
      if (!failed) {
        throw new IOException(t);
      }
      LOG.error("Error during the cleanup after another error; ignoring", t);
    }
  }
}
Aggregations