Use of org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData in project hive by apache.
From the class SerDeEncodedDataReader, the method processColumnCacheData:
private void processColumnCacheData(LlapDataBuffer[][][] cacheBuffers, OrcEncodedColumnBatch ecb, int colIx) {
// The column has been obtained from cache.
LlapDataBuffer[][] colData = cacheBuffers[colIx];
if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
LlapIoImpl.CACHE_LOGGER.trace("Processing cache data for column " + colIx + ": " + SerDeLowLevelCacheImpl.toString(colData));
}
for (int streamIx = 0; streamIx < colData.length; ++streamIx) {
if (colData[streamIx] == null)
continue;
ColumnStreamData cb = CSD_POOL.take();
cb.incRef();
cb.setCacheBuffers(Lists.<MemoryBuffer>newArrayList(colData[streamIx]));
ecb.setStreamData(colIx, streamIx, cb);
}
}
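The method above walks a three-level array: cacheBuffers[column][stream] holds the cache buffers for one stream of one column, and each non-null stream is wrapped in a pooled ColumnStreamData before being attached to the batch. Below is a minimal, self-contained sketch of that layout and wrapping pattern; Buffer and StreamData are hypothetical stand-in types, not Hive classes, and the pool is omitted.

// Stand-in sketch (not Hive code): illustrates the cacheBuffers[column][stream][buffer] layout.
import java.util.Arrays;
import java.util.List;

public class CachedColumnLayoutSketch {
  static class Buffer { final int id; Buffer(int id) { this.id = id; } }
  static class StreamData {
    int refCount;
    List<Buffer> cacheBuffers;
    void incRef() { ++refCount; }
  }

  public static void main(String[] args) {
    // Column 2 has three stream slots; slot 1 is missing from cache (null).
    Buffer[][][] cacheBuffers = new Buffer[3][][];
    cacheBuffers[2] = new Buffer[][] { { new Buffer(0), new Buffer(1) }, null, { new Buffer(2) } };

    int colIx = 2;
    Buffer[][] colData = cacheBuffers[colIx];
    for (int streamIx = 0; streamIx < colData.length; ++streamIx) {
      if (colData[streamIx] == null) continue; // this stream was not cached
      StreamData sd = new StreamData();        // in Hive this comes from a pool (CSD_POOL.take())
      sd.incRef();                             // hold one reference until the batch is consumed
      sd.cacheBuffers = Arrays.asList(colData[streamIx]);
      System.out.println("col " + colIx + " stream " + streamIx + " -> "
          + sd.cacheBuffers.size() + " buffer(s)");
    }
  }
}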
Use of org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData in project hive by apache.
From the class EncodedReaderImpl, the method readEncodedColumns:
@Override
public void readEncodedColumns(int stripeIx, StripeInformation stripe, OrcProto.RowIndex[] indexes, List<OrcProto.ColumnEncoding> encodings, List<OrcProto.Stream> streamList, boolean[] included, boolean[][] colRgs, Consumer<OrcEncodedColumnBatch> consumer) throws IOException {
// Note: for now we don't have to setError here, caller will setError if we throw.
// We are also not supposed to call setDone, since we are only part of the operation.
long stripeOffset = stripe.getOffset();
// 1. Figure out what we have to read.
// Stream offset in relation to the stripe.
long offset = 0;
// 1.1. Figure out which columns have a present stream
boolean[] hasNull = RecordReaderUtils.findPresentStreamsByColumn(streamList, types);
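// Note: despite its name, hasNull[colIx] simply records "column colIx has a PRESENT stream".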
if (isTracingEnabled) {
LOG.trace("The following columns have PRESENT streams: " + arrayToString(hasNull));
}
// We assume stream list is sorted by column and that non-data
// streams do not interleave data streams for the same column.
// 1.2. With that in mind, determine disk ranges to read/get from cache (not by stream).
ColumnReadContext[] colCtxs = new ColumnReadContext[included.length];
int colRgIx = -1;
// Don't create a context for the 0th (root struct) column.
for (int i = 1; i < included.length; ++i) {
if (!included[i])
continue;
colCtxs[i] = new ColumnReadContext(i, encodings.get(i), indexes[i], ++colRgIx);
if (isTracingEnabled) {
LOG.trace("Creating context: " + colCtxs[i].toString());
}
}
boolean isCompressed = (codec != null);
CreateHelper listToRead = new CreateHelper();
boolean hasIndexOnlyCols = false;
// Will always be the same for all cols at the moment.
boolean[] includedRgs = null;
for (OrcProto.Stream stream : streamList) {
long length = stream.getLength();
int colIx = stream.getColumn();
OrcProto.Stream.Kind streamKind = stream.getKind();
if (!included[colIx] || StreamName.getArea(streamKind) != StreamName.Area.DATA) {
// This is a non-data (index) stream for an included column; the column may have no data streams at all.
// So hasIndexOnlyCols really means "at least one included column has an index stream".
hasIndexOnlyCols = hasIndexOnlyCols || included[colIx];
if (isTracingEnabled) {
LOG.trace("Skipping stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length);
}
offset += length;
continue;
}
ColumnReadContext ctx = colCtxs[colIx];
assert ctx != null;
includedRgs = colRgs[ctx.includedIx];
int indexIx = RecordReaderUtils.getIndexPosition(ctx.encoding.getKind(), types.get(colIx).getKind(), streamKind, isCompressed, hasNull[colIx]);
ctx.addStream(offset, stream, indexIx);
if (isTracingEnabled) {
LOG.trace("Adding stream for column " + colIx + ": " + streamKind + " at " + offset + ", " + length + ", index position " + indexIx);
}
if (includedRgs == null || RecordReaderUtils.isDictionary(streamKind, encodings.get(colIx))) {
RecordReaderUtils.addEntireStreamToRanges(offset, length, listToRead, true);
if (isTracingEnabled) {
LOG.trace("Will read whole stream " + streamKind + "; added to " + listToRead.getTail());
}
} else {
RecordReaderUtils.addRgFilteredStreamToRanges(stream, includedRgs, codec != null, indexes[colIx], encodings.get(colIx), types.get(colIx), bufferSize, hasNull[colIx], offset, length, listToRead, true);
}
offset += length;
}
boolean hasFileId = this.fileKey != null;
if (listToRead.get() == null) {
// TODO: there may be a bug here. Could there be partial RG filtering on index-only column?
if (hasIndexOnlyCols && (includedRgs == null)) {
OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
ecb.init(fileKey, stripeIx, OrcEncodedColumnBatch.ALL_RGS, included.length);
consumer.consumeData(ecb);
} else {
LOG.warn("Nothing to read for stripe [" + stripe + "]");
}
return;
}
// 2. Now, read all of the ranges from cache or disk.
DiskRangeList.MutateHelper toRead = new DiskRangeList.MutateHelper(listToRead.get());
if (/*isTracingEnabled && */ LOG.isInfoEnabled()) {
LOG.info("Resulting disk ranges to read (file " + fileKey + "): " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
}
BooleanRef isAllInCache = new BooleanRef();
if (hasFileId) {
cacheWrapper.getFileData(fileKey, toRead.next, stripeOffset, CC_FACTORY, isAllInCache);
if (/*isTracingEnabled && */ LOG.isInfoEnabled()) {
LOG.info("Disk ranges after cache (found everything " + isAllInCache.value + "; file " + fileKey + ", base offset " + stripeOffset + "): " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
}
}
if (!isAllInCache.value) {
if (!isDataReaderOpen) {
this.dataReader.open();
isDataReaderOpen = true;
}
dataReader.readFileData(toRead.next, stripeOffset, cacheWrapper.getAllocator().isDirectAlloc());
}
// 3. For uncompressed case, we need some special processing before read.
// Keep "toRead" list for future use, don't extract().
DiskRangeList iter = toRead.next;
if (codec == null) {
for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
ColumnReadContext ctx = colCtxs[colIx];
// This column is not included.
if (ctx == null)
continue;
for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
StreamContext sctx = ctx.streams[streamIx];
DiskRangeList newIter = preReadUncompressedStream(stripeOffset, iter, sctx.offset, sctx.offset + sctx.length);
if (newIter != null) {
iter = newIter;
}
}
}
if (isTracingEnabled) {
LOG.trace("Disk ranges after pre-read (file " + fileKey + ", base offset " + stripeOffset + "): " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
}
// Reset the iter to start.
iter = toRead.next;
}
// 4. Finally, decompress data, map per RG, and return to caller.
// We go by RG and not by column because that is how data is processed.
int rgCount = (int) Math.ceil((double) stripe.getNumberOfRows() / rowIndexStride);
for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
boolean isLastRg = rgIx == rgCount - 1;
// Create the batch we will use to return data for this RG.
OrcEncodedColumnBatch ecb = POOLS.ecbPool.take();
ecb.init(fileKey, stripeIx, rgIx, included.length);
boolean isRGSelected = true;
for (int colIx = 0; colIx < colCtxs.length; ++colIx) {
ColumnReadContext ctx = colCtxs[colIx];
// This column is not included.
if (ctx == null)
continue;
if (isTracingEnabled) {
LOG.trace("ctx: {} rgIx: {} isLastRg: {} rgCount: {}", ctx, rgIx, isLastRg, rgCount);
}
// TODO: simplify this now that high-level cache has been removed. Same RGs for all cols.
if (colRgs[ctx.includedIx] != null && !colRgs[ctx.includedIx][rgIx]) {
// RG x col filtered.
isRGSelected = false;
if (isTracingEnabled) {
LOG.trace("colIxMod: {} rgIx: {} colRgs[{}]: {} colRgs[{}][{}]: {}", ctx.includedIx, rgIx, ctx.includedIx, Arrays.toString(colRgs[ctx.includedIx]), ctx.includedIx, rgIx, colRgs[ctx.includedIx][rgIx]);
}
continue;
}
OrcProto.RowIndexEntry index = ctx.rowIndex.getEntry(rgIx), nextIndex = isLastRg ? null : ctx.rowIndex.getEntry(rgIx + 1);
ecb.initOrcColumn(ctx.colIx);
for (int streamIx = 0; streamIx < ctx.streamCount; ++streamIx) {
StreamContext sctx = ctx.streams[streamIx];
ColumnStreamData cb = null;
try {
if (RecordReaderUtils.isDictionary(sctx.kind, ctx.encoding)) {
// This stream is for entire stripe and needed for every RG; uncompress once and reuse.
if (isTracingEnabled) {
LOG.trace("Getting stripe-level stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length);
}
if (sctx.stripeLevelStream == null) {
sctx.stripeLevelStream = POOLS.csdPool.take();
// We will be using this for each RG while also sending RGs to processing.
// To avoid buffers being unlocked, run refcount one ahead; we will not increase
// it when building the last RG, so each RG processing will decref once, and the
// last one will unlock the buffers.
sctx.stripeLevelStream.incRef();
// For stripe-level streams we don't need the extra refcount on the block.
// See class comment about refcounts.
long unlockUntilCOffset = sctx.offset + sctx.length;
DiskRangeList lastCached = readEncodedStream(stripeOffset, iter, sctx.offset, sctx.offset + sctx.length, sctx.stripeLevelStream, unlockUntilCOffset, sctx.offset);
if (lastCached != null) {
iter = lastCached;
}
}
if (!isLastRg) {
sctx.stripeLevelStream.incRef();
}
cb = sctx.stripeLevelStream;
} else {
// This stream can be separated by RG using index. Let's do that.
// Offset to where this RG begins.
long cOffset = sctx.offset + index.getPositions(sctx.streamIndexOffset);
// Offset relative to the beginning of the stream of where this RG ends.
long nextCOffsetRel = isLastRg ? sctx.length : nextIndex.getPositions(sctx.streamIndexOffset);
// Offset before which this RG is guaranteed to end. Can only be estimated.
// We estimate the same way for compressed and uncompressed for now.
long endCOffset = sctx.offset + RecordReaderUtils.estimateRgEndOffset(isCompressed, isLastRg, nextCOffsetRel, sctx.length, bufferSize);
// As we read, we can unlock initial refcounts for the buffers that end before
// the data that we need for this RG.
long unlockUntilCOffset = sctx.offset + nextCOffsetRel;
cb = createRgColumnStreamData(rgIx, isLastRg, ctx.colIx, sctx, cOffset, endCOffset, isCompressed);
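// bufferIter remembers where the previous RG's read of this stream stopped, so the next RG
// resumes from there instead of rescanning the disk range list from the start.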
boolean isStartOfStream = sctx.bufferIter == null;
DiskRangeList lastCached = readEncodedStream(stripeOffset, (isStartOfStream ? iter : sctx.bufferIter), cOffset, endCOffset, cb, unlockUntilCOffset, sctx.offset);
if (lastCached != null) {
sctx.bufferIter = iter = lastCached;
}
}
ecb.setStreamData(ctx.colIx, sctx.kind.getNumber(), cb);
} catch (Exception ex) {
DiskRangeList drl = toRead == null ? null : toRead.next;
LOG.error("Error getting stream [" + sctx.kind + ", " + ctx.encoding + "] for" + " column " + ctx.colIx + " RG " + rgIx + " at " + sctx.offset + ", " + sctx.length + "; toRead " + RecordReaderUtils.stringifyDiskRanges(drl), ex);
throw (ex instanceof IOException) ? (IOException) ex : new IOException(ex);
}
}
}
if (isRGSelected) {
consumer.consumeData(ecb);
}
}
if (isTracingEnabled) {
LOG.trace("Disk ranges after preparing all the data " + RecordReaderUtils.stringifyDiskRanges(toRead.next));
}
// Release the unreleased buffers. See class comment about refcounts.
releaseInitialRefcounts(toRead.next);
releaseCacheChunksIntoObjectPool(toRead.next);
}
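Step 4 above processes the stripe one row group (RG) at a time, deriving the RG count from the stripe's row count and the row index stride, and running the refcount of stripe-level (dictionary) streams one ahead so the last RG releases them. Below is a minimal sketch of just the RG-count arithmetic and the last-RG check, with made-up numbers; it is not Hive code.

// Sketch (hypothetical values): deriving the per-stripe row group count.
public class RowGroupCountSketch {
  public static void main(String[] args) {
    long numberOfRows = 25000;   // hypothetical stripe row count
    int rowIndexStride = 10000;  // ORC's default row index stride
    int rgCount = (int) Math.ceil((double) numberOfRows / rowIndexStride); // = 3
    for (int rgIx = 0; rgIx < rgCount; ++rgIx) {
      boolean isLastRg = rgIx == rgCount - 1;
      // No extra incRef is taken on stripe-level (dictionary) streams for the last RG,
      // so their refcount can drain once every RG has been processed.
      System.out.println("row group " + rgIx + (isLastRg ? " (last, partial)" : ""));
    }
  }
}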
Use of org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData in project hive by apache.
From the class EncodedReaderImpl, the method createRgColumnStreamData:
private ColumnStreamData createRgColumnStreamData(int rgIx, boolean isLastRg, int colIx, StreamContext sctx, long cOffset, long endCOffset, boolean isCompressed) {
ColumnStreamData cb = POOLS.csdPool.take();
cb.incRef();
if (isTracingEnabled) {
LOG.trace("Getting data for column " + colIx + " " + (isLastRg ? "last " : "") + "RG " + rgIx + " stream " + sctx.kind + " at " + sctx.offset + ", " + sctx.length + " index position " + sctx.streamIndexOffset + ": " + (isCompressed ? "" : "un") + "compressed [" + cOffset + ", " + endCOffset + ")");
}
return cb;
}
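createRgColumnStreamData only takes a holder from the pool and adds the producer's reference; readEncodedStream later fills it with buffers, and consumers drop their references once the RG has been processed. The sketch below illustrates that take/incRef/decRef lifecycle with a hypothetical pool and holder type, not Hive's actual pool or ColumnStreamData classes.

// Sketch (hypothetical types): the pooled take/incRef/decRef pattern.
import java.util.ArrayDeque;
import java.util.Deque;

public class PooledStreamDataSketch {
  static class StreamData {
    int refCount;
    void incRef() { ++refCount; }
    boolean decRef() { return --refCount == 0; } // true when the last consumer is done
  }

  static final Deque<StreamData> POOL = new ArrayDeque<>();

  static StreamData take() {
    StreamData sd = POOL.poll();
    return sd != null ? sd : new StreamData();
  }

  public static void main(String[] args) {
    StreamData cb = take();
    cb.incRef();               // producer's reference, as in createRgColumnStreamData
    cb.incRef();               // consumer's reference while processing the RG
    cb.decRef();               // consumer done
    if (cb.decRef()) {         // producer done; last reference gone, recycle the holder
      POOL.offer(cb);
    }
    System.out.println("pool size after recycle: " + POOL.size());
  }
}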
Use of org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData in project hive by apache.
From the class EncodedTreeReaderFactory, the method createEncodedTreeReader:
private static TreeReader createEncodedTreeReader(TypeDescription schema, List<OrcProto.ColumnEncoding> encodings, OrcEncodedColumnBatch batch, CompressionCodec codec, TreeReaderFactory.Context context) throws IOException {
int columnIndex = schema.getId();
ColumnStreamData[] streamBuffers = null;
List<ColumnVector> vectors = null;
if (batch.hasData(columnIndex)) {
streamBuffers = batch.getColumnData(columnIndex);
} else if (batch.hasVectors(columnIndex)) {
vectors = batch.getColumnVectors(columnIndex);
} else {
throw new AssertionError("Batch has no data for " + columnIndex + ": " + batch);
}
// EncodedColumnBatch is already decompressed, we don't really need to pass codec.
// But we need to know if the original data is compressed or not. This is used to skip
// positions in row index properly. If the file is originally compressed,
// then 1st position (compressed offset) in row index should be skipped to get
// uncompressed offset, else 1st position should not be skipped.
// TODO: there should be a better way to do this, code just needs to be modified
OrcProto.ColumnEncoding columnEncoding = encodings.get(columnIndex);
// stream buffers are arranged in enum order of stream kind
ColumnStreamData present = null, data = null, dictionary = null, lengths = null, secondary = null;
if (streamBuffers != null) {
present = streamBuffers[OrcProto.Stream.Kind.PRESENT_VALUE];
data = streamBuffers[OrcProto.Stream.Kind.DATA_VALUE];
dictionary = streamBuffers[OrcProto.Stream.Kind.DICTIONARY_DATA_VALUE];
lengths = streamBuffers[OrcProto.Stream.Kind.LENGTH_VALUE];
secondary = streamBuffers[OrcProto.Stream.Kind.SECONDARY_VALUE];
}
if (LOG.isDebugEnabled()) {
LOG.debug("columnIndex: {} columnType: {} streamBuffers.length: {} vectors: {} columnEncoding: {}" + " present: {} data: {} dictionary: {} lengths: {} secondary: {} tz: {}", columnIndex, schema, streamBuffers == null ? 0 : streamBuffers.length, vectors == null ? 0 : vectors.size(), columnEncoding, present != null, data, dictionary != null, lengths != null, secondary != null, context.getWriterTimezone());
}
// TODO: get rid of the builders - they serve no purpose... just call ctors directly.
switch(schema.getCategory()) {
case BINARY:
case BOOLEAN:
case BYTE:
case SHORT:
case INT:
case LONG:
case FLOAT:
case DOUBLE:
case CHAR:
case VARCHAR:
case STRING:
case DECIMAL:
case TIMESTAMP:
case DATE:
return getPrimitiveTreeReader(columnIndex, schema, codec, columnEncoding, present, data, dictionary, lengths, secondary, context, vectors);
case LIST:
// Not currently supported.
assert vectors == null;
TypeDescription elementType = schema.getChildren().get(0);
TreeReader elementReader = createEncodedTreeReader(elementType, encodings, batch, codec, context);
return ListStreamReader.builder().setColumnIndex(columnIndex).setColumnEncoding(columnEncoding).setCompressionCodec(codec).setPresentStream(present).setLengthStream(lengths).setElementReader(elementReader).setContext(context).build();
case MAP:
// Not currently supported.
assert vectors == null;
TypeDescription keyType = schema.getChildren().get(0);
TypeDescription valueType = schema.getChildren().get(1);
TreeReader keyReader = createEncodedTreeReader(keyType, encodings, batch, codec, context);
TreeReader valueReader = createEncodedTreeReader(valueType, encodings, batch, codec, context);
return MapStreamReader.builder().setColumnIndex(columnIndex).setColumnEncoding(columnEncoding).setCompressionCodec(codec).setPresentStream(present).setLengthStream(lengths).setKeyReader(keyReader).setValueReader(valueReader).setContext(context).build();
case STRUCT:
{
// Not currently supported.
assert vectors == null;
int childCount = schema.getChildren().size();
TreeReader[] childReaders = new TreeReader[childCount];
for (int i = 0; i < childCount; i++) {
TypeDescription childType = schema.getChildren().get(i);
childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
}
return StructStreamReader.builder().setColumnIndex(columnIndex).setCompressionCodec(codec).setColumnEncoding(columnEncoding).setPresentStream(present).setChildReaders(childReaders).setContext(context).build();
}
case UNION:
{
// Not currently supported.
assert vectors == null;
int childCount = schema.getChildren().size();
TreeReader[] childReaders = new TreeReader[childCount];
for (int i = 0; i < childCount; i++) {
TypeDescription childType = schema.getChildren().get(i);
childReaders[i] = createEncodedTreeReader(childType, encodings, batch, codec, context);
}
return UnionStreamReader.builder().setColumnIndex(columnIndex).setCompressionCodec(codec).setColumnEncoding(columnEncoding).setPresentStream(present).setDataStream(data).setChildReaders(childReaders).setContext(context).build();
}
default:
throw new UnsupportedOperationException("Data type not supported: " + schema);
}
}
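createEncodedTreeReader recurses into the schema: primitive categories get a leaf reader, while LIST/MAP/STRUCT/UNION types build child readers first and then wrap them. A minimal sketch of the same recursion over an ORC TypeDescription follows; it assumes orc-core is on the classpath and only prints column ids and categories instead of constructing readers.

// Sketch (assumes orc-core): a recursive schema walk mirroring createEncodedTreeReader.
import java.util.List;
import org.apache.orc.TypeDescription;

public class SchemaWalkSketch {
  static void walk(TypeDescription schema, int depth) {
    String indent = "  ".repeat(depth);
    System.out.println(indent + "column " + schema.getId() + ": " + schema.getCategory());
    List<TypeDescription> children = schema.getChildren();
    if (children == null) return; // primitive: a leaf (primitive) reader would be built here
    for (TypeDescription child : children) {
      walk(child, depth + 1);     // compound: createEncodedTreeReader recurses like this
    }
  }

  public static void main(String[] args) {
    walk(TypeDescription.fromString("struct<a:int,b:array<string>,c:map<string,double>>"), 0);
  }
}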
Use of org.apache.hadoop.hive.common.io.encoded.EncodedColumnBatch.ColumnStreamData in project hive by apache.
From the class SerDeEncodedDataReader, the method processOneSlice:
private boolean processOneSlice(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes, int stripeIx, StripeData cacheData, long startTime) throws IOException {
logProcessOneSlice(stripeIx, diskData, cacheData);
ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
LlapDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
long cacheRowCount = cacheData == null ? -1L : cacheData.getRowCount();
SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
StripeData sliceToCache = null;
boolean hasAllData = diskData == null;
if (!hasAllData) {
sliceToCache = createSliceToCache(diskData, cacheData);
metadata.setEncodings(combineCacheAndWriterEncodings(cacheEncodings, diskData.encodings));
metadata.setRowCount(diskData.rowCount);
} else {
metadata.setEncodings(Lists.newArrayList(cacheEncodings));
metadata.setRowCount(cacheRowCount);
}
if (LlapIoImpl.LOG.isTraceEnabled()) {
LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
}
consumer.setStripeMetadata(metadata);
OrcEncodedColumnBatch ecb = ECB_POOL.take();
ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
if (!writerIncludes[colIx])
continue;
ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
if (!hasAllData && splitIncludes[colIx]) {
// The column has been read from disk.
List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
LlapDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
// Struct column, such as root?
if (streams == null)
continue;
Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
while (iter.hasNext()) {
CacheWriter.CacheStreamData stream = iter.next();
if (stream.isSuppressed) {
if (LlapIoImpl.LOG.isTraceEnabled()) {
LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
}
iter.remove();
discardUncachedBuffers(stream.data);
continue;
}
int streamIx = setStreamDataToCache(newCacheDataForCol, stream);
ColumnStreamData cb = CSD_POOL.take();
cb.incRef();
cb.setCacheBuffers(stream.data);
ecb.setStreamData(colIx, streamIx, cb);
}
} else {
processColumnCacheData(cacheBuffers, ecb, colIx);
}
}
if (processStop()) {
recordReaderTime(startTime);
return false;
}
// For now, just rely on the cache put to lock the buffers before we send them over.
if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
LlapIoImpl.CACHE_LOGGER.trace("Data to cache from the read " + sliceToCache);
}
cacheFileData(sliceToCache);
return sendEcbToConsumer(ecb, cacheData != null, diskData);
}
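For each column the writer needs, processOneSlice picks a data source: if some data had to be read from disk and the column belongs to this split's disk read, the freshly produced streams are used (and queued for caching); otherwise the column is served from the cache via processColumnCacheData. A minimal sketch of that per-column branch, with made-up include flags, is shown below; it is not Hive code.

// Sketch (hypothetical flags): choosing between the disk path and the cache path per column.
public class SliceSourceSketch {
  public static void main(String[] args) {
    boolean[] writerIncludes = { true, true, true, false };
    boolean[] splitIncludes  = { true, false, true, false };
    boolean hasAllData = false; // diskData == null would mean everything came from cache

    for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
      if (!writerIncludes[colIx]) continue; // column not needed by the reader at all
      String source = (!hasAllData && splitIncludes[colIx])
          ? "disk (freshly read streams, also added to sliceToCache)"
          : "cache (processColumnCacheData)";
      System.out.println("column " + colIx + " <- " + source);
    }
  }
}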