Use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.
Class SerDeEncodedDataReader, method combineCacheAndWriterEncodings.
private static List<ColumnEncoding> combineCacheAndWriterEncodings(
    ColumnEncoding[] cacheEncodings, List<ColumnEncoding> writerEncodings) throws IOException {
  // TODO: refactor with cache impl? it has the same merge logic
  if (cacheEncodings == null) {
    return new ArrayList<>(writerEncodings);
  }
  if (cacheEncodings.length != writerEncodings.size()) {
    throw new IOException("Incompatible encoding lengths: "
        + Arrays.toString(cacheEncodings) + " vs " + writerEncodings);
  }
  ColumnEncoding[] combinedEncodings = Arrays.copyOf(cacheEncodings, cacheEncodings.length);
  for (int colIx = 0; colIx < cacheEncodings.length; ++colIx) {
    ColumnEncoding newEncoding = writerEncodings.get(colIx);
    if (newEncoding == null) continue;
    if (combinedEncodings[colIx] != null && !newEncoding.equals(combinedEncodings[colIx])) {
      throw new IOException("Incompatible encodings at " + colIx + ": "
          + Arrays.toString(cacheEncodings) + " vs " + writerEncodings);
    }
    combinedEncodings[colIx] = newEncoding;
  }
  return Lists.newArrayList(combinedEncodings);
}
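As a rough illustration of the merge rule above (null cells are filled in from the writer, conflicting non-null cells are rejected), here is a minimal, self-contained sketch. It uses plain String values as stand-ins for ColumnEncoding, and the class and method names are illustrative only, not part of the Hive code.

  import java.io.IOException;
  import java.util.Arrays;
  import java.util.List;

  public class EncodingMergeSketch {
    // Same per-column rule as combineCacheAndWriterEncodings, with String as a stand-in.
    static String[] merge(String[] fromCache, List<String> fromWriter) throws IOException {
      if (fromCache == null) return fromWriter.toArray(new String[0]);
      if (fromCache.length != fromWriter.size()) {
        throw new IOException("Incompatible encoding lengths");
      }
      String[] combined = Arrays.copyOf(fromCache, fromCache.length);
      for (int colIx = 0; colIx < combined.length; ++colIx) {
        String newEncoding = fromWriter.get(colIx);
        if (newEncoding == null) continue; // writer has nothing for this column
        if (combined[colIx] != null && !newEncoding.equals(combined[colIx])) {
          throw new IOException("Incompatible encodings at " + colIx); // conflicting non-null cells
        }
        combined[colIx] = newEncoding; // fill a gap or confirm the existing value
      }
      return combined;
    }

    public static void main(String[] args) throws IOException {
      String[] cache = { "DIRECT", null, "DICTIONARY_V2" };
      List<String> writer = Arrays.asList(null, "DIRECT_V2", "DICTIONARY_V2");
      System.out.println(Arrays.toString(merge(cache, writer)));
      // Prints: [DIRECT, DIRECT_V2, DICTIONARY_V2]
    }
  }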
Use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.
Class SerDeEncodedDataReader, method processOneSlice.
/** Unlike the other overload of processOneSlice, doesn't cache data. */
private boolean processOneSlice(Vectors diskData, boolean[] splitIncludes, int stripeIx,
    StripeData cacheData, long startTime) throws IOException {
  if (diskData == null) {
    // The other overload should have been used.
    throw new AssertionError();
  }
  // LlapIoImpl.LOG.debug("diskData " + diskData);
  logProcessOneSlice(stripeIx, diskData, cacheData);
  if (cacheData == null && diskData.getRowCount() == 0) {
    // Nothing to process.
    return true;
  }
  ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
  LlapDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
  if (cacheData != null) {
    // Don't validate column count - no encodings for vectors.
    validateCacheAndDisk(cacheData, diskData.getRowCount(), -1, diskData);
  }
  SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
  metadata.setEncodings(Arrays.asList(cacheEncodings == null
      ? new ColumnEncoding[splitIncludes.length] : cacheEncodings));
  metadata.setRowCount(diskData.getRowCount());
  if (LlapIoImpl.LOG.isTraceEnabled()) {
    LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
  }
  consumer.setStripeMetadata(metadata);
  OrcEncodedColumnBatch ecb = ECB_POOL.take();
  ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
  int vectorsIx = 0;
  for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
    if (!writerIncludes[colIx]) continue;
    if (splitIncludes[colIx]) {
      // Skip the 0-th column, since it won't have a vector after reading the text source.
      if (colIx != 0) {
        List<ColumnVector> vectors = diskData.getVectors(vectorsIx++);
        if (LlapIoImpl.LOG.isTraceEnabled()) {
          LlapIoImpl.LOG.trace("Processing vectors for column " + colIx + ": " + vectors);
        }
        ecb.initColumnWithVectors(colIx, vectors);
      } else {
        ecb.initColumn(0, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
      }
    } else {
      ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
      processColumnCacheData(cacheBuffers, ecb, colIx);
    }
  }
  if (processStop()) {
    recordReaderTime(startTime);
    return false;
  }
  return sendEcbToConsumer(ecb, cacheData != null, null);
}
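The column loop above is driven by two include masks: writerIncludes (columns the consumer wants) and splitIncludes (columns freshly read from this split), with column 0 being the synthetic root column that has no vector after reading the text source. A minimal, self-contained sketch of that bookkeeping, with made-up mask values and print statements standing in for the ECB calls:

  // Illustrative only: walks the same include flags as the loop above and prints
  // where each column's data would come from.
  public class IncludeFlagsSketch {
    public static void main(String[] args) {
      boolean[] writerIncludes = { true, true, false, true };  // columns the consumer needs
      boolean[] splitIncludes  = { true, true, false, false }; // columns read from this split
      int vectorsIx = 0; // index into the list of freshly read vectors
      for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
        if (!writerIncludes[colIx]) continue; // column not needed at all
        if (splitIncludes[colIx]) {
          if (colIx != 0) {
            System.out.println("col " + colIx + ": from disk vectors[" + (vectorsIx++) + "]");
          } else {
            System.out.println("col 0: root column, no vector, empty streams");
          }
        } else {
          System.out.println("col " + colIx + ": from cached buffers");
        }
      }
    }
  }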
Use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.
Class SerDeEncodedDataReader, method createSliceToCache.
private StripeData createSliceToCache(
    CacheWriter.CacheStripeData diskData, StripeData cacheData) throws IOException {
  assert diskData != null;
  if (cacheData == null) {
    return new StripeData(diskData.knownTornStart, diskData.firstRowStart,
        diskData.lastRowStart, diskData.lastRowEnd, diskData.rowCount,
        diskData.encodings.toArray(new ColumnEncoding[diskData.encodings.size()]));
  } else {
    long rowCount = diskData.rowCount, encodingCount = diskData.encodings.size();
    validateCacheAndDisk(cacheData, rowCount, encodingCount, diskData);
    if (LlapIoImpl.LOG.isDebugEnabled()) {
      LlapIoImpl.LOG.debug("Creating slice to cache in addition to an existing slice "
          + cacheData.toCoordinateString() + "; disk offsets were "
          + diskData.toCoordinateString());
    }
    // Note: we could just do what we already do above from disk data, except for the validation
    // that is not strictly necessary, and knownTornStart which is an optimization.
    StripeData sliceToCache = StripeData.duplicateStructure(cacheData);
    for (int i = 0; i < diskData.encodings.size(); ++i) {
      sliceToCache.getEncodings()[i] = diskData.encodings.get(i);
    }
    sliceToCache.setKnownTornStart(
        Math.min(diskData.knownTornStart, sliceToCache.getKnownTornStart()));
    return sliceToCache;
  }
}
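When a cached slice already exists, the method duplicates its structure, overwrites the encodings with what was just read from disk, and keeps the earlier of the two knownTornStart offsets. A rough sketch of those choices with a toy slice type (the Slice class and its fields are illustrative stand-ins, not the actual StripeData API):

  import java.util.Arrays;

  // Toy stand-in for StripeData: just the fields this method touches.
  class Slice {
    long knownTornStart;
    String[] encodings;
    Slice(long knownTornStart, String[] encodings) {
      this.knownTornStart = knownTornStart;
      this.encodings = encodings;
    }
  }

  public class SliceToCacheSketch {
    // Copy the cached slice's structure, overwrite encodings with the disk data,
    // and keep the earliest known torn-row start of the two.
    static Slice combine(Slice disk, Slice cached) {
      Slice result = new Slice(cached.knownTornStart, cached.encodings.clone());
      for (int i = 0; i < disk.encodings.length; ++i) {
        result.encodings[i] = disk.encodings[i];
      }
      result.knownTornStart = Math.min(disk.knownTornStart, result.knownTornStart);
      return result;
    }

    public static void main(String[] args) {
      Slice disk = new Slice(100L, new String[] { "DIRECT", "DICTIONARY_V2" });
      Slice cached = new Slice(250L, new String[] { null, "DICTIONARY_V2" });
      Slice merged = combine(disk, cached);
      System.out.println(merged.knownTornStart + " " + Arrays.toString(merged.encodings));
      // Prints: 100 [DIRECT, DICTIONARY_V2]
    }
  }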
Use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.
Class SerDeEncodedDataReader, method processOneSlice.
private boolean processOneSlice(CacheWriter.CacheStripeData diskData, boolean[] splitIncludes,
    int stripeIx, StripeData cacheData, long startTime) throws IOException {
  logProcessOneSlice(stripeIx, diskData, cacheData);
  ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
  LlapDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
  long cacheRowCount = cacheData == null ? -1L : cacheData.getRowCount();
  SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
  StripeData sliceToCache = null;
  boolean hasAllData = diskData == null;
  if (!hasAllData) {
    sliceToCache = createSliceToCache(diskData, cacheData);
    metadata.setEncodings(combineCacheAndWriterEncodings(cacheEncodings, diskData.encodings));
    metadata.setRowCount(diskData.rowCount);
  } else {
    metadata.setEncodings(Lists.newArrayList(cacheEncodings));
    metadata.setRowCount(cacheRowCount);
  }
  if (LlapIoImpl.LOG.isTraceEnabled()) {
    LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
  }
  consumer.setStripeMetadata(metadata);
  OrcEncodedColumnBatch ecb = ECB_POOL.take();
  ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
  for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
    if (!writerIncludes[colIx]) continue;
    ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
    if (!hasAllData && splitIncludes[colIx]) {
      // The column has been read from disk.
      List<CacheWriter.CacheStreamData> streams = diskData.colStreams.get(colIx);
      LlapDataBuffer[][] newCacheDataForCol = createArrayToCache(sliceToCache, colIx, streams);
      // Struct column, such as root?
      if (streams == null) continue;
      Iterator<CacheWriter.CacheStreamData> iter = streams.iterator();
      while (iter.hasNext()) {
        CacheWriter.CacheStreamData stream = iter.next();
        if (stream.isSuppressed) {
          if (LlapIoImpl.LOG.isTraceEnabled()) {
            LlapIoImpl.LOG.trace("Removing a suppressed stream " + stream.name);
          }
          iter.remove();
          discardUncachedBuffers(stream.data);
          continue;
        }
        int streamIx = setStreamDataToCache(newCacheDataForCol, stream);
        ColumnStreamData cb = CSD_POOL.take();
        cb.incRef();
        cb.setCacheBuffers(stream.data);
        ecb.setStreamData(colIx, streamIx, cb);
      }
    } else {
      processColumnCacheData(cacheBuffers, ecb, colIx);
    }
  }
  if (processStop()) {
    recordReaderTime(startTime);
    return false;
  }
  // For now, just rely on the cache put to lock the buffers before we send them over.
  if (LlapIoImpl.CACHE_LOGGER.isTraceEnabled()) {
    LlapIoImpl.CACHE_LOGGER.trace("Data to cache from the read " + sliceToCache);
  }
  cacheFileData(sliceToCache);
  return sendEcbToConsumer(ecb, cacheData != null, diskData);
}
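The suppressed-stream handling above removes entries from the stream list while iterating it, which requires Iterator.remove() rather than List.remove(). A minimal, self-contained sketch of that pattern (the Stream class and isSuppressed flag are stand-ins for the CacheWriter types, not the real API):

  import java.util.ArrayList;
  import java.util.Iterator;
  import java.util.List;

  public class SuppressedStreamSketch {
    // Stand-in for CacheWriter.CacheStreamData: just a name and a suppressed flag.
    static class Stream {
      final String name;
      final boolean isSuppressed;
      Stream(String name, boolean isSuppressed) {
        this.name = name;
        this.isSuppressed = isSuppressed;
      }
    }

    public static void main(String[] args) {
      List<Stream> streams = new ArrayList<>();
      streams.add(new Stream("PRESENT", true));   // suppressed, e.g. the writer dropped it
      streams.add(new Stream("DATA", false));
      streams.add(new Stream("LENGTH", false));
      // Iterator.remove() is the safe way to drop elements mid-iteration;
      // suppressed streams are discarded, the rest would be cached and sent on.
      Iterator<Stream> iter = streams.iterator();
      while (iter.hasNext()) {
        Stream s = iter.next();
        if (s.isSuppressed) {
          iter.remove();
          continue;
        }
        System.out.println("keeping stream " + s.name);
      }
      System.out.println(streams.size() + " streams left");
    }
  }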
Use of org.apache.orc.OrcProto.ColumnEncoding in project hive by apache.
Class SerDeEncodedDataReader, method cacheFileData.
public void cacheFileData(StripeData sd) {
  if (sd == null || sd.getEncodings() == null) return;
  if (fileKey != null) {
    // Note that we cache each slice separately. We could cache them together at the end, but
    // then we won't be able to pass them to users without inc-refing explicitly.
    ColumnEncoding[] encodings = sd.getEncodings();
    for (int i = 0; i < encodings.length; ++i) {
      // Make data consistent with encodings, don't store useless information.
      if (sd.getData()[i] == null) {
        encodings[i] = null;
      } else if (encodings[i] == null) {
        throw new AssertionError("Caching data without an encoding at " + i + ": " + sd);
      }
    }
    FileData fd = new FileData(fileKey, encodings.length);
    fd.addStripe(sd);
    cache.putFileData(fd, Priority.NORMAL, counters);
  } else {
    lockAllBuffers(sd);
  }
  // We assume that if put/lock throws in the middle, it's ok to treat buffers as not being
  // locked and to blindly deallocate them, since they are not going to be used. Therefore
  // we don't remove them from the cleanup list - we will do it after sending to consumer.
  // This relies on sequence of calls to cacheFileData and sendEcb..
}
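Before the put, the loop makes the encodings array consistent with the data array: columns with no data lose their encoding, and data without an encoding is treated as a bug. A small, self-contained sketch of that rule, using String encodings and Object data as stand-ins for the LLAP types:

  import java.util.Arrays;

  // Illustrative only: the same consistency pass as the loop in cacheFileData.
  public class EncodingConsistencySketch {
    public static void main(String[] args) {
      String[] encodings = { "DIRECT", "DICTIONARY_V2", null };
      Object[] data      = { new Object(), null, null };
      for (int i = 0; i < encodings.length; ++i) {
        if (data[i] == null) {
          encodings[i] = null; // no data for this column: drop the useless encoding
        } else if (encodings[i] == null) {
          throw new AssertionError("Caching data without an encoding at " + i);
        }
      }
      System.out.println(Arrays.toString(encodings));
      // Prints: [DIRECT, null, null]
    }
  }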