Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
The class ListIndexColColumn, method evaluate.
@Override
public void evaluate(VectorizedRowBatch batch) {
  if (childExpressions != null) {
    super.evaluateChildren(batch);
  }
  ColumnVector outV = batch.cols[outputColumnNum];
  ListColumnVector listV = (ListColumnVector) batch.cols[listColumnNum];
  ColumnVector childV = listV.child;
  LongColumnVector indexColumnVector = (LongColumnVector) batch.cols[indexColumnNum];
  long[] indexV = indexColumnVector.vector;
  // We do not need to do a column reset since we are carefully changing the output.
  outV.isRepeating = false;
  if (listV.isRepeating) {
    if (listV.isNull[0]) {
      outV.isNull[0] = true;
      outV.noNulls = false;
      outV.isRepeating = true;
    } else {
      if (indexColumnVector.isRepeating) {
        if (indexV[0] >= listV.lengths[0]) {
          outV.isNull[0] = true;
          outV.noNulls = false;
        } else {
          outV.isNull[0] = false;
          outV.setElement(0, (int) (listV.offsets[0] + indexV[0]), childV);
        }
        outV.isRepeating = true;
      } else {
        for (int i = 0; i < batch.size; i++) {
          int j = (batch.selectedInUse) ? batch.selected[i] : i;
          if (indexV[j] >= listV.lengths[0]) {
            outV.isNull[j] = true;
            outV.noNulls = false;
          } else {
            outV.isNull[j] = false;
            outV.setElement(j, (int) (listV.offsets[0] + indexV[j]), childV);
          }
        }
      }
    }
  } else {
    for (int i = 0; i < batch.size; i++) {
      int j = (batch.selectedInUse) ? batch.selected[i] : i;
      if (listV.isNull[j] || indexV[j] >= listV.lengths[j]) {
        outV.isNull[j] = true;
        outV.noNulls = false;
      } else {
        outV.isNull[j] = false;
        outV.setElement(j, (int) (listV.offsets[j] + indexV[j]), childV);
      }
    }
  }
}
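For reference, a minimal sketch (not taken from the Hive source) of the ListColumnVector layout this expression reads: element i of list row j lives in the child vector at offsets[j] + i, and lengths[j] bounds the valid index range, which is why the excerpt above marks the output NULL when indexV[j] >= lengths[j] (note that this excerpt only range-checks the upper bound). The classes are the same org.apache.hadoop.hive.ql.exec.vector types used above; the variable names are our own.

// Two list rows backed by one flat child vector: row 0 = [10, 20], row 1 = [30].
LongColumnVector child = new LongColumnVector(3);
child.vector[0] = 10;
child.vector[1] = 20;
child.vector[2] = 30;

ListColumnVector listV = new ListColumnVector(2, child);
listV.offsets[0] = 0; listV.lengths[0] = 2;  // row 0 starts at child[0], has 2 elements
listV.offsets[1] = 2; listV.lengths[1] = 1;  // row 1 starts at child[2], has 1 element
listV.childCount = 3;

// An index column of {1, 5}: row 0 resolves to child[0 + 1] = 20,
// row 1 is out of range (5 >= lengths[1]) and would be marked NULL.
LongColumnVector indexCol = new LongColumnVector(2);
indexCol.vector[0] = 1;
indexCol.vector[1] = 5;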
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
The class ListIndexColScalar, method evaluate.
@Override
public void evaluate(VectorizedRowBatch batch) {
  if (childExpressions != null) {
    super.evaluateChildren(batch);
  }
  ColumnVector outV = batch.cols[outputColumnNum];
  ListColumnVector listV = (ListColumnVector) batch.cols[listColumnNum];
  ColumnVector childV = listV.child;
  if (listV.isRepeating) {
    if (listV.isNull[0]) {
      outV.isNull[0] = true;
      outV.noNulls = false;
    } else {
      if (index >= listV.lengths[0]) {
        outV.isNull[0] = true;
        outV.noNulls = false;
      } else {
        outV.isNull[0] = false;
        outV.setElement(0, (int) (listV.offsets[0] + index), childV);
      }
    }
    outV.isRepeating = true;
  } else {
    for (int i = 0; i < batch.size; i++) {
      int j = (batch.selectedInUse) ? batch.selected[i] : i;
      if (listV.isNull[j] || index >= listV.lengths[j]) {
        outV.isNull[j] = true;
        outV.noNulls = false;
      } else {
        outV.isNull[j] = false;
        outV.setElement(j, (int) (listV.offsets[j] + index), childV);
      }
    }
    outV.isRepeating = false;
  }
}
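A hedged usage sketch building on the listV layout shown after ListIndexColColumn above. It assumes a (listColumnNum, index, outputColumnNum) constructor for ListIndexColScalar; verify the actual argument order against the class before relying on this.

VectorizedRowBatch batch = new VectorizedRowBatch(2, 2);
batch.cols[0] = listV;                      // the ListColumnVector from the earlier sketch
batch.cols[1] = new LongColumnVector(2);    // output column
batch.size = 2;

// Constant index 0: row 0 -> 10, row 1 -> 30; a row whose list is shorter than
// index + 1, or whose list is NULL, gets a NULL output instead.
ListIndexColScalar expr = new ListIndexColScalar(0, 0, 1);  // assumed argument order
expr.evaluate(batch);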
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
The class TestVectorGenericDateExpressions, method testDateAddColCol.
private void testDateAddColCol(PrimitiveCategory colType1, boolean isPositive) throws HiveException {
  LongColumnVector date1 = newRandomLongColumnVector(10000, size);
  LongColumnVector days2 = newRandomLongColumnVector(1000, size);
  ColumnVector col1 = castTo(date1, colType1);
  LongColumnVector output = new LongColumnVector(size);
  VectorizedRowBatch batch = new VectorizedRowBatch(3, size);
  batch.cols[0] = col1;
  batch.cols[1] = days2;
  batch.cols[2] = output;
  validateDateAdd(batch, date1, days2, colType1, isPositive);
  TestVectorizedRowBatch.addRandomNulls(date1);
  batch.cols[0] = castTo(date1, colType1);
  validateDateAdd(batch, date1, days2, colType1, isPositive);
  TestVectorizedRowBatch.addRandomNulls(days2);
  batch.cols[1] = days2;
  validateDateAdd(batch, date1, days2, colType1, isPositive);
}
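The helpers newRandomLongColumnVector, castTo, and validateDateAdd are private members of the test class and are not shown here. A plausible sketch of newRandomLongColumnVector, assuming it simply fills a vector with non-negative values below the given range (the real helper may differ):

private LongColumnVector newRandomLongColumnVector(int range, int size) {
  LongColumnVector vector = new LongColumnVector(size);
  java.util.Random random = new java.util.Random();
  for (int i = 0; i < size; i++) {
    vector.vector[i] = random.nextInt(range);  // hypothetical: uniform values in [0, range)
  }
  return vector;
}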
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
The class SerDeEncodedDataReader, method processOneSlice.
/**
 * Unlike the other overload of processOneSlice, doesn't cache data.
 */
private boolean processOneSlice(Vectors diskData, boolean[] splitIncludes, int stripeIx,
    StripeData cacheData, long startTime) throws IOException, InterruptedException {
  if (diskData == null) {
    // The other overload should have been used.
    throw new AssertionError();
  }
  // LlapIoImpl.LOG.debug("diskData " + diskData);
  logProcessOneSlice(stripeIx, diskData, cacheData);
  if (cacheData == null && diskData.getRowCount() == 0) {
    // Nothing to process.
    return true;
  }
  ColumnEncoding[] cacheEncodings = cacheData == null ? null : cacheData.getEncodings();
  LlapSerDeDataBuffer[][][] cacheBuffers = cacheData == null ? null : cacheData.getData();
  if (cacheData != null) {
    // Don't validate column count - no encodings for vectors.
    validateCacheAndDisk(cacheData, diskData.getRowCount(), -1, diskData);
  }
  SerDeStripeMetadata metadata = new SerDeStripeMetadata(stripeIx);
  metadata.setEncodings(Arrays.asList(cacheEncodings == null
      ? new ColumnEncoding[splitIncludes.length] : cacheEncodings));
  metadata.setRowCount(diskData.getRowCount());
  if (LlapIoImpl.LOG.isTraceEnabled()) {
    LlapIoImpl.LOG.trace("Derived stripe metadata for this split is " + metadata);
  }
  consumer.setStripeMetadata(metadata);
  OrcEncodedColumnBatch ecb = useObjectPools ? ECB_POOL.take() : new OrcEncodedColumnBatch();
  ecb.init(fileKey, metadata.getStripeIx(), OrcEncodedColumnBatch.ALL_RGS, writerIncludes.length);
  int vectorsIx = 0;
  for (int colIx = 0; colIx < writerIncludes.length; ++colIx) {
    // Skip the 0-th column, since it won't have a vector after reading the text source.
    if (colIx == 0)
      continue;
    if (!writerIncludes[colIx])
      continue;
    if (splitIncludes[colIx]) {
      List<ColumnVector> vectors = diskData.getVectors(vectorsIx++);
      if (LlapIoImpl.LOG.isTraceEnabled()) {
        LlapIoImpl.LOG.trace("Processing vectors for column " + colIx + ": " + vectors);
      }
      ecb.initColumnWithVectors(colIx, vectors);
    } else {
      ecb.initColumn(colIx, OrcEncodedColumnBatch.MAX_DATA_STREAMS);
      processColumnCacheData(cacheBuffers, ecb, colIx);
    }
  }
  if (processStop()) {
    recordReaderTime(startTime);
    return false;
  }
  return sendEcbToConsumer(ecb, cacheData != null, null);
}
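Illustrative only (toy masks, not the Hive API): how the two include masks in the column loop above partition columns, with splitIncludes columns consuming freshly read disk vectors in order while the remaining writer-included columns fall back to cached encoded streams.

boolean[] writerIncludes = { true, true, true, true };  // column 0 is the root struct
boolean[] splitIncludes  = { true, true, false, true };
int vectorsIx = 0;
for (int colIx = 1; colIx < writerIncludes.length; ++colIx) {
  if (!writerIncludes[colIx]) continue;
  if (splitIncludes[colIx]) {
    System.out.println("col " + colIx + " <- disk vectors slot " + vectorsIx++);
  } else {
    System.out.println("col " + colIx + " <- cached encoded streams");
  }
}
// Prints: col 1 <- disk vectors slot 0, col 2 <- cached encoded streams, col 3 <- disk vectors slot 1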
Use of org.apache.hadoop.hive.ql.exec.vector.ColumnVector in project hive by apache.
The class OrcEncodedDataConsumer, method decodeBatch.
@Override
protected void decodeBatch(OrcEncodedColumnBatch batch,
    Consumer<ColumnVectorBatch> downstreamConsumer) throws InterruptedException {
  long startTime = counters.startTimeCounter();
  int currentStripeIndex = batch.getBatchKey().stripeIx;
  boolean sameStripe = currentStripeIndex == previousStripeIndex;
  try {
    ConsumerStripeMetadata stripeMetadata = stripes.get(currentStripeIndex);
    // Get the non-null row count from the root column, to get the max vector batches.
    int rgIdx = batch.getBatchKey().rgIx;
    long nonNullRowCount = -1;
    if (rgIdx == OrcEncodedColumnBatch.ALL_RGS) {
      nonNullRowCount = stripeMetadata.getRowCount();
    } else {
      OrcProto.RowIndexEntry rowIndex = stripeMetadata.getRowIndexEntry(0, rgIdx);
      nonNullRowCount = getRowCount(rowIndex);
    }
    int maxBatchesRG = (int) ((nonNullRowCount / VectorizedRowBatch.DEFAULT_SIZE) + 1);
    int batchSize = VectorizedRowBatch.DEFAULT_SIZE;
    TypeDescription fileSchema = fileMetadata.getSchema();
    if (columnReaders == null || !sameStripe) {
      createColumnReaders(batch, stripeMetadata, fileSchema);
    } else {
      repositionInStreams(this.columnReaders, batch, sameStripe, stripeMetadata);
    }
    previousStripeIndex = currentStripeIndex;
    for (int i = 0; i < maxBatchesRG; i++) {
      // For the last batch in the row group, adjust the batch size.
      if (i == maxBatchesRG - 1) {
        batchSize = (int) (nonNullRowCount % VectorizedRowBatch.DEFAULT_SIZE);
        if (batchSize == 0)
          break;
      }
      ColumnVectorBatch cvb = cvbPool.take();
      // assert cvb.cols.length == batch.getColumnIxs().length; // Must be constant per split.
      cvb.size = batchSize;
      for (int idx = 0; idx < columnReaders.length; ++idx) {
        TreeReader reader = columnReaders[idx];
        if (cvb.cols[idx] == null) {
          // ORC stores rows inside a root struct (Hive writes it this way).
          // When we populate column vectors, we skip over the root struct.
          cvb.cols[idx] = createColumn(batchSchemas[idx], VectorizedRowBatch.DEFAULT_SIZE);
        }
        trace.logTreeReaderNextVector(idx);
        /*
         * Currently, ORC's TreeReaderFactory class does this:
         *
         *   public void nextBatch(VectorizedRowBatch batch,
         *       int batchSize) throws IOException {
         *     batch.cols[0].reset();
         *     batch.cols[0].ensureSize(batchSize, false);
         *     nextVector(batch.cols[0], null, batchSize);
         *   }
         *
         * CONCERN:
         * For better performance, we'd like to *not* do a ColumnVector.reset(),
         * which zeroes out isNull. Why? Because there are common cases where ORC
         * will *immediately* copy its null flags into the isNull array. This is a waste.
         *
         * For correctness, we must do it for now.
         *
         * The best solution is for ORC to manage the noNulls flag and isNull array itself,
         * because it knows what NULLs the next set of rows contains.
         *
         * Its management of the fields of ColumnVector is a little different from what we
         * must do for vector expressions. For those, we must maintain the invariant that if
         * noNulls is true there are no NULLs in any part of the isNull array. This is
         * because the next vector expression relies on the invariant.
         *
         * Given that ORC (or any other producer) is providing *read-only* batches to the
         * consumer, what is important is that the isNull array through batch.size has
         * integrity with the noNulls flag. So, if ORC is giving us 100 rows (for example)
         * and none of them are NULL, it can safely make sure the first 100 isNull entries
         * are false and safely set noNulls to true. Any other NULLs (true entries) in
         * isNull are irrelevant because ORC owns the batch. It just needs to make sure
         * it doesn't get confused.
         */
        ColumnVector cv = cvb.cols[idx];
        cv.reset();
        cv.ensureSize(batchSize, false);
        reader.nextVector(cv, null, batchSize);
      }
      // we are done reading a batch, send it to consumer for processing
      downstreamConsumer.consumeData(cvb);
      counters.incrCounter(LlapIOCounters.ROWS_EMITTED, batchSize);
    }
    LlapIoImpl.ORC_LOGGER.debug("Done with decode");
    counters.incrTimeCounter(LlapIOCounters.DECODE_TIME_NS, startTime);
    counters.incrCounter(LlapIOCounters.NUM_VECTOR_BATCHES, maxBatchesRG);
    counters.incrCounter(LlapIOCounters.NUM_DECODED_BATCHES);
  } catch (IOException e) {
    // Caller will return the batch.
    downstreamConsumer.setError(e);
  }
}
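A small sketch of the invariant the long comment above describes, from the producer side that owns the batch: noNulls may only be set to true if every isNull entry the batch exposes (through batch.size) is false, while vector expressions additionally expect the entire isNull array to be clean, which is why the reset() call stays in for now. This is an illustration, not Hive code.

// Producer-side check: keep noNulls consistent with isNull through batchSize.
LongColumnVector cv = new LongColumnVector(VectorizedRowBatch.DEFAULT_SIZE);
int batchSize = 100;
boolean anyNull = false;
for (int i = 0; i < batchSize; i++) {
  anyNull |= cv.isNull[i];
}
cv.noNulls = !anyNull;  // for a read-only batch, stale true entries past batchSize don't matter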