Usage of org.apache.hadoop.hive.ql.io.BucketIdentifier in the Apache Hive project:
the next method of the LlapRecordReader class.
/**
 * Reads the next batch of rows into {@code vrb}.
 *
 * Pulls a {@link ColumnVectorBatch} from the background LLAP reader via
 * {@code nextCvb()}, then transfers its column vectors into the caller-supplied
 * {@link VectorizedRowBatch}: through the ACID reader when the data is in ACID
 * format, or by direct column swapping otherwise.
 *
 * @param key unused (record readers of this shape receive a NullWritable key)
 * @param vrb the destination batch; must not be null
 * @return true if a batch was produced, false on end of data
 * @throws IOException on read failure, or wrapping an InterruptedException if
 *         the query was canceled while waiting for data
 */
@Override
public boolean next(NullWritable key, VectorizedRowBatch vrb) throws IOException {
  assert vrb != null;
  if (isClosed) {
    throw new AssertionError("next called after close");
  }
  // Add partition cols if necessary (see VectorizedOrcInputFormat for details).
  boolean wasFirst = isFirst;
  if (isFirst) {
    if (partitionValues != null) {
      rbCtx.addPartitionColsToBatch(vrb, partitionValues);
    }
    isFirst = false;
  }
  ColumnVectorBatch cvb;
  try {
    cvb = nextCvb();
  } catch (InterruptedException e) {
    // Query might have been canceled. Stop the background processing.
    feedback.stop();
    // In case we are stuck in consume.
    isInterrupted = true;
    // Restore the interrupt status so cooperating code further up the stack
    // can still observe the interruption despite the IOException wrapping.
    Thread.currentThread().interrupt();
    throw new IOException(e);
  }
  if (cvb == null) {
    // End of data: record the first-return timestamp if nothing was ever
    // returned, and account the total consumer wall-clock time.
    if (wasFirst) {
      firstReturnTime = counters.startTimeCounter();
    }
    counters.incrWallClockCounter(LlapIOCounters.CONSUMER_TIME_NS, firstReturnTime);
    return false;
  }
  if (isAcidFormat) {
    // NOTE(review): selectedInUse is forced true here with no grounded
    // explanation in this excerpt (original comment was just "why?") —
    // presumably the ACID reader expects/maintains a selection vector; confirm.
    vrb.selectedInUse = true;
    if (isVectorized) {
      // TODO: relying everywhere on the magical constants and columns being together means ACID
      // columns are going to be super hard to change in a backward compat manner. I can
      // foresee someone cursing while refactoring all the magic for prefix schema changes.
      /*
        Acid meta cols are always either all included or all excluded; the
        width of 'cvb' changes accordingly so 'acidColCount' and
        'ixInVrb' need to be adjusted. See {@link IncludesImpl} comments.
      */
      // Exclude the row column.
      int acidColCount = acidReader.includeAcidColumns() ? OrcInputFormat.getRootColumn(false) - 1 : 0;
      ensureAcidInputVrb(acidColCount, vrb.getDataColumnCount());
      // By assumption, ACID columns are currently always in the beginning of the arrays.
      System.arraycopy(cvb.cols, 0, acidInputVrb.cols, 0, acidColCount);
      for (int ixInReadSet = acidColCount; ixInReadSet < cvb.cols.length; ++ixInReadSet) {
        // When ACID meta columns are excluded, physical ids still count them,
        // so shift the destination index back by the ROW offset.
        int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet) - (acidReader.includeAcidColumns() ? 0 : OrcRecordUpdater.ROW);
        cvb.swapColumnVector(ixInReadSet, acidInputVrb.cols, ixInVrb);
      }
      acidInputVrb.size = cvb.size;
      acidReader.setBaseAndInnerReader(new AcidWrapper(acidInputVrb));
      acidReader.next(NullWritable.get(), vrb);
    } else {
      // TODO: WTF? The old code seems to just drop the ball here.
      throw new AssertionError("Unsupported mode");
    }
  } else {
    // Non-ACID path: move each present column vector from cvb into vrb at its
    // logical position, then mark the columns the reader did not supply as
    // all-null.
    List<Integer> logicalOrderedColumnIds = includes.getLogicalOrderedColumnIds();
    long cvbColsPresent = Arrays.stream(cvb.cols).filter(Objects::nonNull).count();
    if (logicalOrderedColumnIds.size() != cvbColsPresent) {
      throw new RuntimeException("Unexpected number of columns, VRB has " + logicalOrderedColumnIds.size() + " included, but the reader returned " + cvbColsPresent);
    }
    for (int ixInReadSet = 0; ixInReadSet < cvbColsPresent; ++ixInReadSet) {
      int ixInVrb = logicalOrderedColumnIds.get(ixInReadSet);
      cvb.swapColumnVector(ixInReadSet, vrb.cols, ixInVrb);
    }
    // null out col vectors for which the (ORC) file had no data
    List<Integer> missingColIndices = includes.getReaderLogicalColumnIds().stream().filter(idx -> !includes.getLogicalOrderedColumnIds().contains(idx)).collect(toList());
    if (missingColIndices.size() != (cvb.cols.length - cvbColsPresent)) {
      throw new RuntimeException("Unexpected number of missing columns, expected " + missingColIndices.size() + ", but reader returned " + (cvb.cols.length - cvbColsPresent) + " missing column vectors.");
    }
    for (int index : missingColIndices) {
      // Standard "repeating null" encoding: one isNull[0]=true entry applies
      // to every row in the batch.
      vrb.cols[index].noNulls = false;
      vrb.cols[index].isRepeating = true;
      vrb.cols[index].isNull[0] = true;
    }
    // NOTE(review): selection vector is explicitly disabled on this path
    // (original comment was just "why?") — the batch is dense; confirm against
    // downstream operators' expectations.
    vrb.selectedInUse = false;
    vrb.size = cvb.size;
  }
  if (wasFirst) {
    firstReturnTime = counters.startTimeCounter();
  }
  if (bucketIdentifier != null) {
    rbCtx.setBucketAndWriteIdOf(vrb, bucketIdentifier);
  }
  return true;
}
Aggregations