Use of org.apache.hadoop.hive.ql.io.orc.encoded.Reader in project hive by apache.
The class OrcEncodedDataReader, method getOrcTailForPath:
/**
 * Looks up the metadata for the given ORC file in the cache, reading it from the file system
 * in case of a cache miss.
 * @param path path of the ORC file
 * @param jobConf job configuration
 * @param tag cache tag used when inserting the metadata into the cache
 * @param daemonConf LLAP daemon configuration
 * @param metadataCache LLAP metadata cache
 * @param fileKey file id of the ORC file; if null, it is determined from the file system
 * @return the ORC tail (serialized file footer) for the given path
 * @throws IOException
 */
public static OrcTail getOrcTailForPath(Path path, Configuration jobConf, CacheTag tag,
    Configuration daemonConf, MetadataCache metadataCache, Object fileKey) throws IOException {
  Supplier<FileSystem> fsSupplier = getFsSupplier(path, jobConf);
  if (fileKey == null) {
    fileKey = determineFileId(fsSupplier, path, daemonConf);
  }
  if (fileKey == null || metadataCache == null) {
    throw new IllegalCacheConfigurationException(
        "LLAP metadata cache not available for path " + path.toString());
  }
  LlapBufferOrBuffers tailBuffers = metadataCache.getFileMetadata(fileKey);
  try {
    // Cache hit
    if (tailBuffers != null) {
      return getOrcTailFromLlapBuffers(tailBuffers);
    }
    // Cache miss
    throwIfCacheOnlyRead(HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_CACHE_ONLY));
    ReaderOptions opts = EncodedOrcFile.readerOptions(jobConf).filesystem(fsSupplier);
    Reader reader = EncodedOrcFile.createReader(path, opts);
    ByteBuffer tailBufferBb = reader.getSerializedFileFooter();
    tailBuffers = metadataCache.putFileMetadata(fileKey, tailBufferBb, tag, new AtomicBoolean(false));
    return getOrcTailFromLlapBuffers(tailBuffers);
  } finally {
    // By this time the buffers have been locked, either at cache lookup or at cache insert time.
    if (tailBuffers != null) {
      metadataCache.decRefBuffer(tailBuffers);
    }
  }
}
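For orientation, here is a minimal caller sketch. It assumes the surrounding LLAP IO setup already provides the MetadataCache instance, the job and daemon Configuration objects, and a CacheTag for the target data; the wrapper method name resolveTail is illustrative only and not part of the Hive code above.

// Hypothetical wrapper, assuming LLAP IO wiring (cache, configs, tag) is already in place.
OrcTail resolveTail(Path orcFile, Configuration jobConf, Configuration daemonConf,
    MetadataCache metadataCache, CacheTag tag) throws IOException {
  // Passing a null fileKey lets getOrcTailForPath determine the file id itself;
  // the call fails if neither a file key nor the metadata cache is available.
  return OrcEncodedDataReader.getOrcTailForPath(
      orcFile, jobConf, tag, daemonConf, metadataCache, null);
}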
Use of org.apache.hadoop.hive.ql.io.orc.encoded.Reader in project hive by apache.
The class LlapRecordReader, method next:
@Override
public boolean next(NullWritable key, VectorizedRowBatch vrb) throws IOException {
assert vrb != null;
if (isClosed) {
throw new AssertionError("next called after close");
}
// Add partition cols if necessary (see VectorizedOrcInputFormat for details).
boolean wasFirst = isFirst;
if (isFirst) {
if (partitionValues != null) {
rbCtx.addPartitionColsToBatch(vrb, partitionValues);
}
isFirst = false;
}
ColumnVectorBatch cvb;
try {
cvb = nextCvb();
} catch (InterruptedException e) {
// Query might have been canceled. Stop the background processing.
feedback.stop();
// In case we are stuck in consume.
isInterrupted = true;
throw new IOException(e);
}
if (cvb == null) {
if (wasFirst) {
firstReturnTime = counters.startTimeCounter();
}
counters.incrWallClockCounter(LlapIOCounters.CONSUMER_TIME_NS, firstReturnTime);
return false;
}
if (isAcidFormat) {
// why?
vrb.selectedInUse = true;
if (isVectorized) {
// TODO: relying everywhere on the magical constants and columns being together means ACID
// columns are going to be super hard to change in a backward compat manner. I can
// foresee someone cursing while refactoring all the magic for prefix schema changes.
/*
Acid meta cols are always either all included or all excluded the
the width of 'cvb' changes accordingly so 'acidColCount' and
'ixInVrb' need to be adjusted. See {@link IncludesImpl} comments.
*/
// Exclude the row column.
int acidColCount = acidReader.includeAcidColumns() ? OrcInputFormat.getRootColumn(false) - 1 : 0;
ensureAcidInputVrb(acidColCount, vrb.getDataColumnCount());
// By assumption, ACID columns are currently always in the beginning of the arrays.
System.arraycopy(cvb.cols, 0, acidInputVrb.cols, 0, acidColCount);
for (int ixInReadSet = acidColCount; ixInReadSet < cvb.cols.length; ++ixInReadSet) {
int ixInVrb = includes.getPhysicalColumnIds().get(ixInReadSet) - (acidReader.includeAcidColumns() ? 0 : OrcRecordUpdater.ROW);
cvb.swapColumnVector(ixInReadSet, acidInputVrb.cols, ixInVrb);
}
acidInputVrb.size = cvb.size;
acidReader.setBaseAndInnerReader(new AcidWrapper(acidInputVrb));
acidReader.next(NullWritable.get(), vrb);
} else {
// TODO: WTF? The old code seems to just drop the ball here.
throw new AssertionError("Unsupported mode");
}
} else {
List<Integer> logicalOrderedColumnIds = includes.getLogicalOrderedColumnIds();
long cvbColsPresent = Arrays.stream(cvb.cols).filter(Objects::nonNull).count();
if (logicalOrderedColumnIds.size() != cvbColsPresent) {
throw new RuntimeException("Unexpected number of columns, VRB has " + logicalOrderedColumnIds.size() + " included, but the reader returned " + cvbColsPresent);
}
// schema, they are dealt with later.
for (int ixInReadSet = 0; ixInReadSet < cvbColsPresent; ++ixInReadSet) {
int ixInVrb = logicalOrderedColumnIds.get(ixInReadSet);
cvb.swapColumnVector(ixInReadSet, vrb.cols, ixInVrb);
}
// null out col vectors for which the (ORC) file had no data
List<Integer> missingColIndices = includes.getReaderLogicalColumnIds().stream().filter(idx -> !includes.getLogicalOrderedColumnIds().contains(idx)).collect(toList());
if (missingColIndices.size() != (cvb.cols.length - cvbColsPresent)) {
throw new RuntimeException("Unexpected number of missing columns, expected " + missingColIndices.size() + ", but reader returned " + (cvb.cols.length - cvbColsPresent) + " missing column vectors.");
}
for (int index : missingColIndices) {
vrb.cols[index].noNulls = false;
vrb.cols[index].isRepeating = true;
vrb.cols[index].isNull[0] = true;
}
// why?
vrb.selectedInUse = false;
vrb.size = cvb.size;
}
if (wasFirst) {
firstReturnTime = counters.startTimeCounter();
}
if (bucketIdentifier != null) {
rbCtx.setBucketAndWriteIdOf(vrb, bucketIdentifier);
}
return true;
}
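As a usage note, next() follows the old mapred RecordReader contract, so a caller typically drives it in a loop. A minimal sketch, assuming reader is an already constructed LlapRecordReader and process is a hypothetical downstream handler:

// Hypothetical consumption loop over an already constructed LlapRecordReader.
NullWritable key = reader.createKey();
VectorizedRowBatch vrb = reader.createValue();
while (reader.next(key, vrb)) {
  // Each successful call fills 'vrb'; vrb.size holds the row count of this batch.
  process(vrb); // hypothetical downstream handler
}
reader.close();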