Use of org.apache.orc.impl.OrcTail in project hive by apache.
The class OrcFileFormatProxy, method applySargToMetadata.
@Override
public SplitInfos applySargToMetadata(SearchArgument sarg, ByteBuffer fileMetadata) throws IOException {
  // TODO: ideally we should store a shortened representation of only the necessary fields
  // in HBase; it will probably require custom SARG application code.
  OrcTail orcTail = ReaderImpl.extractFileTail(fileMetadata);
  OrcProto.Footer footer = orcTail.getFooter();
  int stripeCount = footer.getStripesCount();
  boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg(sarg, orcTail.getWriterVersion(),
      footer.getTypesList(), orcTail.getStripeStatistics(), stripeCount);
  // For the ORC case, send the boundaries of the stripes so we don't have to send the footer.
  SplitInfos.Builder sb = SplitInfos.newBuilder();
  List<StripeInformation> stripes = orcTail.getStripes();
  boolean isEliminated = true;
  // Iterate over stripeCount, not result.length: pickStripesViaTranslatedSarg may return
  // null, which means no stripe could be eliminated (iterating result.length would NPE).
  for (int i = 0; i < stripeCount; ++i) {
    if (result != null && !result[i]) continue;
    isEliminated = false;
    StripeInformation si = stripes.get(i);
    if (LOG.isDebugEnabled()) {
      LOG.debug("PPD is adding a split " + i + ": " + si.getOffset() + ", " + si.getLength());
    }
    sb.addInfos(SplitInfo.newBuilder().setIndex(i).setOffset(si.getOffset()).setLength(si.getLength()));
  }
  return isEliminated ? null : sb.build();
}
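For context, a minimal sketch of how this proxy might be driven. Only applySargToMetadata and the standard SearchArgument builder come from the snippet above and the storage-api; the buffer source and the "id < 100" predicate are made-up assumptions.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.hive.metastore.Metastore.SplitInfos;
import org.apache.hadoop.hive.ql.io.orc.OrcFileFormatProxy;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

// Hypothetical driver: 'serializedTail' stands in for the footer bytes a
// metastore cache would hand back.
static SplitInfos pickSplits(ByteBuffer serializedTail) throws IOException {
  SearchArgument sarg = SearchArgumentFactory.newBuilder()
      .startAnd()
      .lessThan("id", PredicateLeaf.Type.LONG, 100L)
      .end()
      .build();
  // null means the predicate eliminated every stripe of the file.
  return new OrcFileFormatProxy().applySargToMetadata(sarg, serializedTail);
}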
Use of org.apache.orc.impl.OrcTail in project hive by apache.
The class OrcEncodedDataReader, method getFileFooterFromCacheOrDisk.
/**
 * Gets file metadata for the split from cache, or reads it from the file.
 */
private OrcFileMetadata getFileFooterFromCacheOrDisk() throws IOException {
  LlapBufferOrBuffers tailBuffers = null;
  List<StripeStatistics> stats = null;
  List<StripeInformation> stripes = null;
  boolean hasCache = fileKey != null && metadataCache != null;
  if (hasCache) {
    tailBuffers = metadataCache.getFileMetadata(fileKey);
    if (tailBuffers != null) {
      try {
        OrcTail orcTail = getOrcTailFromLlapBuffers(tailBuffers);
        counters.incrCounter(LlapIOCounters.METADATA_CACHE_HIT);
        FileTail tail = orcTail.getFileTail();
        stats = getStripeStatsFromOrcTail(orcTail);
        stripes = new ArrayList<>(tail.getFooter().getStripesCount());
        int stripeIdx = 0;
        for (OrcProto.StripeInformation stripeProto : tail.getFooter().getStripesList()) {
          stripes.add(new ReaderImpl.StripeInformationImpl(stripeProto, stripeIdx++, -1, null));
        }
        return new OrcFileMetadata(fileKey, tail.getFooter(), tail.getPostscript(), stats, stripes,
            ReaderImpl.getFileVersion(tail.getPostscript().getVersionList()));
      } finally {
        // We don't need the buffer anymore.
        metadataCache.decRefBuffer(tailBuffers);
      }
    } else {
      counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
      throwIfCacheOnlyRead(isReadCacheOnly);
    }
  }
  ensureOrcReader();
  ByteBuffer tailBufferBb = orcReader.getSerializedFileFooter();
  if (hasCache) {
    tailBuffers = metadataCache.putFileMetadata(fileKey, tailBufferBb, cacheTag, isStopped);
    // We don't use the cache's copy of the buffer.
    metadataCache.decRefBuffer(tailBuffers);
  }
  FileTail ft = orcReader.getFileTail();
  return new OrcFileMetadata(fileKey, ft.getFooter(), ft.getPostscript(),
      orcReader.getOrcProtoStripeStatistics(), orcReader.getStripes(), orcReader.getFileVersion());
}
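The cache stores the tail as raw bytes; below is a minimal sketch of turning such bytes back into an OrcTail. The helper name is ours, and whether getOrcTailFromLlapBuffers does exactly this internally is an assumption; the extractFileTail call itself is the same one used in the first example.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.ReaderImpl;

// Rebuild an OrcTail from serialized tail bytes, as the cache-hit path above
// does on top of the LLAP buffers.
static OrcTail tailFromCachedBytes(ByteBuffer serializedTail) throws IOException {
  return ReaderImpl.extractFileTail(serializedTail);
}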
Use of org.apache.orc.impl.OrcTail in project hive by apache.
The class LlapIoImpl, method llapVectorizedOrcReaderForPath.
@Override
public RecordReader<NullWritable, VectorizedRowBatch> llapVectorizedOrcReaderForPath(Object fileKey, Path path,
    CacheTag tag, List<Integer> tableIncludedCols, JobConf conf, long offset, long length, Reporter reporter)
    throws IOException {
  OrcTail tail = null;
  if (tag != null) {
    // The tag is only needed for the metadata cache lookup; if it is not known yet,
    // the lookup can be done later.
    tail = getOrcTailFromCache(path, conf, tag, fileKey);
  }
  OrcSplit split = new OrcSplit(path, fileKey, offset, length, (String[]) null, tail, false, false,
      Lists.newArrayList(), 0, length, path.getParent(), null);
  try {
    LlapRecordReader rr = LlapRecordReader.create(conf, split, tableIncludedCols, HiveStringUtils.getHostname(),
        orcCvp, executor, null, null, reporter, daemonConf);
    // May be null when attempting unsupported schema evolution between the reader and file schemas.
    if (rr == null) {
      return null;
    }
    // Clear partition values so they are not added to the result batches as constants.
    rr.setPartitionValues(null);
    // Triggers the IO thread pool to pick up this read job.
    rr.start();
    return rr;
  } catch (HiveException e) {
    throw new IOException(e);
  }
}
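A hedged usage sketch for this entry point, with all surrounding variables (fileKey, path, tag, includedCols, jobConf, offset, length, reporter) assumed to be in scope. The null check matters because, as noted above, unsupported schema evolution yields null rather than an exception.

// May return null (e.g. unsupported schema evolution); callers should fall
// back to a non-LLAP reader, as orcRecordReader below does.
RecordReader<NullWritable, VectorizedRowBatch> rr = LlapProxy.getIo()
    .llapVectorizedOrcReaderForPath(fileKey, path, tag, includedCols, jobConf, offset, length, reporter);
if (rr == null) {
  // fall back, e.g. to VectorizedOrcInputFormat
}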
Use of org.apache.orc.impl.OrcTail in project hive by apache.
The class HiveVectorizedReader, method orcRecordReader.
private static RecordReader<NullWritable, VectorizedRowBatch> orcRecordReader(JobConf job, Reporter reporter,
    FileScanTask task, InputFile inputFile, Path path, long start, long length, List<Integer> readColumnIds,
    SyntheticFileId fileId) throws IOException {
  RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
  // Turn positional schema evolution off: column-name-based schema evolution is used for projection,
  // and Iceberg maps between the file schema and the current reading schema.
  job.setBoolean(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), false);
  // Metadata has to be passed along in the OrcSplit. Without it, the vectorized reader would assume
  // that the ORC file ends at the task's start + length, and might fail to read the tail.
  ByteBuffer serializedOrcTail = VectorizedReadUtils.getSerializedOrcTail(inputFile, fileId, job);
  OrcTail orcTail = VectorizedReadUtils.deserializeToOrcTail(serializedOrcTail);
  VectorizedReadUtils.handleIcebergProjection(task, job,
      VectorizedReadUtils.deserializeToShadedOrcTail(serializedOrcTail).getSchema());
  // If LLAP is enabled, try to retrieve an LLAP record reader; this may yield null in some special cases.
  if (HiveConf.getBoolVar(job, HiveConf.ConfVars.LLAP_IO_ENABLED, LlapProxy.isDaemon()) && LlapProxy.getIo() != null) {
    // Required to prevent LLAP from dealing with decimal64; see HiveIcebergInputFormat.getSupportedFeatures().
    HiveConf.setVar(job, HiveConf.ConfVars.HIVE_VECTORIZED_INPUT_FORMAT_SUPPORTS_ENABLED, "");
    recordReader = LlapProxy.getIo().llapVectorizedOrcReaderForPath(fileId, path, null, readColumnIds, job,
        start, length, reporter);
  }
  if (recordReader == null) {
    InputSplit split = new OrcSplit(path, fileId, start, length, (String[]) null, orcTail, false, false,
        com.google.common.collect.Lists.newArrayList(), 0, length, path.getParent(), null);
    recordReader = new VectorizedOrcInputFormat().getRecordReader(split, job, reporter);
  }
  return recordReader;
}
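Whichever branch produced it, the returned reader follows the classic mapred RecordReader contract; a minimal consumption sketch (the loop body is illustrative only):

// Iterate the batches produced by either the LLAP or the fallback reader.
NullWritable key = recordReader.createKey();
VectorizedRowBatch batch = recordReader.createValue();
while (recordReader.next(key, batch)) {
  // batch.size rows are populated across batch.cols
}
recordReader.close();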
Use of org.apache.orc.impl.OrcTail in project hive by apache.
The class TestOrcMetadataCache, method testGetOrcTailForPath.
@Test
public void testGetOrcTailForPath() throws Exception {
  DummyMemoryManager mm = new DummyMemoryManager();
  DummyCachePolicy cp = new DummyCachePolicy();
  final int MAX_ALLOC = 64;
  LlapDaemonCacheMetrics metrics = LlapDaemonCacheMetrics.create("", "");
  BuddyAllocator alloc = new BuddyAllocator(false, false, 8, MAX_ALLOC, 1, 4 * 4096, 0, null, mm, metrics, null, true);
  MetadataCache cache = new MetadataCache(alloc, mm, cp, true, metrics);
  Path path = new Path("../data/files/alltypesorc");
  Configuration jobConf = new Configuration();
  Configuration daemonConf = new Configuration();
  CacheTag tag = CacheTag.build("test-table");
  OrcTail uncached = OrcEncodedDataReader.getOrcTailForPath(path, jobConf, tag, daemonConf, cache, null);
  jobConf.set(HiveConf.ConfVars.LLAP_IO_CACHE_ONLY.varname, "true");
  OrcTail cached = OrcEncodedDataReader.getOrcTailForPath(path, jobConf, tag, daemonConf, cache, null);
  assertEquals(uncached.getSerializedTail(), cached.getSerializedTail());
  assertEquals(uncached.getFileTail(), cached.getFileTail());
}
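Note the trick in the second half of the test: setting LLAP_IO_CACHE_ONLY before the second call makes any read that cannot be served from cache fail (presumably via the same cache-only guard seen in getFileFooterFromCacheOrDisk above), so the assertions can only pass if the first call actually populated the metadata cache.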