
Example 1 with OrcTail

Use of org.apache.orc.impl.OrcTail in project hive by apache.

From class OrcFileFormatProxy, method applySargToMetadata:

@Override
public SplitInfos applySargToMetadata(SearchArgument sarg, ByteBuffer fileMetadata) throws IOException {
    // TODO: ideally we should store shortened representation of only the necessary fields
    // in HBase; it will probably require custom SARG application code.
    OrcTail orcTail = ReaderImpl.extractFileTail(fileMetadata);
    OrcProto.Footer footer = orcTail.getFooter();
    int stripeCount = footer.getStripesCount();
    boolean[] result = OrcInputFormat.pickStripesViaTranslatedSarg(sarg, orcTail.getWriterVersion(), footer.getTypesList(), orcTail.getStripeStatistics(), stripeCount);
    // For ORC case, send the boundaries of the stripes so we don't have to send the footer.
    SplitInfos.Builder sb = SplitInfos.newBuilder();
    List<StripeInformation> stripes = orcTail.getStripes();
    boolean isEliminated = true;
    // pickStripesViaTranslatedSarg returns null when it cannot prune (e.g. when
    // stripe statistics are unavailable); in that case every stripe is kept. The
    // loop must therefore bound on stripes.size(), not result.length.
    for (int i = 0; i < stripes.size(); ++i) {
        if (result != null && !result[i])
            continue;
        isEliminated = false;
        StripeInformation si = stripes.get(i);
        if (LOG.isDebugEnabled()) {
            LOG.debug("PPD is adding a split " + i + ": " + si.getOffset() + ", " + si.getLength());
        }
        sb.addInfos(SplitInfo.newBuilder().setIndex(i).setOffset(si.getOffset()).setLength(si.getLength()));
    }
    return isEliminated ? null : sb.build();
}
Also used : OrcProto(org.apache.orc.OrcProto) SplitInfos(org.apache.hadoop.hive.metastore.Metastore.SplitInfos) StripeInformation(org.apache.orc.StripeInformation) OrcTail(org.apache.orc.impl.OrcTail)
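
A minimal sketch of a caller driving applySargToMetadata, using the standard storage-api SearchArgument builder. The helper method, the column name "id", and the proxy's package are illustrative assumptions, not part of the Hive source above.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.hive.metastore.Metastore.SplitInfos;
import org.apache.hadoop.hive.ql.io.orc.OrcFileFormatProxy;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;

// Hypothetical caller: the proxy and the serialized ORC file tail come from the
// surrounding metastore code, so both are parameters here.
static SplitInfos pruneStripes(OrcFileFormatProxy proxy, ByteBuffer fileMetadata) throws IOException {
    // Keep only stripes whose statistics admit rows with id < 100 ("id" is a placeholder column).
    SearchArgument sarg = SearchArgumentFactory.newBuilder()
            .startAnd()
            .lessThan("id", PredicateLeaf.Type.LONG, 100L)
            .end()
            .build();
    // A null result means the predicate eliminated every stripe.
    return proxy.applySargToMetadata(sarg, fileMetadata);
}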

Example 2 with OrcTail

Use of org.apache.orc.impl.OrcTail in project hive by apache.

From class OrcEncodedDataReader, method getFileFooterFromCacheOrDisk:

/**
 *  Gets file metadata for the split from cache, or reads it from the file.
 */
private OrcFileMetadata getFileFooterFromCacheOrDisk() throws IOException {
    LlapBufferOrBuffers tailBuffers = null;
    List<StripeStatistics> stats = null;
    List<StripeInformation> stripes = null;
    boolean hasCache = fileKey != null && metadataCache != null;
    if (hasCache) {
        tailBuffers = metadataCache.getFileMetadata(fileKey);
        if (tailBuffers != null) {
            try {
                OrcTail orcTail = getOrcTailFromLlapBuffers(tailBuffers);
                counters.incrCounter(LlapIOCounters.METADATA_CACHE_HIT);
                FileTail tail = orcTail.getFileTail();
                stats = getStripeStatsFromOrcTail(orcTail);
                stripes = new ArrayList<>(tail.getFooter().getStripesCount());
                int stripeIdx = 0;
                for (OrcProto.StripeInformation stripeProto : tail.getFooter().getStripesList()) {
                    stripes.add(new ReaderImpl.StripeInformationImpl(stripeProto, stripeIdx++, -1, null));
                }
                return new OrcFileMetadata(fileKey, tail.getFooter(), tail.getPostscript(), stats, stripes, ReaderImpl.getFileVersion(tail.getPostscript().getVersionList()));
            } finally {
                // We don't need the buffer anymore.
                metadataCache.decRefBuffer(tailBuffers);
            }
        } else {
            counters.incrCounter(LlapIOCounters.METADATA_CACHE_MISS);
            throwIfCacheOnlyRead(isReadCacheOnly);
        }
    }
    ensureOrcReader();
    ByteBuffer tailBufferBb = orcReader.getSerializedFileFooter();
    if (hasCache) {
        tailBuffers = metadataCache.putFileMetadata(fileKey, tailBufferBb, cacheTag, isStopped);
        // We don't use the cache's copy of the buffer.
        metadataCache.decRefBuffer(tailBuffers);
    }
    FileTail ft = orcReader.getFileTail();
    return new OrcFileMetadata(fileKey, ft.getFooter(), ft.getPostscript(), orcReader.getOrcProtoStripeStatistics(), orcReader.getStripes(), orcReader.getFileVersion());
}
Also used : OrcFileMetadata(org.apache.hadoop.hive.llap.io.metadata.OrcFileMetadata) OrcProto(org.apache.orc.OrcProto) StripeStatistics(org.apache.orc.OrcProto.StripeStatistics) RecordReaderImpl(org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl) ReaderImpl(org.apache.orc.impl.ReaderImpl) ByteBuffer(java.nio.ByteBuffer) FileTail(org.apache.orc.OrcProto.FileTail) LlapBufferOrBuffers(org.apache.hadoop.hive.llap.io.metadata.MetadataCache.LlapBufferOrBuffers) StripeInformation(org.apache.orc.StripeInformation) OrcTail(org.apache.orc.impl.OrcTail)
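
When both cache lookups miss, the method falls back to disk via ensureOrcReader(). Below is a minimal sketch of that path using a plain org.apache.orc.Reader; LLAP wraps its own reader, so treating the two as interchangeable here is an assumption.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;

static void inspectTail(Path path, Configuration conf) throws IOException {
    Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
    // The same serialized bytes that putFileMetadata would hand to the cache above.
    ByteBuffer serializedTail = reader.getSerializedFileFooter();
    List<StripeInformation> stripes = reader.getStripes();
    System.out.println("stripes: " + stripes.size());
}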

Example 3 with OrcTail

Use of org.apache.orc.impl.OrcTail in project hive by apache.

From class LlapIoImpl, method llapVectorizedOrcReaderForPath:

@Override
public RecordReader<NullWritable, VectorizedRowBatch> llapVectorizedOrcReaderForPath(Object fileKey, Path path, CacheTag tag, List<Integer> tableIncludedCols, JobConf conf, long offset, long length, Reporter reporter) throws IOException {
    OrcTail tail = null;
    if (tag != null) {
        // Tag information is only required for the metadata cache lookup, which can
        // be deferred if the tag is not yet known.
        tail = getOrcTailFromCache(path, conf, tag, fileKey);
    }
    OrcSplit split = new OrcSplit(path, fileKey, offset, length, (String[]) null, tail, false, false, Lists.newArrayList(), 0, length, path.getParent(), null);
    try {
        LlapRecordReader rr = LlapRecordReader.create(conf, split, tableIncludedCols, HiveStringUtils.getHostname(), orcCvp, executor, null, null, reporter, daemonConf);
        // create() may return null when there is unsupported schema evolution between the reader and file schemas.
        if (rr == null) {
            return null;
        }
        // This needs to be cleared as no partition values should be added to the result batches as constants.
        rr.setPartitionValues(null);
        // Triggers the IO thread pool to pick up this read job
        rr.start();
        return rr;
    } catch (HiveException e) {
        throw new IOException(e);
    }
}
Also used : OrcSplit(org.apache.hadoop.hive.ql.io.orc.OrcSplit) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) IOException(java.io.IOException) OrcTail(org.apache.orc.impl.OrcTail)
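
A short consumption sketch for the reader returned above, following the mapred RecordReader contract; it assumes rr is a non-null result of llapVectorizedOrcReaderForPath.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.RecordReader;

static long countRows(RecordReader<NullWritable, VectorizedRowBatch> rr) throws IOException {
    NullWritable key = rr.createKey();
    VectorizedRowBatch batch = rr.createValue();
    long rows = 0;
    while (rr.next(key, batch)) {
        rows += batch.size; // rows filled into this batch
    }
    rr.close();
    return rows;
}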

Example 4 with OrcTail

Use of org.apache.orc.impl.OrcTail in project hive by apache.

From class HiveVectorizedReader, method orcRecordReader:

private static RecordReader<NullWritable, VectorizedRowBatch> orcRecordReader(JobConf job, Reporter reporter, FileScanTask task, InputFile inputFile, Path path, long start, long length, List<Integer> readColumnIds, SyntheticFileId fileId) throws IOException {
    RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
    // Need to turn positional schema evolution off since we use column name based schema evolution for projection
    // and Iceberg will make a mapping between the file schema and the current reading schema.
    job.setBoolean(OrcConf.FORCE_POSITIONAL_EVOLUTION.getHiveConfName(), false);
    // Metadata information has to be passed along in the OrcSplit. Without it, the vectorized
    // reader would assume that the ORC file ends at the task's start + length and might fail to read the tail.
    ByteBuffer serializedOrcTail = VectorizedReadUtils.getSerializedOrcTail(inputFile, fileId, job);
    OrcTail orcTail = VectorizedReadUtils.deserializeToOrcTail(serializedOrcTail);
    VectorizedReadUtils.handleIcebergProjection(task, job, VectorizedReadUtils.deserializeToShadedOrcTail(serializedOrcTail).getSchema());
    // If LLAP is enabled, try to retrieve an LLAP record reader - this might yield null in some special cases.
    if (HiveConf.getBoolVar(job, HiveConf.ConfVars.LLAP_IO_ENABLED, LlapProxy.isDaemon()) && LlapProxy.getIo() != null) {
        // Required to prevent LLAP from dealing with decimal64; see HiveIcebergInputFormat.getSupportedFeatures().
        HiveConf.setVar(job, HiveConf.ConfVars.HIVE_VECTORIZED_INPUT_FORMAT_SUPPORTS_ENABLED, "");
        recordReader = LlapProxy.getIo().llapVectorizedOrcReaderForPath(fileId, path, null, readColumnIds, job, start, length, reporter);
    }
    if (recordReader == null) {
        InputSplit split = new OrcSplit(path, fileId, start, length, (String[]) null, orcTail, false, false, com.google.common.collect.Lists.newArrayList(), 0, length, path.getParent(), null);
        recordReader = new VectorizedOrcInputFormat().getRecordReader(split, job, reporter);
    }
    return recordReader;
}
Also used : VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) OrcSplit(org.apache.hadoop.hive.ql.io.orc.OrcSplit) NullWritable(org.apache.hadoop.io.NullWritable) ByteBuffer(java.nio.ByteBuffer) InputSplit(org.apache.hadoop.mapred.InputSplit) VectorizedOrcInputFormat(org.apache.hadoop.hive.ql.io.orc.VectorizedOrcInputFormat) OrcTail(org.apache.orc.impl.OrcTail)
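
For the tail deserialization step, Example 1 shows the underlying primitive: ReaderImpl.extractFileTail can rebuild an OrcTail from the serialized bytes. Whether VectorizedReadUtils.deserializeToOrcTail delegates to exactly this call is an assumption; the sketch below only illustrates the shape of the conversion.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.orc.impl.OrcTail;
import org.apache.orc.impl.ReaderImpl;

static OrcTail deserializeTail(ByteBuffer serialized) throws IOException {
    // Parses the postscript/footer from the buffer and wraps them in an OrcTail.
    return ReaderImpl.extractFileTail(serialized);
}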

Example 5 with OrcTail

Use of org.apache.orc.impl.OrcTail in project hive by apache.

From class TestOrcMetadataCache, method testGetOrcTailForPath:

@Test
public void testGetOrcTailForPath() throws Exception {
    DummyMemoryManager mm = new DummyMemoryManager();
    DummyCachePolicy cp = new DummyCachePolicy();
    final int MAX_ALLOC = 64;
    LlapDaemonCacheMetrics metrics = LlapDaemonCacheMetrics.create("", "");
    BuddyAllocator alloc = new BuddyAllocator(false, false, 8, MAX_ALLOC, 1, 4 * 4096, 0, null, mm, metrics, null, true);
    MetadataCache cache = new MetadataCache(alloc, mm, cp, true, metrics);
    Path path = new Path("../data/files/alltypesorc");
    Configuration jobConf = new Configuration();
    Configuration daemonConf = new Configuration();
    CacheTag tag = CacheTag.build("test-table");
    OrcTail uncached = OrcEncodedDataReader.getOrcTailForPath(path, jobConf, tag, daemonConf, cache, null);
    // Force the second lookup to be served from the cache alone; it must match the first.
    jobConf.set(HiveConf.ConfVars.LLAP_IO_CACHE_ONLY.varname, "true");
    OrcTail cached = OrcEncodedDataReader.getOrcTailForPath(path, jobConf, tag, daemonConf, cache, null);
    assertEquals(uncached.getSerializedTail(), cached.getSerializedTail());
    assertEquals(uncached.getFileTail(), cached.getFileTail());
}
Also used : Path(org.apache.hadoop.fs.Path) LlapDaemonCacheMetrics(org.apache.hadoop.hive.llap.metrics.LlapDaemonCacheMetrics) Configuration(org.apache.hadoop.conf.Configuration) MetadataCache(org.apache.hadoop.hive.llap.io.metadata.MetadataCache) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) OrcTail(org.apache.orc.impl.OrcTail) Test(org.junit.Test)
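
Beyond the two fields the test compares, an OrcTail fetched this way exposes the rest of the tail through the getters used in the earlier examples. A brief sketch under the same setup as the test; the helper method is an illustrative assumption.

import java.io.IOException;
import java.nio.ByteBuffer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.CacheTag;
import org.apache.hadoop.hive.llap.io.encoded.OrcEncodedDataReader;
import org.apache.hadoop.hive.llap.io.metadata.MetadataCache;
import org.apache.orc.impl.OrcTail;

static void printTailInfo(Path path, Configuration jobConf, CacheTag tag,
        Configuration daemonConf, MetadataCache cache) throws IOException {
    OrcTail tail = OrcEncodedDataReader.getOrcTailForPath(path, jobConf, tag, daemonConf, cache, null);
    System.out.println("stripes: " + tail.getFooter().getStripesCount());
    System.out.println("writer version: " + tail.getWriterVersion());
    ByteBuffer serialized = tail.getSerializedTail(); // the bytes the cache stores
    System.out.println("tail bytes: " + serialized.remaining());
}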

Aggregations

OrcTail (org.apache.orc.impl.OrcTail): 13
OrcProto (org.apache.orc.OrcProto): 6
IOException (java.io.IOException): 4
Path (org.apache.hadoop.fs.Path): 4
ByteBuffer (java.nio.ByteBuffer): 3
OrcSplit (org.apache.hadoop.hive.ql.io.orc.OrcSplit): 3
StripeInformation (org.apache.orc.StripeInformation): 3
Test (org.junit.Test): 3
Configuration (org.apache.hadoop.conf.Configuration): 2
FileStatus (org.apache.hadoop.fs.FileStatus): 2
CacheTag (org.apache.hadoop.hive.common.io.CacheTag): 2
MetadataCache (org.apache.hadoop.hive.llap.io.metadata.MetadataCache): 2
LlapDaemonCacheMetrics (org.apache.hadoop.hive.llap.metrics.LlapDaemonCacheMetrics): 2
SplitInfos (org.apache.hadoop.hive.metastore.Metastore.SplitInfos): 2
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat): 2
SyntheticFileId (org.apache.hadoop.hive.ql.io.SyntheticFileId): 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 1
IllegalCacheConfigurationException (org.apache.hadoop.hive.llap.IllegalCacheConfigurationException): 1
LlapBufferOrBuffers (org.apache.hadoop.hive.llap.io.metadata.MetadataCache.LlapBufferOrBuffers): 1