Search in sources :

Example 1 with ReaderImpl

Use of org.apache.hive.iceberg.org.apache.orc.impl.ReaderImpl in the Apache Hive project.

From the class VectorizedReadUtils, method getSerializedOrcTail:

/**
 * Opens the ORC inputFile and reads the metadata information to construct a byte buffer with OrcTail content.
 * Tries the LLAP metadata cache first (when LLAP IO is enabled and available) and falls back to opening
 * the file with a plain ORC reader when LLAP is absent or the cache lookup fails.
 * @param inputFile - the original ORC file - this needs to be accessed to retrieve the original schema for mapping
 * @param fileId - FileID for the input file, serves as cache key in an LLAP setup
 * @param job - JobConf instance to adjust
 * @return the serialized ORC file tail (footer) content, never null
 * @throws IOException - errors relating to accessing the ORC file
 */
public static ByteBuffer getSerializedOrcTail(InputFile inputFile, SyntheticFileId fileId, JobConf job) throws IOException {
    ByteBuffer result = null;
    if (HiveConf.getBoolVar(job, HiveConf.ConfVars.LLAP_IO_ENABLED, LlapProxy.isDaemon()) && LlapProxy.getIo() != null) {
        MapWork mapWork = LlapHiveUtils.findMapWork(job);
        Path path = new Path(inputFile.location());
        // findMapWork may return null (e.g. no MapWork registered for this job context); guard against an
        // NPE on getPathToPartitionInfo and simply fall through to the plain ORC reader below in that case.
        if (mapWork != null) {
            PartitionDesc partitionDesc = LlapHiveUtils.partitionDescForPath(path, mapWork.getPathToPartitionInfo());
            // Note: Since Hive doesn't know about partition information of Iceberg tables, partitionDesc is only used to
            // deduce the table (and DB) name here.
            CacheTag cacheTag = HiveConf.getBoolVar(job, HiveConf.ConfVars.LLAP_TRACK_CACHE_USAGE) ? LlapHiveUtils.getDbAndTableNameForMetrics(path, true, partitionDesc) : null;
            try {
                // Schema has to be serialized and deserialized as it is passed between different packages of TypeDescription:
                // Iceberg expects org.apache.hive.iceberg.org.apache.orc.TypeDescription as it shades ORC, while LLAP provides
                // the unshaded org.apache.orc.TypeDescription type.
                BufferChunk tailBuffer = LlapProxy.getIo().getOrcTailFromCache(path, job, cacheTag, fileId).getTailBuffer();
                result = tailBuffer.getData();
            } catch (IOException ioe) {
                // Best-effort cache lookup: log and fall back to reading the tail from the file directly.
                LOG.warn("LLAP is turned on but was unable to get file metadata information through its cache for {}", path, ioe);
            }
        }
    }
    // Fallback to simple ORC reader file opening method in lack of or failure of LLAP.
    if (result == null) {
        try (ReaderImpl orcFileReader = (ReaderImpl) ORC.newFileReader(inputFile, job)) {
            result = orcFileReader.getSerializedFileFooter();
        }
    }
    return result;
}
Also used : Path(org.apache.hadoop.fs.Path) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) ReaderImpl(org.apache.hive.iceberg.org.apache.orc.impl.ReaderImpl) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) IOException(java.io.IOException) BufferChunk(org.apache.orc.impl.BufferChunk) ByteBuffer(java.nio.ByteBuffer)

Aggregations

IOException (java.io.IOException)1 ByteBuffer (java.nio.ByteBuffer)1 Path (org.apache.hadoop.fs.Path)1 CacheTag (org.apache.hadoop.hive.common.io.CacheTag)1 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)1 PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)1 ReaderImpl (org.apache.hive.iceberg.org.apache.orc.impl.ReaderImpl)1 BufferChunk (org.apache.orc.impl.BufferChunk)1