Example 1 with TypeDescription

Use of org.apache.hive.iceberg.org.apache.orc.TypeDescription in project hive by apache.

From the class VectorizedReadUtils, the method handleIcebergProjection:

/**
 * Adjusts the jobConf so that column reorders and renames that might have happened since this ORC file was written
 * are properly mapped to the schema of the original file.
 * @param task - Iceberg task - required for the partition spec, the current table schema and the residual filter expression
 * @param job - JobConf instance to adjust
 * @param fileSchema - ORC file schema of the input file
 * @throws IOException - errors relating to accessing the ORC file
 */
public static void handleIcebergProjection(FileScanTask task, JobConf job, TypeDescription fileSchema) throws IOException {
    // We need to map against the current full table schema (i.e. the current Hive table columns), without projections,
    // as OrcInputFormat will take care of the projections by the use of an include boolean array
    PartitionSpec spec = task.spec();
    Schema currentSchema = spec.schema();
    TypeDescription readOrcSchema;
    if (ORCSchemaUtil.hasIds(fileSchema)) {
        readOrcSchema = ORCSchemaUtil.buildOrcProjection(currentSchema, fileSchema);
    } else {
        Schema readSchemaForOriginalFile = currentSchema;
        // In case of migrated, originally partitioned tables, partition values are not present in the file
        if (spec.isPartitioned()) {
            readSchemaForOriginalFile = currentSchema.select(currentSchema.columns().stream()
                    .filter(c -> !spec.identitySourceIds().contains(c.fieldId()))
                    .map(c -> c.name())
                    .collect(Collectors.toList()));
        }
        TypeDescription typeWithIds = ORCSchemaUtil.applyNameMapping(fileSchema, MappingUtil.create(currentSchema));
        readOrcSchema = ORCSchemaUtil.buildOrcProjection(readSchemaForOriginalFile, typeWithIds);
    }
    job.set(ColumnProjectionUtils.ORC_SCHEMA_STRING, readOrcSchema.toString());
    // Predicate pushdown needs to be adjusted too in case of column renames; we let Iceberg generate this into the job
    if (task.residual() != null) {
        Expression boundFilter = Binder.bind(currentSchema.asStruct(), task.residual(), false);
        // Note the use of the unshaded version of this class here (required for SARG deserialization later)
        org.apache.hadoop.hive.ql.io.sarg.SearchArgument sarg = ExpressionToOrcSearchArgument.convert(boundFilter, readOrcSchema);
        if (sarg != null) {
            job.unset(TableScanDesc.FILTER_EXPR_CONF_STR);
            job.unset(ConvertAstToSearchArg.SARG_PUSHDOWN);
            job.set(ConvertAstToSearchArg.SARG_PUSHDOWN, ConvertAstToSearchArg.sargToKryo(sarg));
        }
    }
}
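
For the second branch above (data files written without Iceberg field IDs, e.g. tables migrated from plain Hive), here is a standalone sketch of the name-mapping fallback using the unshaded iceberg-orc API. The schemas are made up for illustration; note that the Hive code above goes through the shaded org.apache.hive.iceberg.org.apache.orc classes instead:

import org.apache.iceberg.Schema;
import org.apache.iceberg.mapping.MappingUtil;
import org.apache.iceberg.orc.ORCSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.orc.TypeDescription;

public class NameMappingSketch {
    public static void main(String[] args) {
        // Current Iceberg table schema (field IDs 1 and 2).
        Schema tableSchema = new Schema(
                Types.NestedField.required(1, "id", Types.LongType.get()),
                Types.NestedField.optional(2, "data", Types.StringType.get()));

        // An ORC schema as written by plain Hive, i.e. without iceberg.id attributes.
        TypeDescription fileSchema = TypeDescription.fromString("struct<id:bigint,data:string>");

        // Attach field IDs to the file schema by matching column names,
        // then build the read projection against the ID-annotated schema.
        TypeDescription withIds = ORCSchemaUtil.applyNameMapping(fileSchema, MappingUtil.create(tableSchema));
        TypeDescription readSchema = ORCSchemaUtil.buildOrcProjection(tableSchema, withIds);
        System.out.println(readSchema);
    }
}

applyNameMapping attaches field IDs by column name, after which buildOrcProjection can resolve renames and reorders the same way as for files that already carry IDs.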
Also used:

ConvertAstToSearchArg (org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg)
TypeDescription (org.apache.hive.iceberg.org.apache.orc.TypeDescription)
OrcTail (org.apache.hive.iceberg.org.apache.orc.impl.OrcTail)
ColumnProjectionUtils (org.apache.hadoop.hive.serde2.ColumnProjectionUtils)
LoggerFactory (org.slf4j.LoggerFactory)
LlapProxy (org.apache.hadoop.hive.llap.io.api.LlapProxy)
ByteBuffer (java.nio.ByteBuffer)
TableScanDesc (org.apache.hadoop.hive.ql.plan.TableScanDesc)
ReaderImpl (org.apache.hive.iceberg.org.apache.orc.impl.ReaderImpl)
MappingUtil (org.apache.iceberg.mapping.MappingUtil)
Expression (org.apache.iceberg.expressions.Expression)
Path (org.apache.hadoop.fs.Path)
SyntheticFileId (org.apache.hadoop.hive.ql.io.SyntheticFileId)
FileScanTask (org.apache.iceberg.FileScanTask)
BufferChunk (org.apache.orc.impl.BufferChunk)
CacheTag (org.apache.hadoop.hive.common.io.CacheTag)
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)
Logger (org.slf4j.Logger)
Binder (org.apache.iceberg.expressions.Binder)
HiveConf (org.apache.hadoop.hive.conf.HiveConf)
IOException (java.io.IOException)
Schema (org.apache.iceberg.Schema)
Collectors (java.util.stream.Collectors)
LlapHiveUtils (org.apache.hadoop.hive.llap.LlapHiveUtils)
JobConf (org.apache.hadoop.mapred.JobConf)
MapWork (org.apache.hadoop.hive.ql.plan.MapWork)
PartitionSpec (org.apache.iceberg.PartitionSpec)
InputFile (org.apache.iceberg.io.InputFile)
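
Downstream, the ORC read path consumes the two values that handleIcebergProjection stored in the JobConf. A minimal sketch of that consumer side; rebuilding the SARG via ConvertAstToSearchArg.createFromConf is an assumption about the Hive version in use, not part of the example above:

import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
import org.apache.hadoop.mapred.JobConf;

public class ProjectionConfInspector {
    // Sketch: read back what handleIcebergProjection stored in the JobConf.
    static void inspect(JobConf job) {
        // The projected ORC schema, stored as a type string such as "struct<id:bigint,data:string>".
        String orcSchema = job.get(ColumnProjectionUtils.ORC_SCHEMA_STRING);

        // Rebuild the pushed-down SearchArgument from its Kryo-serialized form;
        // assumption: createFromConf reads the SARG_PUSHDOWN key set above.
        SearchArgument sarg = ConvertAstToSearchArg.createFromConf(job);

        System.out.println("read schema: " + orcSchema);
        System.out.println("pushed-down filter: " + sarg);
    }
}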
