
Example 66 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveIcebergSerDe method initialize.

@Override
public void initialize(@Nullable Configuration configuration, Properties serDeProperties, Properties partitionProperties) throws SerDeException {
    super.initialize(configuration, serDeProperties, partitionProperties);
    if (serDeProperties.get(InputFormatConfig.TABLE_SCHEMA) != null) {
        this.tableSchema = SchemaParser.fromJson((String) serDeProperties.get(InputFormatConfig.TABLE_SCHEMA));
        if (serDeProperties.get(InputFormatConfig.PARTITION_SPEC) != null) {
            PartitionSpec spec = PartitionSpecParser.fromJson(tableSchema, serDeProperties.getProperty(InputFormatConfig.PARTITION_SPEC));
            this.partitionColumns = spec.fields().stream().map(PartitionField::name).collect(Collectors.toList());
        } else {
            this.partitionColumns = ImmutableList.of();
        }
    } else {
        try {
            Table table = IcebergTableUtil.getTable(configuration, serDeProperties);
            // always prefer the original table schema if there is one
            this.tableSchema = table.schema();
            this.partitionColumns = table.spec().fields().stream().map(PartitionField::name).collect(Collectors.toList());
            LOG.info("Using schema from existing table {}", SchemaParser.toJson(tableSchema));
        } catch (Exception e) {
            // During table creation we might not have the schema information from the Iceberg table, nor from the HMS
            // table. In this case we have to generate the schema using the serdeProperties which contains the info
            // provided in the CREATE TABLE query.
            boolean autoConversion = configuration.getBoolean(InputFormatConfig.SCHEMA_AUTO_CONVERSION, false);
            // If we can not load the table try the provided hive schema
            this.tableSchema = hiveSchemaOrThrow(e, autoConversion);
            // This is only for table creation, it is ok to have an empty partition column list
            this.partitionColumns = ImmutableList.of();
            // create table for CTAS
            if (e instanceof NoSuchTableException && Boolean.parseBoolean(serDeProperties.getProperty(hive_metastoreConstants.TABLE_IS_CTAS))) {
                if (!Catalogs.hiveCatalog(configuration, serDeProperties)) {
                    throw new SerDeException(CTAS_EXCEPTION_MSG);
                }
                createTableForCTAS(configuration, serDeProperties);
            }
        }
    }
    Schema projectedSchema;
    if (serDeProperties.get(HiveIcebergStorageHandler.WRITE_KEY) != null) {
        // when writing out data, we should not do projection pushdown
        projectedSchema = tableSchema;
    } else {
        configuration.setBoolean(InputFormatConfig.CASE_SENSITIVE, false);
        String[] selectedColumns = ColumnProjectionUtils.getReadColumnNames(configuration);
        // When the same table is joined multiple times, some selected columns may be duplicated;
        // in that case a wrong recordStructField position leads to a wrong value or an ArrayIndexOutOfBoundsException
        String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new);
        projectedSchema = distinctSelectedColumns.length > 0 ? tableSchema.caseInsensitiveSelect(distinctSelectedColumns) : tableSchema;
        // fall back to the full table schema if some selected column could not be resolved,
        // otherwise the select operator's column cannot be found in the inspector
        if (projectedSchema.columns().size() != distinctSelectedColumns.length) {
            projectedSchema = tableSchema;
        }
    }
    try {
        this.inspector = IcebergObjectInspector.create(projectedSchema);
    } catch (Exception e) {
        throw new SerDeException(e);
    }
}
Also used : PartitionField(org.apache.iceberg.PartitionField) Table(org.apache.iceberg.Table) NoSuchTableException(org.apache.iceberg.exceptions.NoSuchTableException) Schema(org.apache.iceberg.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) PartitionSpec(org.apache.iceberg.PartitionSpec) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
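A note on the projection branch above: the following is a minimal, self-contained sketch (not taken from the Hive sources; the class name ProjectionSketch is made up) of why the selected read columns are de-duplicated before calling caseInsensitiveSelect. Schema and Types are the standard Iceberg API.

import java.util.Arrays;
import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class ProjectionSketch {
    public static void main(String[] args) {
        Schema tableSchema = new Schema(
            Types.NestedField.required(1, "id", Types.LongType.get()),
            Types.NestedField.optional(2, "name", Types.StringType.get()),
            Types.NestedField.optional(3, "ts", Types.TimestampType.withZone()));

        // A self-join can report the same column more than once, e.g. ["ID", "name", "ID"].
        String[] selectedColumns = {"ID", "name", "ID"};
        String[] distinctSelectedColumns = Arrays.stream(selectedColumns).distinct().toArray(String[]::new);

        // caseInsensitiveSelect resolves each name once, so the projected schema has 2 columns
        // while the raw selection had 3 entries; comparing sizes against the de-duplicated array
        // keeps the fallback to the full table schema from firing unnecessarily.
        Schema projectedSchema = tableSchema.caseInsensitiveSelect(distinctSelectedColumns);
        System.out.println(projectedSchema.columns().size()); // prints 2
    }
}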

Example 67 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveIcebergStorageHandler method getPartitionTransformSpec.

@Override
public List<PartitionTransformSpec> getPartitionTransformSpec(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
    List<PartitionTransformSpec> result = new ArrayList<>();
    TableDesc tableDesc = Utilities.getTableDesc(hmsTable);
    Table table = IcebergTableUtil.getTable(conf, tableDesc.getProperties());
    return table.spec().fields().stream().map(f -> {
        PartitionTransformSpec spec = new PartitionTransformSpec();
        spec.setColumnName(table.schema().findColumnName(f.sourceId()));
        // right now the only way to fetch the transform type and its params is through the toString() call
        String transformName = f.transform().toString().toUpperCase();
        // if the transform name contains '[' it means it has some config params
        if (transformName.contains("[")) {
            spec.setTransformType(PartitionTransformSpec.TransformType.valueOf(transformName.substring(0, transformName.indexOf("["))));
            spec.setTransformParam(Optional.of(Integer.valueOf(transformName.substring(transformName.indexOf("[") + 1, transformName.indexOf("]")))));
        } else {
            spec.setTransformType(PartitionTransformSpec.TransformType.valueOf(transformName));
            spec.setTransformParam(Optional.empty());
        }
        return spec;
    }).collect(Collectors.toList());
}
Also used : ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) HadoopConfigurable(org.apache.iceberg.hadoop.HadoopConfigurable) ListIterator(java.util.ListIterator) URISyntaxException(java.net.URISyntaxException) Catalogs(org.apache.iceberg.mr.Catalogs) LoggerFactory(org.slf4j.LoggerFactory) Date(org.apache.hadoop.hive.common.type.Date) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) JobID(org.apache.hadoop.mapred.JobID) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) StatsSetupConst(org.apache.hadoop.hive.common.StatsSetupConst) OutputCommitter(org.apache.hadoop.mapred.OutputCommitter) AlterTableType(org.apache.hadoop.hive.ql.ddl.table.AlterTableType) Throwables(org.apache.iceberg.relocated.com.google.common.base.Throwables) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) InputFormat(org.apache.hadoop.mapred.InputFormat) URI(java.net.URI) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) HiveStoragePredicateHandler(org.apache.hadoop.hive.ql.metadata.HiveStoragePredicateHandler) Splitter(org.apache.iceberg.relocated.com.google.common.base.Splitter) OutputFormat(org.apache.hadoop.mapred.OutputFormat) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) Collection(java.util.Collection) Partish(org.apache.hadoop.hive.ql.stats.Partish) HiveMetaHook(org.apache.hadoop.hive.metastore.HiveMetaHook) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) InputFormatConfig(org.apache.iceberg.mr.InputFormatConfig) Schema(org.apache.iceberg.Schema) Collectors(java.util.stream.Collectors) SessionState(org.apache.hadoop.hive.ql.session.SessionState) PartitionSpecParser(org.apache.iceberg.PartitionSpecParser) Serializable(java.io.Serializable) SchemaParser(org.apache.iceberg.SchemaParser) List(java.util.List) Optional(java.util.Optional) TableProperties(org.apache.iceberg.TableProperties) SessionStateUtil(org.apache.hadoop.hive.ql.session.SessionStateUtil) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LockType(org.apache.hadoop.hive.metastore.api.LockType) ConvertAstToSearchArg(org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg) HashMap(java.util.HashMap) ExprNodeDynamicListDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc) ArrayList(java.util.ArrayList) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) Utilities(org.apache.hadoop.hive.ql.exec.Utilities) JobStatus(org.apache.hadoop.mapred.JobStatus) PartitionTransformSpec(org.apache.hadoop.hive.ql.parse.PartitionTransformSpec) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) Properties(java.util.Properties) Logger(org.slf4j.Logger) Timestamp(org.apache.hadoop.hive.common.type.Timestamp) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) Table(org.apache.iceberg.Table) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Maps(org.apache.iceberg.relocated.com.google.common.collect.Maps) IOException(java.io.IOException) SerializationUtil(org.apache.iceberg.util.SerializationUtil) JobConf(org.apache.hadoop.mapred.JobConf) SnapshotSummary(org.apache.iceberg.SnapshotSummary) JobContext(org.apache.hadoop.mapred.JobContext) Deserializer(org.apache.hadoop.hive.serde2.Deserializer) Preconditions(org.apache.iceberg.relocated.com.google.common.base.Preconditions) 
JobContextImpl(org.apache.hadoop.mapred.JobContextImpl) HiveAuthorizationProvider(org.apache.hadoop.hive.ql.security.authorization.HiveAuthorizationProvider) SerializableTable(org.apache.iceberg.SerializableTable) VisibleForTesting(org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting)
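The '[' handling above is easiest to see in isolation. Below is a stand-alone sketch, assuming transform strings of the shape produced by Transform.toString(), such as "bucket[16]", "truncate[4]" or "identity"; the class name TransformParseSketch is illustrative and not part of the Hive code.

import java.util.Optional;

public class TransformParseSketch {

    static void parse(String transform) {
        String transformName = transform.toUpperCase();
        if (transformName.contains("[")) {
            // e.g. "BUCKET[16]" -> type BUCKET, param 16
            String type = transformName.substring(0, transformName.indexOf("["));
            Optional<Integer> param = Optional.of(
                Integer.valueOf(transformName.substring(transformName.indexOf("[") + 1, transformName.indexOf("]"))));
            System.out.println(type + " " + param);
        } else {
            // e.g. "IDENTITY" -> no param
            System.out.println(transformName + " " + Optional.empty());
        }
    }

    public static void main(String[] args) {
        parse("bucket[16]");   // BUCKET Optional[16]
        parse("truncate[4]");  // TRUNCATE Optional[4]
        parse("identity");     // IDENTITY Optional.empty
    }
}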

Example 68 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveIcebergStorageHandler method table.

/**
 * Returns the Table serialized to the configuration based on the table name.
 * If configuration is missing from the FileIO of the table, it will be populated with the input config.
 *
 * @param config The configuration to read the serialized table from
 * @param name The name of the table, as returned by TableDesc.getTableName()
 * @return The Table
 */
public static Table table(Configuration config, String name) {
    Table table = SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.SERIALIZED_TABLE_PREFIX + name));
    checkAndSetIoConfig(config, table);
    return table;
}
Also used : Table(org.apache.iceberg.Table) SerializableTable(org.apache.iceberg.SerializableTable)
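For context, here is a hedged sketch of the matching write side, i.e. how a table could be placed in the configuration that table(config, name) reads back. SerializableTable.copyOf, SerializationUtil and InputFormatConfig.SERIALIZED_TABLE_PREFIX are the same Iceberg classes referenced above; where exactly Hive performs this step is not shown in this example, so treat the store method as an assumption.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.SerializableTable;
import org.apache.iceberg.Table;
import org.apache.iceberg.mr.InputFormatConfig;
import org.apache.iceberg.util.SerializationUtil;

public class SerializedTableSketch {

    // Stores a lightweight, serializable copy of the table under the key that table(config, name) expects.
    static void store(Configuration config, String name, Table table) {
        Table copy = SerializableTable.copyOf(table);
        config.set(InputFormatConfig.SERIALIZED_TABLE_PREFIX + name, SerializationUtil.serializeToBase64(copy));
    }

    // Reads the copy back, mirroring HiveIcebergStorageHandler.table(config, name).
    static Table load(Configuration config, String name) {
        return SerializationUtil.deserializeFromBase64(config.get(InputFormatConfig.SERIALIZED_TABLE_PREFIX + name));
    }
}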

Example 69 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class HiveIcebergStorageHandler method getBasicStatistics.

@Override
public Map<String, String> getBasicStatistics(Partish partish) {
    org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
    TableDesc tableDesc = Utilities.getTableDesc(hmsTable);
    Table table = Catalogs.loadTable(conf, tableDesc.getProperties());
    Map<String, String> stats = new HashMap<>();
    if (table.currentSnapshot() != null) {
        Map<String, String> summary = table.currentSnapshot().summary();
        if (summary != null) {
            if (summary.containsKey(SnapshotSummary.TOTAL_DATA_FILES_PROP)) {
                stats.put(StatsSetupConst.NUM_FILES, summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP));
            }
            if (summary.containsKey(SnapshotSummary.TOTAL_RECORDS_PROP)) {
                stats.put(StatsSetupConst.ROW_COUNT, summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));
            }
            if (summary.containsKey(SnapshotSummary.TOTAL_FILE_SIZE_PROP)) {
                stats.put(StatsSetupConst.TOTAL_SIZE, summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP));
            }
        }
    } else {
        stats.put(StatsSetupConst.NUM_FILES, "0");
        stats.put(StatsSetupConst.ROW_COUNT, "0");
        stats.put(StatsSetupConst.TOTAL_SIZE, "0");
    }
    return stats;
}
Also used : Table(org.apache.iceberg.Table) SerializableTable(org.apache.iceberg.SerializableTable) HashMap(java.util.HashMap) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
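A small usage sketch, assuming an already-loaded Iceberg table, that reads the same snapshot summary entries the method above maps onto Hive's NUM_FILES, ROW_COUNT and TOTAL_SIZE statistics; the class name SnapshotStatsSketch is made up.

import java.util.Map;
import org.apache.iceberg.SnapshotSummary;
import org.apache.iceberg.Table;

public class SnapshotStatsSketch {

    static void printBasicStats(Table table) {
        if (table.currentSnapshot() == null) {
            // same fallback as above: a table without snapshots reports all-zero statistics
            System.out.println("numFiles=0, numRows=0, totalSize=0");
            return;
        }
        Map<String, String> summary = table.currentSnapshot().summary();
        System.out.println("numFiles=" + summary.get(SnapshotSummary.TOTAL_DATA_FILES_PROP)); // -> StatsSetupConst.NUM_FILES
        System.out.println("numRows=" + summary.get(SnapshotSummary.TOTAL_RECORDS_PROP));     // -> StatsSetupConst.ROW_COUNT
        System.out.println("totalSize=" + summary.get(SnapshotSummary.TOTAL_FILE_SIZE_PROP)); // -> StatsSetupConst.TOTAL_SIZE
    }
}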

Example 70 with Table

use of org.apache.iceberg.Table in project hive by apache.

the class TestInputFormatReaderDeletes method createTable.

@Override
protected Table createTable(String name, Schema schema, PartitionSpec spec) throws IOException {
    Table table;
    File location = temp.newFolder(inputFormat, fileFormat.name());
    Assert.assertTrue(location.delete());
    helper = new TestHelper(conf, tables, location.toString(), schema, spec, fileFormat, temp);
    table = helper.createTable();
    TableOperations ops = ((BaseTable) table).operations();
    TableMetadata meta = ops.current();
    ops.commit(meta, meta.upgradeToFormatVersion(2));
    return table;
}
Also used : TableMetadata(org.apache.iceberg.TableMetadata) BaseTable(org.apache.iceberg.BaseTable) Table(org.apache.iceberg.Table) TableOperations(org.apache.iceberg.TableOperations) File(java.io.File)
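A hedged follow-up sketch showing how the upgrade committed above could be verified: re-reading the metadata through the same TableOperations handle should report format version 2. The helper class name is illustrative and not part of the test.

import org.apache.iceberg.BaseTable;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableMetadata;
import org.apache.iceberg.TableOperations;

public class FormatVersionCheck {

    // Returns the format version of the table's current metadata; expected to be 2 after the upgrade above.
    static int formatVersion(Table table) {
        TableOperations ops = ((BaseTable) table).operations();
        TableMetadata current = ops.refresh(); // re-read metadata after the commit
        return current.formatVersion();
    }
}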

Aggregations

Table (org.apache.iceberg.Table): 188 usages
Test (org.junit.Test): 132
Schema (org.apache.iceberg.Schema): 66
TableIdentifier (org.apache.iceberg.catalog.TableIdentifier): 56
Record (org.apache.iceberg.data.Record): 56
PartitionSpec (org.apache.iceberg.PartitionSpec): 51
IOException (java.io.IOException): 27
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 27
List (java.util.List): 22
Map (java.util.Map): 20
DataFile (org.apache.iceberg.DataFile): 19
NoSuchTableException (org.apache.iceberg.exceptions.NoSuchTableException): 19
Collectors (java.util.stream.Collectors): 18
BaseTable (org.apache.iceberg.BaseTable): 18
Types (org.apache.iceberg.types.Types): 18
Properties (java.util.Properties): 17
Configuration (org.apache.hadoop.conf.Configuration): 17
Path (org.apache.hadoop.fs.Path): 17
FileFormat (org.apache.iceberg.FileFormat): 16
ArrayList (java.util.ArrayList): 15